The 3rd Workshop on What is Next in Multimodal Foundation Models?


How Good is my Video-LMM? Complex Video Reasoning and Robustness Evaluation Suite for Video-LMMs
Muhammad Uzair Khattak,
Muhammad Ferjad Naeem,
Jameel Hassan,
Muzammal Naseer,
Federico Tombari,
Fahad Shahbaz Khan,
Salman Khan
[pdf] [supp] [arXiv]
[bibtex]
% Braced words in title/booktitle keep their casing under sentence-case styles;
% month uses the predefined macro so the style decides how to render it.
@InProceedings{Khattak_2025_CVPR,
  author    = {Khattak, Muhammad Uzair and Naeem, Muhammad Ferjad and Hassan, Jameel and Naseer, Muzammal and Tombari, Federico and Khan, Fahad Shahbaz and Khan, Salman},
  title     = {How Good is my {Video-LMM}? Complex Video Reasoning and Robustness Evaluation Suite for {Video-LMMs}},
  booktitle = {Proceedings of the Computer Vision and Pattern Recognition Conference ({CVPR}) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {3642--3651},
}

PLVM: A tuning-free approach for Personalized Large Vision-Language Model
Chau Pham,
Hoang Phan,
David Doermann,
Yunjie Tian
[pdf]
[bibtex]
% Braced words in title/booktitle keep their casing under sentence-case styles;
% month uses the predefined macro so the style decides how to render it.
@InProceedings{Pham_2025_CVPR,
  author    = {Pham, Chau and Phan, Hoang and Doermann, David and Tian, Yunjie},
  title     = {{PLVM}: A tuning-free approach for Personalized Large Vision-Language Model},
  booktitle = {Proceedings of the Computer Vision and Pattern Recognition Conference ({CVPR}) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {3632--3641},
}

Repurposing SAM for User-Defined Semantics Aware Segmentation
Rohit Kundu,
Sudipta Paul,
Arindam Dutta,
Amit Roy-Chowdhury
[pdf] [arXiv]
[bibtex]
% Braced words in title/booktitle keep their casing under sentence-case styles;
% month uses the predefined macro so the style decides how to render it.
@InProceedings{Kundu_2025_CVPR,
  author    = {Kundu, Rohit and Paul, Sudipta and Dutta, Arindam and Roy-Chowdhury, Amit},
  title     = {Repurposing {SAM} for User-Defined Semantics Aware Segmentation},
  booktitle = {Proceedings of the Computer Vision and Pattern Recognition Conference ({CVPR}) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {3621--3631},
}

An Interactive Agent Foundation Model
Zane Durante,
Ran Gong,
Bidipta Sarkar,
Naoki Wake,
Rohan Taori,
Paul Tang,
Shrinidhi Lakshmikanth,
Kevin Schulman,
Arnold Milstein,
Hoi Vo,
Ehsan Adeli,
Demetri Terzopoulos,
Li Fei-Fei,
Jianfeng Gao
[pdf] [supp] [arXiv]
[bibtex]
% The braced acronym in booktitle keeps its casing under any style;
% month uses the predefined macro so the style decides how to render it.
@InProceedings{Durante_2025_CVPR,
  author    = {Durante, Zane and Gong, Ran and Sarkar, Bidipta and Wake, Naoki and Taori, Rohan and Tang, Paul and Lakshmikanth, Shrinidhi and Schulman, Kevin and Milstein, Arnold and Vo, Hoi and Adeli, Ehsan and Terzopoulos, Demetri and Fei-Fei, Li and Gao, Jianfeng},
  title     = {An Interactive Agent Foundation Model},
  booktitle = {Proceedings of the Computer Vision and Pattern Recognition Conference ({CVPR}) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {3652--3662},
}

Understanding Depth and Height Perception in Large Visual-Language Models
Shehreen Azad,
Yash Jain,
Rishit Garg,
Vibhav Vineet,
Yogesh Rawat
[pdf] [supp] [arXiv]
[bibtex]
% The braced acronym in booktitle keeps its casing under any style;
% month uses the predefined macro so the style decides how to render it.
@InProceedings{Azad_2025_CVPR,
  author    = {Azad, Shehreen and Jain, Yash and Garg, Rishit and Vineet, Vibhav and Rawat, Yogesh},
  title     = {Understanding Depth and Height Perception in Large Visual-Language Models},
  booktitle = {Proceedings of the Computer Vision and Pattern Recognition Conference ({CVPR}) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {3611--3620},
}

Understanding the Effect of using Semantically Meaningful Tokens for Visual Representation Learning
Neha Kalibhat,
Priyatham Kattakinda,
Sumit Nawathe,
Arman Zarei,
Nikita Seleznev,
Samuel Sharpe,
Senthil Kumar,
Soheil Feizi
[pdf] [arXiv]
[bibtex]
% The braced acronym in booktitle keeps its casing under any style;
% month uses the predefined macro so the style decides how to render it.
@InProceedings{Kalibhat_2025_CVPR,
  author    = {Kalibhat, Neha and Kattakinda, Priyatham and Nawathe, Sumit and Zarei, Arman and Seleznev, Nikita and Sharpe, Samuel and Kumar, Senthil and Feizi, Soheil},
  title     = {Understanding the Effect of using Semantically Meaningful Tokens for Visual Representation Learning},
  booktitle = {Proceedings of the Computer Vision and Pattern Recognition Conference ({CVPR}) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {3663--3672},
}

UniToken: Harmonizing Multimodal Understanding and Generation through Unified Visual Encoding
Yang Jiao,
Haibo Qiu,
Zequn Jie,
Shaoxiang Chen,
Jingjing Chen,
Lin Ma,
Yu-Gang Jiang
[pdf] [arXiv]
[bibtex]
% Braced words in title/booktitle keep their casing under sentence-case styles;
% month uses the predefined macro so the style decides how to render it.
@InProceedings{Jiao_2025_CVPR,
  author    = {Jiao, Yang and Qiu, Haibo and Jie, Zequn and Chen, Shaoxiang and Chen, Jingjing and Ma, Lin and Jiang, Yu-Gang},
  title     = {{UniToken}: Harmonizing Multimodal Understanding and Generation through Unified Visual Encoding},
  booktitle = {Proceedings of the Computer Vision and Pattern Recognition Conference ({CVPR}) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {3600--3610},
}