The 3rd Workshop on What is Next in Multimodal Foundation Models?
How Good is my Video-LMM? Complex Video Reasoning and Robustness Evaluation Suite for Video-LMMs
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Khattak_2025_CVPR,
  author    = {Khattak, Muhammad Uzair and Naeem, Muhammad Ferjad and Hassan, Jameel and Naseer, Muzammal and Tombari, Federico and Khan, Fahad Shahbaz and Khan, Salman},
  title     = {How Good is my {Video-LMM}? Complex Video Reasoning and Robustness Evaluation Suite for {Video-LMMs}},
  booktitle = {Proceedings of the Computer Vision and Pattern Recognition Conference (CVPR) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {3642--3651},
}
PLVM: A tuning-free approach for Personalized Large Vision-Language Model
[pdf]
[bibtex]@InProceedings{Pham_2025_CVPR,
  author    = {Pham, Chau and Phan, Hoang and Doermann, David and Tian, Yunjie},
  title     = {{PLVM}: A tuning-free approach for Personalized Large Vision-Language Model},
  booktitle = {Proceedings of the Computer Vision and Pattern Recognition Conference (CVPR) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {3632--3641},
}
Repurposing SAM for User-Defined Semantics Aware Segmentation
[pdf]
[arXiv]
[bibtex]@InProceedings{Kundu_2025_CVPR,
  author    = {Kundu, Rohit and Paul, Sudipta and Dutta, Arindam and Roy-Chowdhury, Amit},
  title     = {Repurposing {SAM} for User-Defined Semantics Aware Segmentation},
  booktitle = {Proceedings of the Computer Vision and Pattern Recognition Conference (CVPR) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {3621--3631},
}
An Interactive Agent Foundation Model
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Durante_2025_CVPR,
  author    = {Durante, Zane and Gong, Ran and Sarkar, Bidipta and Wake, Naoki and Taori, Rohan and Tang, Paul and Lakshmikanth, Shrinidhi and Schulman, Kevin and Milstein, Arnold and Vo, Hoi and Adeli, Ehsan and Terzopoulos, Demetri and Fei-Fei, Li and Gao, Jianfeng},
  title     = {An Interactive Agent Foundation Model},
  booktitle = {Proceedings of the Computer Vision and Pattern Recognition Conference (CVPR) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {3652--3662},
}
Understanding Depth and Height Perception in Large Visual-Language Models
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Azad_2025_CVPR,
  author    = {Azad, Shehreen and Jain, Yash and Garg, Rishit and Vineet, Vibhav and Rawat, Yogesh},
  title     = {Understanding Depth and Height Perception in Large Visual-Language Models},
  booktitle = {Proceedings of the Computer Vision and Pattern Recognition Conference (CVPR) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {3611--3620},
}
Understanding the Effect of using Semantically Meaningful Tokens for Visual Representation Learning
[pdf]
[arXiv]
[bibtex]@InProceedings{Kalibhat_2025_CVPR,
  author    = {Kalibhat, Neha and Kattakinda, Priyatham and Nawathe, Sumit and Zarei, Arman and Seleznev, Nikita and Sharpe, Samuel and Kumar, Senthil and Feizi, Soheil},
  title     = {Understanding the Effect of using Semantically Meaningful Tokens for Visual Representation Learning},
  booktitle = {Proceedings of the Computer Vision and Pattern Recognition Conference (CVPR) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {3663--3672},
}
UniToken: Harmonizing Multimodal Understanding and Generation through Unified Visual Encoding
[pdf]
[arXiv]
[bibtex]@InProceedings{Jiao_2025_CVPR,
  author    = {Jiao, Yang and Qiu, Haibo and Jie, Zequn and Chen, Shaoxiang and Chen, Jingjing and Ma, Lin and Jiang, Yu-Gang},
  title     = {{UniToken}: Harmonizing Multimodal Understanding and Generation through Unified Visual Encoding},
  booktitle = {Proceedings of the Computer Vision and Pattern Recognition Conference (CVPR) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {3600--3610},
}