CVPR 2025 Open Access Repository

The 3rd Workshop on What is Next in Multimodal Foundation Models?

PLVM: A tuning-free approach for Personalized Large Vision-Language Model: Chau Pham,

Hoang Phan,

David Doermann,

Yunjie Tian; [pdf]
[bibtex]
@InProceedings{Pham_2025_CVPR,
  author    = {Pham, Chau and Phan, Hoang and Doermann, David and Tian, Yunjie},
  title     = {{PLVM}: A tuning-free approach for Personalized Large Vision-Language Model},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {3671--3680},
}
Repurposing SAM for User-Defined Semantics Aware Segmentation: Rohit Kundu,

Sudipta Paul,

Arindam Dutta,

Amit Roy-Chowdhury; [pdf] [arXiv]
[bibtex]
@InProceedings{Kundu_2025_CVPR,
  author    = {Kundu, Rohit and Paul, Sudipta and Dutta, Arindam and Roy-Chowdhury, Amit},
  title     = {Repurposing {SAM} for User-Defined Semantics Aware Segmentation},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {3660--3670},
}
Understanding Depth and Height Perception in Large Visual-Language Models: Shehreen Azad,

Yash Jain,

Rishit Garg,

Vibhav Vineet,

Yogesh Rawat; [pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Azad_2025_CVPR,
  author    = {Azad, Shehreen and Jain, Yash and Garg, Rishit and Vineet, Vibhav and Rawat, Yogesh},
  title     = {Understanding Depth and Height Perception in Large Visual-Language Models},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {3650--3659},
}
Understanding the Effect of using Semantically Meaningful Tokens for Visual Representation Learning: Neha Kalibhat,

Priyatham Kattakinda,

Sumit Nawathe,

Arman Zarei,

Nikita Seleznev,

Samuel Sharpe,

Senthil Kumar,

Soheil Feizi; [pdf] [arXiv]
[bibtex]
@InProceedings{Kalibhat_2025_CVPR,
  author    = {Kalibhat, Neha and Kattakinda, Priyatham and Nawathe, Sumit
               and Zarei, Arman and Seleznev, Nikita and Sharpe, Samuel
               and Kumar, Senthil and Feizi, Soheil},
  title     = {Understanding the Effect of using Semantically Meaningful Tokens for Visual Representation Learning},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {3702--3711},
}
How Good is my Video-LMM? Complex Video Reasoning and Robustness Evaluation Suite for Video-LMMs: Muhammad Uzair Khattak,

Muhammad Ferjad Naeem,

Jameel Hassan,

Muzammal Naseer,

Federico Tombari,

Fahad Shahbaz Khan,

Salman Khan; [pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Khattak_2025_CVPR,
  author    = {Khattak, Muhammad Uzair and Naeem, Muhammad Ferjad and Hassan, Jameel
               and Naseer, Muzammal and Tombari, Federico and Khan, Fahad Shahbaz
               and Khan, Salman},
  title     = {How Good is my {Video-LMM}? Complex Video Reasoning and Robustness Evaluation Suite for {Video-LMMs}},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {3681--3690},
}
UniToken: Harmonizing Multimodal Understanding and Generation through Unified Visual Encoding: Yang Jiao,

Haibo Qiu,

Zequn Jie,

Shaoxiang Chen,

Jingjing Chen,

Lin Ma,

Yu-Gang Jiang; [pdf] [arXiv]
[bibtex]
@InProceedings{Jiao_2025_CVPR,
  author    = {Jiao, Yang and Qiu, Haibo and Jie, Zequn and Chen, Shaoxiang
               and Chen, Jingjing and Ma, Lin and Jiang, Yu-Gang},
  title     = {{UniToken}: Harmonizing Multimodal Understanding and Generation through Unified Visual Encoding},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {3639--3649},
}
An Interactive Agent Foundation Model: Zane Durante,

Ran Gong,

Bidipta Sarkar,

Naoki Wake,

Rohan Taori,

Paul Tang,

Shrinidhi Lakshmikanth,

Kevin Schulman,

Arnold Milstein,

Hoi Vo,

Ehsan Adeli,

Demetri Terzopoulos,

Li Fei-Fei,

Jianfeng Gao; [pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Durante_2025_CVPR,
  author    = {Durante, Zane and Gong, Ran and Sarkar, Bidipta and Wake, Naoki
               and Taori, Rohan and Tang, Paul and Lakshmikanth, Shrinidhi
               and Schulman, Kevin and Milstein, Arnold and Vo, Hoi
               and Adeli, Ehsan and Terzopoulos, Demetri and Fei-Fei, Li
               and Gao, Jianfeng},
  title     = {An Interactive Agent Foundation Model},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {3691--3701},
}