The 5th Workshop on “What is Next in Multimodal Foundation Models?”
Counting to Four is still a Chore for VLMs-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Le_Dinh_Anh_2026_CVPR,
  author    = {Le Dinh Anh, Duy and Irawan, Patrick and Van Vo, Tuan},
  title     = {Counting to Four is still a Chore for {VLMs}},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {7377--7385},
}
Balanced Thinking: Improving Chain of Thought Training in Vision Language Models-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Perek_2026_CVPR,
  author    = {Perek, Shaked and Wiesel, Ben and Dekel, Avihu and Shabtay, Nimrod and Schwartz, Eli},
  title     = {Balanced Thinking: Improving Chain of Thought Training in Vision Language Models},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {7425--7434},
}
Visual Geometry Grounded Novel-View Acoustic Synthesis-
[pdf]
[bibtex]@InProceedings{Polra_2026_CVPR,
  author    = {Polra, Jay and Chauhan, Dhwanil and Huang, Wenjun and Toth, Kyle and Wang, Xianhui and Ni, Yang},
  title     = {Visual Geometry Grounded Novel-View Acoustic Synthesis},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {7435--7444},
}
VAREX: A Benchmark for Multi-Modal Structured Extraction from Documents-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Barzelay_2026_CVPR,
  author    = {Barzelay, Udi and Azulai, Ophir and Shapira, Inbar and Friedman, Idan and Dahood, Foad Abo and Lee, Madison and Daniels, Abraham},
  title     = {{VAREX}: A Benchmark for Multi-Modal Structured Extraction from Documents},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {7368--7376},
}
TRACE: Training-Free Partial Audio Deepfake Detection via Embedding Trajectory Analysis of Speech Foundation Models-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{khan_2026_CVPR,
  author    = {Khan, Awais and Farooq, Muhammad Umar and Uddin, Kutub and Malik, Khalid},
  title     = {{TRACE}: Training-Free Partial Audio Deepfake Detection via Embedding Trajectory Analysis of Speech Foundation Models},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {7405--7414},
}
Do Multimodal Foundation Models Truly Generalize? Exposing Failure Modes Across Perception, Reasoning, and Action-
[pdf]
[supp]
[bibtex]@InProceedings{Guruprasad_2026_CVPR,
  author    = {Guruprasad, Pranav and Chowdhury, Sudipta and Sikka, Harsh and Sharma, Mridul and Lu, Hong and Rivera, Sean and Ren, Hangliang and Khurana, Aryan and Wang, Yangyue},
  title     = {Do Multimodal Foundation Models Truly Generalize? {Exposing} Failure Modes Across Perception, Reasoning, and Action},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {7386--7396},
}
Hierarchical Pre-Training of Vision Encoders with Large Language Model-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Lee_2026_CVPR,
  author    = {Lee, Eugene and Chang, Ting-Yu and Tsai, Jui-Huang and Diao, Jiajie and Lee, Chen-Yi},
  title     = {Hierarchical Pre-Training of Vision Encoders with Large Language Model},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {7415--7424},
}
Fine-tuned Hyperbolic CLIP Models are Good Video Learners-
[pdf]
[supp]
[bibtex]@InProceedings{Vidal_2026_CVPR,
  author    = {Vidal, {\`A}lex Pujol and Escalera, Sergio and Nasrollahi, Kamal and Moeslund, Thomas B.},
  title     = {Fine-tuned Hyperbolic {CLIP} Models are Good Video Learners},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {7445--7453},
}
VisTherapy: Text-to-Image Generation of Personalized Therapeutic Visual Stimuli-
[pdf]
[bibtex]@InProceedings{Hameed_2026_CVPR,
  author    = {Hameed, Abdul Muhsin and Razzaq, Taha and Iqbal, Asim},
  title     = {{VisTherapy}: Text-to-Image Generation of Personalized Therapeutic Visual Stimuli},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {7397--7404},
}

