CVPR 2026 Open Access Repository

The 5th Workshop on “What is Next in Multimodal Foundation Models?”

Counting to Four is still a Chore for VLMs: Duy Le Dinh Anh,

Patrick Irawan,

Tuan Van Vo; [pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Le_Dinh_Anh_2026_CVPR,
  author    = {Le Dinh Anh, Duy and Irawan, Patrick and Van Vo, Tuan},
  title     = {Counting to Four is still a Chore for {VLMs}},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {7377--7385},
}
Balanced Thinking: Improving Chain of Thought Training in Vision Language Models: Shaked Perek,

Ben Wiesel,

Avihu Dekel,

Nimrod Shabtay,

Eli Schwartz; [pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Perek_2026_CVPR,
  author    = {Perek, Shaked and Wiesel, Ben and Dekel, Avihu and Shabtay, Nimrod and Schwartz, Eli},
  title     = {Balanced Thinking: Improving Chain of Thought Training in Vision Language Models},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {7425--7434},
}
Visual Geometry Grounded Novel-View Acoustic Synthesis: Jay Polra,

Dhwanil Chauhan,

Wenjun Huang,

Kyle Toth,

Xianhui Wang,

Yang Ni; [pdf]
[bibtex]
@InProceedings{Polra_2026_CVPR,
  author    = {Polra, Jay and Chauhan, Dhwanil and Huang, Wenjun and Toth, Kyle and Wang, Xianhui and Ni, Yang},
  title     = {Visual Geometry Grounded Novel-View Acoustic Synthesis},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {7435--7444},
}
VAREX: A Benchmark for Multi-Modal Structured Extraction from Documents: Udi Barzelay,

Ophir Azulai,

Inbar Shapira,

Idan Friedman,

Foad Abo Dahood,

Madison Lee,

Abraham Daniels; [pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Barzelay_2026_CVPR,
  author    = {Barzelay, Udi and Azulai, Ophir and Shapira, Inbar and Friedman, Idan and Dahood, Foad Abo and Lee, Madison and Daniels, Abraham},
  title     = {{VAREX}: A Benchmark for Multi-Modal Structured Extraction from Documents},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {7368--7376},
}
TRACE: Training-Free Partial Audio Deepfake Detection via Embedding Trajectory Analysis of Speech Foundation Models: Awais Khan,

Muhammad Umar Farooq,

Kutub Uddin,

Khalid Malik; [pdf] [supp] [arXiv]
[bibtex]
@InProceedings{khan_2026_CVPR,
  author    = {Khan, Awais and Farooq, Muhammad Umar and Uddin, Kutub and Malik, Khalid},
  title     = {{TRACE}: Training-Free Partial Audio Deepfake Detection via Embedding Trajectory Analysis of Speech Foundation Models},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {7405--7414},
}
Do Multimodal Foundation Models Truly Generalize? Exposing Failure Modes Across Perception, Reasoning, and Action: Pranav Guruprasad,

Sudipta Chowdhury,

Harsh Sikka,

Mridul Sharma,

Hong Lu,

Sean Rivera,

Hangliang Ren,

Aryan Khurana,

Yangyue Wang; [pdf] [supp]
[bibtex]
@InProceedings{Guruprasad_2026_CVPR,
  author    = {Guruprasad, Pranav and Chowdhury, Sudipta and Sikka, Harsh and Sharma, Mridul and Lu, Hong and Rivera, Sean and Ren, Hangliang and Khurana, Aryan and Wang, Yangyue},
  title     = {Do Multimodal Foundation Models Truly Generalize? {Exposing} Failure Modes Across Perception, Reasoning, and Action},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {7386--7396},
}
Hierarchical Pre-Training of Vision Encoders with Large Language Model: Eugene Lee,

Ting-Yu Chang,

Jui-Huang Tsai,

Jiajie Diao,

Chen-Yi Lee; [pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Lee_2026_CVPR,
  author    = {Lee, Eugene and Chang, Ting-Yu and Tsai, Jui-Huang and Diao, Jiajie and Lee, Chen-Yi},
  title     = {Hierarchical Pre-Training of Vision Encoders with Large Language Model},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {7415--7424},
}
Fine-tuned Hyperbolic CLIP Models are Good Video Learners: Àlex Pujol Vidal,

Sergio Escalera,

Kamal Nasrollahi,

Thomas B. Moeslund; [pdf] [supp]
[bibtex]
@InProceedings{Vidal_2026_CVPR,
  author    = {Vidal, {\`A}lex Pujol and Escalera, Sergio and Nasrollahi, Kamal and Moeslund, Thomas B.},
  title     = {Fine-tuned Hyperbolic {CLIP} Models are Good Video Learners},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {7445--7453},
}
VisTherapy: Text-to-Image Generation of Personalized Therapeutic Visual Stimuli: Abdul Muhsin Hameed,

Taha Razzaq,

Asim Iqbal; [pdf]
[bibtex]
@InProceedings{Hameed_2026_CVPR,
  author    = {Hameed, Abdul Muhsin and Razzaq, Taha and Iqbal, Asim},
  title     = {{VisTherapy}: Text-to-Image Generation of Personalized Therapeutic Visual Stimuli},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {7397--7404},
}