The 5th Workshop on “What is Next in Multimodal Foundation Models?”


Counting to Four is still a Chore for VLMs
Duy Le Dinh Anh,
Patrick Irawan,
Tuan Van Vo
[pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Le_Dinh_Anh_2026_CVPR,
  author    = {Le Dinh Anh, Duy and Irawan, Patrick and Van Vo, Tuan},
  title     = {Counting to Four is still a Chore for {VLMs}},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {7377--7385},
}

Balanced Thinking: Improving Chain of Thought Training in Vision Language Models
Shaked Perek,
Ben Wiesel,
Avihu Dekel,
Nimrod Shabtay,
Eli Schwartz
[pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Perek_2026_CVPR,
  author    = {Perek, Shaked and Wiesel, Ben and Dekel, Avihu and Shabtay, Nimrod and Schwartz, Eli},
  title     = {Balanced Thinking: Improving Chain of Thought Training in Vision Language Models},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {7425--7434},
}

Visual Geometry Grounded Novel-View Acoustic Synthesis
Jay Polra,
Dhwanil Chauhan,
Wenjun Huang,
Kyle Toth,
Xianhui Wang,
Yang Ni
[pdf]
[bibtex]
@InProceedings{Polra_2026_CVPR,
  author    = {Polra, Jay and Chauhan, Dhwanil and Huang, Wenjun and Toth, Kyle and Wang, Xianhui and Ni, Yang},
  title     = {Visual Geometry Grounded Novel-View Acoustic Synthesis},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {7435--7444},
}

VAREX: A Benchmark for Multi-Modal Structured Extraction from Documents
Udi Barzelay,
Ophir Azulai,
Inbar Shapira,
Idan Friedman,
Foad Abo Dahood,
Madison Lee,
Abraham Daniels
[pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Barzelay_2026_CVPR,
  author    = {Barzelay, Udi and Azulai, Ophir and Shapira, Inbar and Friedman, Idan and Abo Dahood, Foad and Lee, Madison and Daniels, Abraham},
  title     = {{VAREX}: A Benchmark for Multi-Modal Structured Extraction from Documents},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {7368--7376},
}

TRACE: Training-Free Partial Audio Deepfake Detection via Embedding Trajectory Analysis of Speech Foundation Models
Awais Khan,
Muhammad Umar Farooq,
Kutub Uddin,
Khalid Malik
[pdf] [supp] [arXiv]
[bibtex]
@InProceedings{khan_2026_CVPR,
  author    = {Khan, Awais and Farooq, Muhammad Umar and Uddin, Kutub and Malik, Khalid},
  title     = {{TRACE}: Training-Free Partial Audio Deepfake Detection via Embedding Trajectory Analysis of Speech Foundation Models},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {7405--7414},
}

Do Multimodal Foundation Models Truly Generalize? Exposing Failure Modes Across Perception, Reasoning, and Action
Pranav Guruprasad,
Sudipta Chowdhury,
Harsh Sikka,
Mridul Sharma,
Hong Lu,
Sean Rivera,
Hangliang Ren,
Aryan Khurana,
Yangyue Wang
[pdf] [supp]
[bibtex]
@InProceedings{Guruprasad_2026_CVPR,
  author    = {Guruprasad, Pranav and Chowdhury, Sudipta and Sikka, Harsh and Sharma, Mridul and Lu, Hong and Rivera, Sean and Ren, Hangliang and Khurana, Aryan and Wang, Yangyue},
  title     = {Do Multimodal Foundation Models Truly Generalize? {Exposing} Failure Modes Across Perception, Reasoning, and Action},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {7386--7396},
}

Hierarchical Pre-Training of Vision Encoders with Large Language Model
Eugene Lee,
Ting-Yu Chang,
Jui-Huang Tsai,
Jiajie Diao,
Chen-Yi Lee
[pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Lee_2026_CVPR,
  author    = {Lee, Eugene and Chang, Ting-Yu and Tsai, Jui-Huang and Diao, Jiajie and Lee, Chen-Yi},
  title     = {Hierarchical Pre-Training of Vision Encoders with Large Language Model},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {7415--7424},
}

Fine-tuned Hyperbolic CLIP Models are Good Video Learners
Àlex Pujol Vidal,
Sergio Escalera,
Kamal Nasrollahi,
Thomas B. Moeslund
[pdf] [supp]
[bibtex]
@InProceedings{Vidal_2026_CVPR,
  author    = {Pujol Vidal, {\`A}lex and Escalera, Sergio and Nasrollahi, Kamal and Moeslund, Thomas B.},
  title     = {Fine-tuned Hyperbolic {CLIP} Models are Good Video Learners},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {7445--7453},
}

VisTherapy: Text-to-Image Generation of Personalized Therapeutic Visual Stimuli
Abdul Muhsin Hameed,
Taha Razzaq,
Asim Iqbal
[pdf]
[bibtex]
@InProceedings{Hameed_2026_CVPR,
  author    = {Hameed, Abdul Muhsin and Razzaq, Taha and Iqbal, Asim},
  title     = {{VisTherapy}: Text-to-Image Generation of Personalized Therapeutic Visual Stimuli},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {7397--7404},
}