What is Next in Multimodal Foundation Models?


Mitigating Language Confusion for Multimodal Foundation Models via Confusion-Aware Preference Optimization Pipeline
Seunghyun Hwang,
Sungjun Lim,
Soyeon Shin,
Hyun-Geun Kim,
Jungwon Lim,
Juncheol Kim,
Byungseok Kang,
Daewoo Myoung
[pdf]
[bibtex]
@InProceedings{Hwang_2025_ICCV,
  author    = {Hwang, Seunghyun and Lim, Sungjun and Shin, Soyeon and Kim, Hyun-Geun and Lim, Jungwon and Kim, Juncheol and Kang, Byungseok and Myoung, Daewoo},
  title     = {Mitigating Language Confusion for Multimodal Foundation Models via Confusion-Aware Preference Optimization Pipeline},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {4165--4174},
}

Infusing fine-grained visual knowledge to Vision-Language Models
Nikolaos-Antonios Ypsilantis,
Kaifeng Chen,
Andre Araujo,
Ondrej Chum
[pdf] [arXiv]
[bibtex]
@InProceedings{Ypsilantis_2025_ICCV,
  author    = {Ypsilantis, Nikolaos-Antonios and Chen, Kaifeng and Araujo, Andre and Chum, Ondrej},
  title     = {Infusing fine-grained visual knowledge to Vision-Language Models},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {4226--4235},
}

CoT-Pose: Chain-of-Thought Reasoning for 3D Pose Generation from Abstract Prompts
Junuk Cha,
Jihyeon Kim
[pdf]
[bibtex]
@InProceedings{Cha_2025_ICCV,
  author    = {Cha, Junuk and Kim, Jihyeon},
  title     = {{CoT-Pose}: Chain-of-Thought Reasoning for {3D} Pose Generation from Abstract Prompts},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {4301--4309},
}

Enhancing Circuit Diagram Understanding via Near Sight Correction Using VLMs
Shreyas Kulkarni,
Vivek Kumar,
Remish Leonard Minz,
Munender Varshney,
Thiruvengadam Samon,
Abhishek Mitra,
Nikhil Kulkarni,
Nilanjan Chakravortty,
Prateek Mital,
Kingshuk Banerjee
[pdf]
[bibtex]
@InProceedings{Kulkarni_2025_ICCV,
  author    = {Kulkarni, Shreyas and Kumar, Vivek and Minz, Remish Leonard and Varshney, Munender and Samon, Thiruvengadam and Mitra, Abhishek and Kulkarni, Nikhil and Chakravortty, Nilanjan and Mital, Prateek and Banerjee, Kingshuk},
  title     = {Enhancing Circuit Diagram Understanding via Near Sight Correction Using {VLMs}},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {4175--4183},
}

CobraVPS: Code Template Optimization for Better Question Reasoning Accuracy with Visual Program Synthesis
Jiajing Chen,
Xiu Zhang,
Yang Li,
Renyu Zhang,
Yujie Dong,
Senem Velipasalar,
Jing Zhang
[pdf] [supp]
[bibtex]
@InProceedings{Chen_2025_ICCV,
  author    = {Chen, Jiajing and Zhang, Xiu and Li, Yang and Zhang, Renyu and Dong, Yujie and Velipasalar, Senem and Zhang, Jing},
  title     = {{CobraVPS}: Code Template Optimization for Better Question Reasoning Accuracy with Visual Program Synthesis},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {4331--4340},
}

What Holds Back Open-Vocabulary Segmentation?
Josip Šarić,
Ivan Martinović,
Matej Kristan,
Siniša Šegvić
[pdf]
[bibtex]
@InProceedings{Saric_2025_ICCV,
  author    = {{\v{S}}ari{\'c}, Josip and Martinovi{\'c}, Ivan and Kristan, Matej and {\v{S}}egvi{\'c}, Sini{\v{s}}a},
  title     = {What Holds Back Open-Vocabulary Segmentation?},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {4256--4266},
}

Low-Rank Prompt Adaptation for Open-Vocabulary Object Detection
Zekun Zhang,
Vu Quang Truong,
Minh Hoai
[pdf] [supp]
[bibtex]
@InProceedings{Zhang_2025_ICCV,
  author    = {Zhang, Zekun and Truong, Vu Quang and Hoai, Minh},
  title     = {Low-Rank Prompt Adaptation for Open-Vocabulary Object Detection},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {4204--4215},
}

Evaluating Variance in Visual Question Answering Benchmarks
Nikitha SR
[pdf] [arXiv]
[bibtex]
% NOTE(review): the listing shows this author as "Nikitha SR"; the entry treats "Nikitha" as the family name and "SR" as the given part. Confirm against the paper before relying on sorting or labels.
@InProceedings{Nikitha_2025_ICCV,
  author    = {Nikitha, SR},
  title     = {Evaluating Variance in Visual Question Answering Benchmarks},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {4194--4203},
}

Audio-Visual LLM for Video Understanding
Fangxun Shu,
Lei Zhang,
Hao Jiang,
Cihang Xie
[pdf] [arXiv]
[bibtex]
@InProceedings{Shu_2025_ICCV,
  author    = {Shu, Fangxun and Zhang, Lei and Jiang, Hao and Xie, Cihang},
  title     = {Audio-Visual {LLM} for Video Understanding},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {4246--4255},
}

Vocabulary-free Fine-grained Visual Recognition via Enriched Contextually Grounded Vision-Language Model
Dmitry Demidov,
Muhammad Zaigham Zaheer,
Omkar Thawakar,
Salman Khan,
Fahad Shahbaz Khan
[pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Demidov_2025_ICCV,
  author    = {Demidov, Dmitry and Zaheer, Muhammad Zaigham and Thawakar, Omkar and Khan, Salman and Khan, Fahad Shahbaz},
  title     = {Vocabulary-free Fine-grained Visual Recognition via Enriched Contextually Grounded Vision-Language Model},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {4216--4225},
}

GLAD: Generalizable Tuning for Vision-Language Models
Yuqi Peng,
Pengfei Wang,
Jianzhuang Liu,
Shifeng Chen
[pdf] [arXiv]
[bibtex]
@InProceedings{Peng_2025_ICCV,
  author    = {Peng, Yuqi and Wang, Pengfei and Liu, Jianzhuang and Chen, Shifeng},
  title     = {{GLAD}: Generalizable Tuning for Vision-Language Models},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {4310--4320},
}

Strefer: Empowering Video LLMs with Space-Time Referring and Reasoning via Synthetic Instruction Data
Honglu Zhou,
Xiangyu Peng,
Shrikant Kendre,
Michael S Ryoo,
Silvio Savarese,
Caiming Xiong,
Juan Carlos Niebles
[pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Zhou_2025_ICCV,
  author    = {Zhou, Honglu and Peng, Xiangyu and Kendre, Shrikant and Ryoo, Michael S and Savarese, Silvio and Xiong, Caiming and Niebles, Juan Carlos},
  title     = {{Strefer}: Empowering Video {LLMs} with Space-Time Referring and Reasoning via Synthetic Instruction Data},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {4289--4300},
}

MORFI: Mutimodal Zero-Shot Reasoning for Financial Time-Series Inference
Tina Khezresmaeilzadeh,
Parsa Razmara,
Mohammad Erfan Sadeghi,
Seyedarmin Azizi,
Erfan Baghaei Potraghloo
[pdf] [supp]
[bibtex]
% NOTE(review): "Mutimodal" in the title looks like a typo for "Multimodal", but the listing title spells it the same way, so it may be the title as published. Confirm against the paper before changing it.
@InProceedings{Khezresmaeilzadeh_2025_ICCV,
  author    = {Khezresmaeilzadeh, Tina and Razmara, Parsa and Sadeghi, Mohammad Erfan and Azizi, Seyedarmin and Potraghloo, Erfan Baghaei},
  title     = {{MORFI}: Mutimodal Zero-Shot Reasoning for Financial Time-Series Inference},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {4236--4245},
}

Uncertainty-Aware ControlNet: Bridging Domain Gaps with Synthetic Image Generation
Joshua Niemeijer,
Jan Ehrhardt,
Heinz Handels,
Hristina Uzunova
[pdf] [arXiv]
[bibtex]
@InProceedings{Niemeijer_2025_ICCV,
  author    = {Niemeijer, Joshua and Ehrhardt, Jan and Handels, Heinz and Uzunova, Hristina},
  title     = {Uncertainty-Aware {ControlNet}: Bridging Domain Gaps with Synthetic Image Generation},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {4184--4193},
}

TULIP: Contrastive Image-Text Learning With Richer Vision Understanding
Zineng Tang,
Long Lian,
Seun Eisape,
Xudong Wang,
Roei Herzig,
Adam Yala,
Alane Suhr,
Trevor Darrell,
David M. Chan
[pdf] [supp]
[bibtex]
@InProceedings{Tang_2025_ICCV,
  author    = {Tang, Zineng and Lian, Long and Eisape, Seun and Wang, Xudong and Herzig, Roei and Yala, Adam and Suhr, Alane and Darrell, Trevor and Chan, David M.},
  title     = {{TULIP}: Contrastive Image-Text Learning With Richer Vision Understanding},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {4267--4277},
}

Learning by Taking Notes: Memory-Guided Continual Learning for Generative Multimodal Models
Yanhui Guo,
Chenghuan Guo,
Yan Gao,
Yi Sun
[pdf]
[bibtex]
@InProceedings{Guo_2025_ICCV,
  author    = {Guo, Yanhui and Guo, Chenghuan and Gao, Yan and Sun, Yi},
  title     = {Learning by Taking Notes: Memory-Guided Continual Learning for Generative Multimodal Models},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {4341--4351},
}

A Plug-and-Play Approach for Robust Image Editing in Text-to-Image Diffusion Models
Hyunwook Jo,
Jiseung Maeng,
Jun Hyung Park,
Namhyuk Ahn,
In Kyu Park
[pdf] [supp]
[bibtex]
@InProceedings{Jo_2025_ICCV,
  author    = {Jo, Hyunwook and Maeng, Jiseung and Park, Jun Hyung and Ahn, Namhyuk and Park, In Kyu},
  title     = {A Plug-and-Play Approach for Robust Image Editing in Text-to-Image Diffusion Models},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {4321--4330},
}

LLaVA-MORE: A Comparative Study of LLMs and Visual Backbones for Enhanced Visual Instruction Tuning
Federico Cocchi,
Nicholas Moratelli,
Davide Caffagni,
Sara Sarto,
Lorenzo Baraldi,
Marcella Cornia,
Rita Cucchiara
[pdf]
[bibtex]
@InProceedings{Cocchi_2025_ICCV,
  author    = {Cocchi, Federico and Moratelli, Nicholas and Caffagni, Davide and Sarto, Sara and Baraldi, Lorenzo and Cornia, Marcella and Cucchiara, Rita},
  title     = {{LLaVA-MORE}: A Comparative Study of {LLMs} and Visual Backbones for Enhanced Visual Instruction Tuning},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {4278--4288},
}