What is Next in Multimodal Foundation Models?
Mitigating Language Confusion for Multimodal Foundation Models via Confusion-Aware Preference Optimization Pipeline-
[pdf]
[bibtex]@InProceedings{Hwang_2025_ICCV,
  author    = {Hwang, Seunghyun and Lim, Sungjun and Shin, Soyeon and Kim, Hyun-Geun and Lim, Jungwon and Kim, Juncheol and Kang, Byungseok and Myoung, Daewoo},
  title     = {Mitigating Language Confusion for Multimodal Foundation Models via Confusion-Aware Preference Optimization Pipeline},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {4165--4174},
}
Infusing fine-grained visual knowledge to Vision-Language Models-
[pdf]
[arXiv]
[bibtex]@InProceedings{Ypsilantis_2025_ICCV,
  author    = {Ypsilantis, Nikolaos-Antonios and Chen, Kaifeng and Araujo, Andre and Chum, Ondrej},
  title     = {Infusing fine-grained visual knowledge to Vision-Language Models},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {4226--4235},
}
CoT-Pose: Chain-of-Thought Reasoning for 3D Pose Generation from Abstract Prompts-
[pdf]
[bibtex]@InProceedings{Cha_2025_ICCV,
  author    = {Cha, Junuk and Kim, Jihyeon},
  title     = {{CoT-Pose}: Chain-of-Thought Reasoning for {3D} Pose Generation from Abstract Prompts},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {4301--4309},
}
Enhancing Circuit Diagram Understanding via Near Sight Correction Using VLMs-
[pdf]
[bibtex]@InProceedings{Kulkarni_2025_ICCV,
  author    = {Kulkarni, Shreyas and Kumar, Vivek and Minz, Remish Leonard and Varshney, Munender and Samon, Thiruvengadam and Mitra, Abhishek and Kulkarni, Nikhil and Chakravortty, Nilanjan and Mital, Prateek and Banerjee, Kingshuk},
  title     = {Enhancing Circuit Diagram Understanding via Near Sight Correction Using {VLMs}},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {4175--4183},
}
CobraVPS: Code Template Optimization for Better Question Reasoning Accuracy with Visual Program Synthesis-
[pdf]
[supp]
[bibtex]@InProceedings{Chen_2025_ICCV,
  author    = {Chen, Jiajing and Zhang, Xiu and Li, Yang and Zhang, Renyu and Dong, Yujie and Velipasalar, Senem and Zhang, Jing},
  title     = {{CobraVPS}: Code Template Optimization for Better Question Reasoning Accuracy with Visual Program Synthesis},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {4331--4340},
}
What Holds Back Open-Vocabulary Segmentation?-
[pdf]
[bibtex]@InProceedings{Saric_2025_ICCV,
  author    = {{\v{S}}ari{\'c}, Josip and Martinovi{\'c}, Ivan and Kristan, Matej and {\v{S}}egvi{\'c}, Sini{\v{s}}a},
  title     = {What Holds Back Open-Vocabulary Segmentation?},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {4256--4266},
}
Low-Rank Prompt Adaptation for Open-Vocabulary Object Detection-
[pdf]
[supp]
[bibtex]@InProceedings{Zhang_2025_ICCV,
  author    = {Zhang, Zekun and Truong, Vu Quang and Hoai, Minh},
  title     = {Low-Rank Prompt Adaptation for Open-Vocabulary Object Detection},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {4204--4215},
}
Evaluating Variance in Visual Question Answering Benchmarks-
[pdf]
[arXiv]
[bibtex]@InProceedings{Nikitha_2025_ICCV,
  author    = {Nikitha, SR},
  title     = {Evaluating Variance in Visual Question Answering Benchmarks},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {4194--4203},
}
Audio-Visual LLM for Video Understanding-
[pdf]
[arXiv]
[bibtex]@InProceedings{Shu_2025_ICCV,
  author    = {Shu, Fangxun and Zhang, Lei and Jiang, Hao and Xie, Cihang},
  title     = {Audio-Visual {LLM} for Video Understanding},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {4246--4255},
}
Vocabulary-free Fine-grained Visual Recognition via Enriched Contextually Grounded Vision-Language Model-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Demidov_2025_ICCV,
  author    = {Demidov, Dmitry and Zaheer, Muhammad Zaigham and Thawakar, Omkar and Khan, Salman and Khan, Fahad Shahbaz},
  title     = {Vocabulary-free Fine-grained Visual Recognition via Enriched Contextually Grounded Vision-Language Model},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {4216--4225},
}
GLAD: Generalizable Tuning for Vision-Language Models-
[pdf]
[arXiv]
[bibtex]@InProceedings{Peng_2025_ICCV,
  author    = {Peng, Yuqi and Wang, Pengfei and Liu, Jianzhuang and Chen, Shifeng},
  title     = {{GLAD}: Generalizable Tuning for Vision-Language Models},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {4310--4320},
}
Strefer: Empowering Video LLMs with Space-Time Referring and Reasoning via Synthetic Instruction Data-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Zhou_2025_ICCV,
  author    = {Zhou, Honglu and Peng, Xiangyu and Kendre, Shrikant and Ryoo, Michael S. and Savarese, Silvio and Xiong, Caiming and Niebles, Juan Carlos},
  title     = {{Strefer}: Empowering Video {LLMs} with Space-Time Referring and Reasoning via Synthetic Instruction Data},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {4289--4300},
}
MORFI: Mutimodal Zero-Shot Reasoning for Financial Time-Series Inference-
[pdf]
[supp]
[bibtex]@InProceedings{Khezresmaeilzadeh_2025_ICCV,
  author    = {Khezresmaeilzadeh, Tina and Razmara, Parsa and Sadeghi, Mohammad Erfan and Azizi, Seyedarmin and Potraghloo, Erfan Baghaei},
  title     = {{MORFI}: Mutimodal Zero-Shot Reasoning for Financial Time-Series Inference},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {4236--4245},
}
Uncertainty-Aware ControlNet: Bridging Domain Gaps with Synthetic Image Generation-
[pdf]
[arXiv]
[bibtex]@InProceedings{Niemeijer_2025_ICCV,
  author    = {Niemeijer, Joshua and Ehrhardt, Jan and Handels, Heinz and Uzunova, Hristina},
  title     = {Uncertainty-Aware {ControlNet}: Bridging Domain Gaps with Synthetic Image Generation},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {4184--4193},
}
TULIP: Contrastive Image-Text Learning With Richer Vision Understanding-
[pdf]
[supp]
[bibtex]@InProceedings{Tang_2025_ICCV,
  author    = {Tang, Zineng and Lian, Long and Eisape, Seun and Wang, Xudong and Herzig, Roei and Yala, Adam and Suhr, Alane and Darrell, Trevor and Chan, David M.},
  title     = {{TULIP}: Contrastive Image-Text Learning With Richer Vision Understanding},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {4267--4277},
}
Learning by Taking Notes: Memory-Guided Continual Learning for Generative Multimodal Models-
[pdf]
[bibtex]@InProceedings{Guo_2025_ICCV,
  author    = {Guo, Yanhui and Guo, Chenghuan and Gao, Yan and Sun, Yi},
  title     = {Learning by Taking Notes: Memory-Guided Continual Learning for Generative Multimodal Models},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {4341--4351},
}
A Plug-and-Play Approach for Robust Image Editing in Text-to-Image Diffusion Models-
[pdf]
[supp]
[bibtex]@InProceedings{Jo_2025_ICCV,
  author    = {Jo, Hyunwook and Maeng, Jiseung and Park, Jun Hyung and Ahn, Namhyuk and Park, In Kyu},
  title     = {A Plug-and-Play Approach for Robust Image Editing in Text-to-Image Diffusion Models},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {4321--4330},
}
LLaVA-MORE: A Comparative Study of LLMs and Visual Backbones for Enhanced Visual Instruction Tuning-
[pdf]
[bibtex]@InProceedings{Cocchi_2025_ICCV,
  author    = {Cocchi, Federico and Moratelli, Nicholas and Caffagni, Davide and Sarto, Sara and Baraldi, Lorenzo and Cornia, Marcella and Cucchiara, Rita},
  title     = {{LLaVA-MORE}: A Comparative Study of {LLMs} and Visual Backbones for Enhanced Visual Instruction Tuning},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {4278--4288},
}