ICCV 2025 Open Access Repository

What is Next in Multimodal Foundation Models?

Enhancing Circuit Diagram Understanding via Near Sight Correction Using VLMs: Shreyas Kulkarni,

Vivek Kumar,

Remish Leonard Minz,

Munender Varshney,

Thiruvengadam Samon,

Abhishek Mitra,

Nikhil Kulkarni,

Nilanjan Chakravortty,

Prateek Mital,

Kingshuk Banerjee; [pdf]
[bibtex]
@InProceedings{Kulkarni_2025_ICCV,
  author    = {Kulkarni, Shreyas and Kumar, Vivek and Minz, Remish Leonard and Varshney, Munender and Samon, Thiruvengadam and Mitra, Abhishek and Kulkarni, Nikhil and Chakravortty, Nilanjan and Mital, Prateek and Banerjee, Kingshuk},
  title     = {Enhancing Circuit Diagram Understanding via Near Sight Correction Using {VLMs}},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {4234--4242},
}
Audio-Visual LLM for Video Understanding: Fangxun Shu,

Lei Zhang,

Hao Jiang,

Cihang Xie; [pdf] [arXiv]
[bibtex]
@InProceedings{Shu_2025_ICCV,
  author    = {Shu, Fangxun and Zhang, Lei and Jiang, Hao and Xie, Cihang},
  title     = {Audio-Visual {LLM} for Video Understanding},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {4305--4314},
}
LLaVA-MORE: A Comparative Study of LLMs and Visual Backbones for Enhanced Visual Instruction Tuning: Federico Cocchi,

Nicholas Moratelli,

Davide Caffagni,

Sara Sarto,

Lorenzo Baraldi,

Marcella Cornia,

Rita Cucchiara; [pdf] [arXiv]
[bibtex]
@InProceedings{Cocchi_2025_ICCV,
  author    = {Cocchi, Federico and Moratelli, Nicholas and Caffagni, Davide and Sarto, Sara and Baraldi, Lorenzo and Cornia, Marcella and Cucchiara, Rita},
  title     = {{LLaVA-MORE}: A Comparative Study of {LLMs} and Visual Backbones for Enhanced Visual Instruction Tuning},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {4337--4347},
}
Low-Rank Prompt Adaptation for Open-Vocabulary Object Detection: Zekun Zhang,

Vu Quang Truong,

Minh Hoai; [pdf] [supp]
[bibtex]
@InProceedings{Zhang_2025_ICCV,
  author    = {Zhang, Zekun and Truong, Vu Quang and Hoai, Minh},
  title     = {Low-Rank Prompt Adaptation for Open-Vocabulary Object Detection},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {4263--4274},
}
CobraVPS: Code Template Optimization for Better Question Reasoning Accuracy with Visual Program Synthesis: Jiajing Chen,

Xiu Zhang,

Yang Li,

Renyu Zhang,

Yujie Dong,

Senem Velipasalar,

Jing Zhang; [pdf] [supp]
[bibtex]
@InProceedings{Chen_2025_ICCV,
  author    = {Chen, Jiajing and Zhang, Xiu and Li, Yang and Zhang, Renyu and Dong, Yujie and Velipasalar, Senem and Zhang, Jing},
  title     = {{CobraVPS}: Code Template Optimization for Better Question Reasoning Accuracy with Visual Program Synthesis},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {4390--4399},
}
A Plug-and-Play Approach for Robust Image Editing in Text-to-Image Diffusion Models: Hyunwook Jo,

Jiseung Maeng,

Jun Hyung Park,

Namhyuk Ahn,

In Kyu Park; [pdf] [supp]
[bibtex]
@InProceedings{Jo_2025_ICCV,
  author    = {Jo, Hyunwook and Maeng, Jiseung and Park, Jun Hyung and Ahn, Namhyuk and Park, In Kyu},
  title     = {A Plug-and-Play Approach for Robust Image Editing in Text-to-Image Diffusion Models},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {4380--4389},
}
Strefer: Empowering Video LLMs with Space-Time Referring and Reasoning via Synthetic Instruction Data: Honglu Zhou,

Xiangyu Peng,

Shrikant Kendre,

Michael S Ryoo,

Silvio Savarese,

Caiming Xiong,

Juan Carlos Niebles; [pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Zhou_2025_ICCV,
  author    = {Zhou, Honglu and Peng, Xiangyu and Kendre, Shrikant and Ryoo, Michael S and Savarese, Silvio and Xiong, Caiming and Niebles, Juan Carlos},
  title     = {{Strefer}: Empowering Video {LLMs} with Space-Time Referring and Reasoning via Synthetic Instruction Data},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {4348--4359},
}
Uncertainty-Aware ControlNet: Bridging Domain Gaps with Synthetic Image Generation: Joshua Niemeijer,

Jan Ehrhardt,

Heinz Handels,

Hristina Uzunova; [pdf] [arXiv]
[bibtex]
@InProceedings{Niemeijer_2025_ICCV,
  author    = {Niemeijer, Joshua and Ehrhardt, Jan and Handels, Heinz and Uzunova, Hristina},
  title     = {Uncertainty-Aware {ControlNet}: Bridging Domain Gaps with Synthetic Image Generation},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {4243--4252},
}
Vocabulary-free Fine-grained Visual Recognition via Enriched Contextually Grounded Vision-Language Model: Dmitry Demidov,

Muhammad Zaigham Zaheer,

Omkar Thawakar,

Salman Khan,

Fahad Shahbaz Khan; [pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Demidov_2025_ICCV,
  author    = {Demidov, Dmitry and Zaheer, Muhammad Zaigham and Thawakar, Omkar and Khan, Salman and Khan, Fahad Shahbaz},
  title     = {Vocabulary-free Fine-grained Visual Recognition via Enriched Contextually Grounded Vision-Language Model},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {4275--4284},
}
Evaluating Variance in Visual Question Answering Benchmarks: Nikitha SR; [pdf] [arXiv]
[bibtex]
@InProceedings{Nikitha_2025_ICCV,
  author    = {Nikitha, SR},
  title     = {Evaluating Variance in Visual Question Answering Benchmarks},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {4253--4262},
}
Infusing fine-grained visual knowledge to Vision-Language Models: Nikolaos-Antonios Ypsilantis,

Kaifeng Chen,

Andre Araujo,

Ondrej Chum; [pdf] [arXiv]
[bibtex]
@InProceedings{Ypsilantis_2025_ICCV,
  author    = {Ypsilantis, Nikolaos-Antonios and Chen, Kaifeng and Araujo, Andre and Chum, Ondrej},
  title     = {Infusing fine-grained visual knowledge to Vision-Language Models},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {4285--4294},
}
Learning by Taking Notes: Memory-Guided Continual Learning for Generative Multimodal Models: Yanhui Guo,

Chenghuan Guo,

Yan Gao,

Yi Sun; [pdf]
[bibtex]
@InProceedings{Guo_2025_ICCV,
  author    = {Guo, Yanhui and Guo, Chenghuan and Gao, Yan and Sun, Yi},
  title     = {Learning by Taking Notes: Memory-Guided Continual Learning for Generative Multimodal Models},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {4400--4410},
}
GLAD: Generalizable Tuning for Vision-Language Models: Yuqi Peng,

Pengfei Wang,

Jianzhuang Liu,

Shifeng Chen; [pdf] [arXiv]
[bibtex]
@InProceedings{Peng_2025_ICCV,
  author    = {Peng, Yuqi and Wang, Pengfei and Liu, Jianzhuang and Chen, Shifeng},
  title     = {{GLAD}: Generalizable Tuning for Vision-Language Models},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {4369--4379},
}
Mitigating Language Confusion for Multimodal Foundation Models via Confusion-Aware Preference Optimization Pipeline: Seunghyun Hwang,

Sungjun Lim,

Soyeon Shin,

Hyun-Geun Kim,

Jungwon Lim,

Juncheol Kim,

Byungseok Kang,

Daewoo Myoung; [pdf]
[bibtex]
@InProceedings{Hwang_2025_ICCV,
  author    = {Hwang, Seunghyun and Lim, Sungjun and Shin, Soyeon and Kim, Hyun-Geun and Lim, Jungwon and Kim, Juncheol and Kang, Byungseok and Myoung, Daewoo},
  title     = {Mitigating Language Confusion for Multimodal Foundation Models via Confusion-Aware Preference Optimization Pipeline},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {4224--4233},
}
CoT-Pose: Chain-of-Thought Reasoning for 3D Pose Generation from Abstract Prompts: Junuk Cha,

Jihyeon Kim; [pdf] [arXiv]
[bibtex]
@InProceedings{Cha_2025_ICCV,
  author    = {Cha, Junuk and Kim, Jihyeon},
  title     = {{CoT-Pose}: Chain-of-Thought Reasoning for {3D} Pose Generation from Abstract Prompts},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {4360--4368},
}
TULIP: Contrastive Image-Text Learning With Richer Vision Understanding: Zineng Tang,

Long Lian,

Seun Eisape,

Xudong Wang,

Roei Herzig,

Adam Yala,

Alane Suhr,

Trevor Darrell,

David M. Chan; [pdf] [supp]
[bibtex]
@InProceedings{Tang_2025_ICCV,
  author    = {Tang, Zineng and Lian, Long and Eisape, Seun and Wang, Xudong and Herzig, Roei and Yala, Adam and Suhr, Alane and Darrell, Trevor and Chan, David M.},
  title     = {{TULIP}: Contrastive Image-Text Learning With Richer Vision Understanding},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {4326--4336},
}
What Holds Back Open-Vocabulary Segmentation?: Josip Šarić,

Ivan Martinović,

Matej Kristan,

Siniša Šegvić; [pdf]
[bibtex]
@InProceedings{Saric_2025_ICCV,
  author    = {{\v{S}}ari{\'c}, Josip and Martinovi{\'c}, Ivan and Kristan, Matej and {\v{S}}egvi{\'c}, Sini{\v{s}}a},
  title     = {What Holds Back Open-Vocabulary Segmentation?},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {4315--4325},
}
MORFI: Mutimodal Zero-Shot Reasoning for Financial Time-Series Inference: Tina Khezresmaeilzadeh,

Parsa Razmara,

Mohammad Erfan Sadeghi,

Seyedarmin Azizi,

Erfan Baghaei Potraghloo; [pdf] [supp]
[bibtex]
@InProceedings{Khezresmaeilzadeh_2025_ICCV,
  author    = {Khezresmaeilzadeh, Tina and Razmara, Parsa and Sadeghi, Mohammad Erfan and Azizi, Seyedarmin and Potraghloo, Erfan Baghaei},
  title     = {{MORFI}: Mutimodal Zero-Shot Reasoning for Financial Time-Series Inference},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {4295--4304},
}