ICCV 2025 Open Access Repository

6th Workshop on Closing the Loop Between Vision and Language (Decade Mark)

Context-Aware Image Caption Editing via Hallucination-Resistant Visual Instruction Tuning: Yoonhyung Kim,

Byung Ok Kang,

Hwa Jeon Song; [pdf]
[bibtex]
@InProceedings{Kim_2025_ICCV,
  author    = {Kim, Yoonhyung and Kang, Byung Ok and Song, Hwa Jeon},
  title     = {Context-Aware Image Caption Editing via Hallucination-Resistant Visual Instruction Tuning},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {5901--5911},
}
STORM: Token-Efficient Long Video Understanding for Multimodal LLMs: Jindong Jiang,

Xiuyu Li,

Zhijian Liu,

Muyang Li,

Guo Chen,

Zhiqi Li,

De-An Huang,

Guilin Liu,

Zhiding Yu,

Kurt Keutzer,

Sungjin Ahn,

Jan Kautz,

Hongxu Yin,

Yao Lu,

Song Han,

Wonmin Byeon; [pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Jiang_2025_ICCV,
  author    = {Jiang, Jindong and Li, Xiuyu and Liu, Zhijian and Li, Muyang and Chen, Guo and Li, Zhiqi and Huang, De-An and Liu, Guilin and Yu, Zhiding and Keutzer, Kurt and Ahn, Sungjin and Kautz, Jan and Yin, Hongxu and Lu, Yao and Han, Song and Byeon, Wonmin},
  title     = {{STORM}: Token-Efficient Long Video Understanding for Multimodal {LLMs}},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {5889--5900},
}
PLOT-TAL: Prompt-Learning with Optimal Transport for Few-Shot Temporal Action Localization: Edward Fish,

Andrew Gilbert; [pdf] [arXiv]
[bibtex]
@InProceedings{Fish_2025_ICCV,
  author    = {Fish, Edward and Gilbert, Andrew},
  title     = {{PLOT-TAL}: Prompt-Learning with Optimal Transport for Few-Shot Temporal Action Localization},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {5912--5921},
}
AutoConcept: Unsupervised Extraction of Constituent Concepts from Single Image: Pranav Singh Chib,

Kirtankumar Vijaykumar Patel,

Mudit Gupta,

Pise Ashutosh Kalidas,

Pravendra Singh; [pdf] [supp]
[bibtex]
@InProceedings{Chib_2025_ICCV,
  author    = {Chib, Pranav Singh and Patel, Kirtankumar Vijaykumar and Gupta, Mudit and Kalidas, Pise Ashutosh and Singh, Pravendra},
  title     = {{AutoConcept}: Unsupervised Extraction of Constituent Concepts from Single Image},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {5922--5932},
}
Noise is an Efficient Learner for Zero-Shot Vision-Language Models: Raza Imam,

Asif Hanif,

Jian Zhang,

Khaled Waleed Dawoud,

Yova Kementchedjhieva,

Mohammad Yaqub; [pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Imam_2025_ICCV,
  author    = {Imam, Raza and Hanif, Asif and Zhang, Jian and Dawoud, Khaled Waleed and Kementchedjhieva, Yova and Yaqub, Mohammad},
  title     = {Noise is an Efficient Learner for Zero-Shot Vision-Language Models},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {5879--5888},
}