6th Workshop on Closing the Loop Between Vision and Language (Decade Mark)


AutoConcept: Unsupervised Extraction of Constituent Concepts from Single Image
Pranav Singh Chib,
Kirtankumar Vijaykumar Patel,
Mudit Gupta,
Pise Ashutosh Kalidas,
Pravendra Singh
[pdf] [supp]
[bibtex]
% Workshop paper in the ICCV 2025 Workshops proceedings.
% month is the bare oct macro so the style can format it; pages use -- for a proper range dash;
% braces keep the camelCase name and the venue acronyms intact under sentence-casing styles.
@InProceedings{Chib_2025_ICCV,
  author    = {Chib, Pranav Singh and Patel, Kirtankumar Vijaykumar and Gupta, Mudit and Kalidas, Pise Ashutosh and Singh, Pravendra},
  title     = {{AutoConcept}: Unsupervised Extraction of Constituent Concepts from Single Image},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {5863--5873},
}

STORM: Token-Efficient Long Video Understanding for Multimodal LLMs
Jindong Jiang,
Xiuyu Li,
Zhijian Liu,
Muyang Li,
Guo Chen,
Zhiqi Li,
De-An Huang,
Guilin Liu,
Zhiding Yu,
Kurt Keutzer,
Sungjin Ahn,
Jan Kautz,
Hongxu Yin,
Yao Lu,
Song Han,
Wonmin Byeon
[pdf] [supp] [arXiv]
[bibtex]
% Workshop paper in the ICCV 2025 Workshops proceedings.
% month is the bare oct macro so the style can format it; pages use -- for a proper range dash;
% braces keep the acronyms (STORM, LLMs, venue names) intact under sentence-casing styles.
@InProceedings{Jiang_2025_ICCV,
  author    = {Jiang, Jindong and Li, Xiuyu and Liu, Zhijian and Li, Muyang and Chen, Guo and Li, Zhiqi and Huang, De-An and Liu, Guilin and Yu, Zhiding and Keutzer, Kurt and Ahn, Sungjin and Kautz, Jan and Yin, Hongxu and Lu, Yao and Han, Song and Byeon, Wonmin},
  title     = {{STORM}: Token-Efficient Long Video Understanding for Multimodal {LLMs}},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {5830--5841},
}

PLOT-TAL: Prompt-Learning with Optimal Transport for Few-Shot Temporal Action Localization
Edward Fish,
Andrew Gilbert
[pdf]
[bibtex]
% Workshop paper in the ICCV 2025 Workshops proceedings.
% month is the bare oct macro so the style can format it; pages use -- for a proper range dash;
% braces keep the acronyms (PLOT-TAL, venue names) intact under sentence-casing styles.
@InProceedings{Fish_2025_ICCV,
  author    = {Fish, Edward and Gilbert, Andrew},
  title     = {{PLOT-TAL}: Prompt-Learning with Optimal Transport for Few-Shot Temporal Action Localization},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {5853--5862},
}

Context-Aware Image Caption Editing via Hallucination-Resistant Visual Instruction Tuning
Yoonhyung Kim,
Byung Ok Kang,
Hwa Jeon Song
[pdf]
[bibtex]
% Workshop paper in the ICCV 2025 Workshops proceedings.
% month is the bare oct macro so the style can format it; pages use -- for a proper range dash;
% braces keep the venue acronyms intact if a style recases the booktitle.
@InProceedings{Kim_2025_ICCV,
  author    = {Kim, Yoonhyung and Kang, Byung Ok and Song, Hwa Jeon},
  title     = {Context-Aware Image Caption Editing via Hallucination-Resistant Visual Instruction Tuning},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {5842--5852},
}

Noise is an Efficient Learner for Zero-Shot Vision-Language Models
Raza Imam,
Asif Hanif,
Jian Zhang,
Khaled Waleed Dawoud,
Yova Kementchedjhieva,
Mohammad Yaqub
[pdf] [supp] [arXiv]
[bibtex]
% Workshop paper in the ICCV 2025 Workshops proceedings.
% month is the bare oct macro so the style can format it; pages use -- for a proper range dash;
% braces keep the venue acronyms intact if a style recases the booktitle.
@InProceedings{Imam_2025_ICCV,
  author    = {Imam, Raza and Hanif, Asif and Zhang, Jian and Dawoud, Khaled Waleed and Kementchedjhieva, Yova and Yaqub, Mohammad},
  title     = {Noise is an Efficient Learner for Zero-Shot Vision-Language Models},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2025},
  pages     = {5820--5829},
}