ICCV 2023 Open Access Repository

5th Workshop on Closing the Loop Between Vision and Language

Vision-Language Models Performing Zero-Shot Tasks Exhibit Disparities Between Gender Groups: Melissa Hall,

Laura Gustafson,

Aaron Adcock,

Ishan Misra,

Candace Ross; [pdf]
[bibtex]
@InProceedings{Hall_2023_ICCV,
  author    = {Hall, Melissa and Gustafson, Laura and Adcock, Aaron and Misra, Ishan and Ross, Candace},
  title     = {Vision-Language Models Performing Zero-Shot Tasks Exhibit Disparities Between Gender Groups},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {2778--2785},
}
Multimodal Neurons in Pretrained Text-Only Transformers: Sarah Schwettmann,

Neil Chowdhury,

Samuel Klein,

David Bau,

Antonio Torralba; [pdf] [supp]
[bibtex]
@InProceedings{Schwettmann_2023_ICCV,
  author    = {Schwettmann, Sarah and Chowdhury, Neil and Klein, Samuel and Bau, David and Torralba, Antonio},
  title     = {Multimodal Neurons in Pretrained Text-Only Transformers},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {2862--2867},
}
Explaining Vision and Language Through Graphs of Events in Space and Time: Mihai Masala,

Nicolae Cudlenco,

Traian Rebedea,

Marius Leordeanu; [pdf] [arXiv]
[bibtex]
@InProceedings{Masala_2023_ICCV,
  author    = {Masala, Mihai and Cudlenco, Nicolae and Rebedea, Traian and Leordeanu, Marius},
  title     = {Explaining Vision and Language Through Graphs of Events in Space and Time},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {2826--2831},
}
Sparse Linear Concept Discovery Models: Konstantinos Panagiotis Panousis,

Dino Ienco,

Diego Marcos; [pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Panousis_2023_ICCV,
  author    = {Panousis, Konstantinos Panagiotis and Ienco, Dino and Marcos, Diego},
  title     = {Sparse Linear Concept Discovery Models},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {2767--2771},
}
LLaViLo: Boosting Video Moment Retrieval via Adapter-Based Multimodal Modeling: Kaijing Ma,

Xianghao Zang,

Zerun Feng,

Han Fang,

Chao Ban,

Yuhan Wei,

Zhongjiang He,

Yongxiang Li,

Hao Sun; [pdf]
[bibtex]
@InProceedings{Ma_2023_ICCV,
  author    = {Ma, Kaijing and Zang, Xianghao and Feng, Zerun and Fang, Han and Ban, Chao and Wei, Yuhan and He, Zhongjiang and Li, Yongxiang and Sun, Hao},
  title     = {{LLaViLo}: Boosting Video Moment Retrieval via Adapter-Based Multimodal Modeling},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {2798--2803},
}
An Empirical Study of the Effect of Video Encoders on Temporal Video Grounding: Ignacio M. De la Jara,

Cristian Rodriguez-Opazo,

Edison Marrese-Taylor,

Felipe Bravo-Marquez; [pdf]
[bibtex]
@InProceedings{De_la_Jara_2023_ICCV,
  author    = {De la Jara, Ignacio M. and Rodriguez-Opazo, Cristian and Marrese-Taylor, Edison and Bravo-Marquez, Felipe},
  title     = {An Empirical Study of the Effect of Video Encoders on Temporal Video Grounding},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {2850--2855},
}
PatFig: Generating Short and Long Captions for Patent Figures: Dana Aubakirova,

Kim Gerdes,

Lufei Liu; [pdf] [arXiv]
[bibtex]
@InProceedings{Aubakirova_2023_ICCV,
  author    = {Aubakirova, Dana and Gerdes, Kim and Liu, Lufei},
  title     = {{PatFig}: Generating Short and Long Captions for Patent Figures},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {2843--2849},
}
A Cross-Dataset Study on the Brazilian Sign Language Translation: Amanda Hellen de Avellar Sarmento,

Moacir Antonelli Ponti; [pdf]
[bibtex]
@InProceedings{de_Avellar_Sarmento_2023_ICCV,
  author    = {de Avellar Sarmento, Amanda Hellen and Ponti, Moacir Antonelli},
  title     = {A Cross-Dataset Study on the {Brazilian} Sign Language Translation},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {2816--2820},
}
ECO: Ensembling Context Optimization for Vision-Language Models: Lorenzo Agnolucci,

Alberto Baldrati,

Francesco Todino,

Federico Becattini,

Marco Bertini,

Alberto Del Bimbo; [pdf] [arXiv]
[bibtex]
@InProceedings{Agnolucci_2023_ICCV,
  author    = {Agnolucci, Lorenzo and Baldrati, Alberto and Todino, Francesco and Becattini, Federico and Bertini, Marco and Del Bimbo, Alberto},
  title     = {{ECO}: Ensembling Context Optimization for Vision-Language Models},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {2811--2815},
}
Cross-Modal Dense Passage Retrieval for Outside Knowledge Visual Question Answering: Benjamin Reichman,

Larry Heck; [pdf]
[bibtex]
@InProceedings{Reichman_2023_ICCV,
  author    = {Reichman, Benjamin and Heck, Larry},
  title     = {Cross-Modal Dense Passage Retrieval for Outside Knowledge Visual Question Answering},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {2837--2842},
}
Towards Vision-Language Mechanistic Interpretability: A Causal Tracing Tool for BLIP: Vedant Palit,

Rohan Pandey,

Aryaman Arora,

Paul Pu Liang; [pdf] [arXiv]
[bibtex]
@InProceedings{Palit_2023_ICCV,
  author    = {Palit, Vedant and Pandey, Rohan and Arora, Aryaman and Liang, Paul Pu},
  title     = {Towards Vision-Language Mechanistic Interpretability: A Causal Tracing Tool for {BLIP}},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {2856--2861},
}
Mapping Memes to Words for Multimodal Hateful Meme Classification: Giovanni Burbi,

Alberto Baldrati,

Lorenzo Agnolucci,

Marco Bertini,

Alberto Del Bimbo; [pdf]
[bibtex]
@InProceedings{Burbi_2023_ICCV,
  author    = {Burbi, Giovanni and Baldrati, Alberto and Agnolucci, Lorenzo and Bertini, Marco and Del Bimbo, Alberto},
  title     = {Mapping Memes to Words for Multimodal Hateful Meme Classification},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {2832--2836},
}
BiLMa: Bidirectional Local-Matching for Text-based Person Re-identification: Takuro Fujii,

Shuhei Tarashima; [pdf] [arXiv]
[bibtex]
@InProceedings{Fujii_2023_ICCV,
  author    = {Fujii, Takuro and Tarashima, Shuhei},
  title     = {{BiLMa}: Bidirectional Local-Matching for Text-based Person Re-identification},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {2786--2790},
}
ProVLA: Compositional Image Search with Progressive Vision-Language Alignment and Multimodal Fusion: Zhizhang Hu,

Xinliang Zhu,

Son Tran,

René Vidal,

Arnab Dhua; [pdf]
[bibtex]
@InProceedings{Hu_2023_ICCV,
  author    = {Hu, Zhizhang and Zhu, Xinliang and Tran, Son and Vidal, Ren{\'e} and Dhua, Arnab},
  title     = {{ProVLA}: Compositional Image Search with Progressive Vision-Language Alignment and Multimodal Fusion},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {2772--2777},
}
Context-VQA: Towards Context-Aware and Purposeful Visual Question Answering: Nandita Naik,

Christopher Potts,

Elisa Kreiss; [pdf]
[bibtex]
@InProceedings{Naik_2023_ICCV,
  author    = {Naik, Nandita and Potts, Christopher and Kreiss, Elisa},
  title     = {{Context-VQA}: Towards Context-Aware and Purposeful Visual Question Answering},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {2821--2825},
}
Zero-Shot and Few-Shot Video Question Answering with Multi-Modal Prompts: Deniz Engin,

Yannis Avrithis; [pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Engin_2023_ICCV,
  author    = {Engin, Deniz and Avrithis, Yannis},
  title     = {Zero-Shot and Few-Shot Video Question Answering with Multi-Modal Prompts},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {2804--2810},
}
Alignment and Generation Adapter for Efficient Video-Text Understanding: Han Fang,

Zhifei Yang,

Yuhan Wei,

Xianghao Zang,

Chao Ban,

Zerun Feng,

Zhongjiang He,

Yongxiang Li,

Hao Sun; [pdf]
[bibtex]
@InProceedings{Fang_2023_ICCV,
  author    = {Fang, Han and Yang, Zhifei and Wei, Yuhan and Zang, Xianghao and Ban, Chao and Feng, Zerun and He, Zhongjiang and Li, Yongxiang and Sun, Hao},
  title     = {Alignment and Generation Adapter for Efficient Video-Text Understanding},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {2791--2797},
}