5th Workshop on Closing the Loop Between Vision and Language


Vision-Language Models Performing Zero-Shot Tasks Exhibit Disparities Between Gender Groups
Melissa Hall,
Laura Gustafson,
Aaron Adcock,
Ishan Misra,
Candace Ross
[pdf]
[bibtex]
@InProceedings{Hall_2023_ICCV,
  author    = {Hall, Melissa and Gustafson, Laura and Adcock, Aaron and Misra, Ishan and Ross, Candace},
  title     = {Vision-Language Models Performing Zero-Shot Tasks Exhibit Disparities Between Gender Groups},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {2778--2785},
}

Multimodal Neurons in Pretrained Text-Only Transformers
Sarah Schwettmann,
Neil Chowdhury,
Samuel Klein,
David Bau,
Antonio Torralba
[pdf] [supp]
[bibtex]
@InProceedings{Schwettmann_2023_ICCV,
  author    = {Schwettmann, Sarah and Chowdhury, Neil and Klein, Samuel and Bau, David and Torralba, Antonio},
  title     = {Multimodal Neurons in Pretrained Text-Only Transformers},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {2862--2867},
}

Explaining Vision and Language Through Graphs of Events in Space and Time
Mihai Masala,
Nicolae Cudlenco,
Traian Rebedea,
Marius Leordeanu
[pdf] [arXiv]
[bibtex]
@InProceedings{Masala_2023_ICCV,
  author    = {Masala, Mihai and Cudlenco, Nicolae and Rebedea, Traian and Leordeanu, Marius},
  title     = {Explaining Vision and Language Through Graphs of Events in Space and Time},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {2826--2831},
}

Sparse Linear Concept Discovery Models
Konstantinos Panagiotis Panousis,
Dino Ienco,
Diego Marcos
[pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Panousis_2023_ICCV,
  author    = {Panousis, Konstantinos Panagiotis and Ienco, Dino and Marcos, Diego},
  title     = {Sparse Linear Concept Discovery Models},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {2767--2771},
}

LLaViLo: Boosting Video Moment Retrieval via Adapter-Based Multimodal Modeling
Kaijing Ma,
Xianghao Zang,
Zerun Feng,
Han Fang,
Chao Ban,
Yuhan Wei,
Zhongjiang He,
Yongxiang Li,
Hao Sun
[pdf]
[bibtex]
@InProceedings{Ma_2023_ICCV,
  author    = {Ma, Kaijing and Zang, Xianghao and Feng, Zerun and Fang, Han and Ban, Chao and Wei, Yuhan and He, Zhongjiang and Li, Yongxiang and Sun, Hao},
  title     = {{LLaViLo}: Boosting Video Moment Retrieval via Adapter-Based Multimodal Modeling},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {2798--2803},
}

An Empirical Study of the Effect of Video Encoders on Temporal Video Grounding
Ignacio M. De la Jara,
Cristian Rodriguez-Opazo,
Edison Marrese-Taylor,
Felipe Bravo-Marquez
[pdf]
[bibtex]
@InProceedings{De_la_Jara_2023_ICCV,
  author    = {De la Jara, Ignacio M. and Rodriguez-Opazo, Cristian and Marrese-Taylor, Edison and Bravo-Marquez, Felipe},
  title     = {An Empirical Study of the Effect of Video Encoders on Temporal Video Grounding},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {2850--2855},
}

PatFig: Generating Short and Long Captions for Patent Figures
Dana Aubakirova,
Kim Gerdes,
Lufei Liu
[pdf] [arXiv]
[bibtex]
@InProceedings{Aubakirova_2023_ICCV,
  author    = {Aubakirova, Dana and Gerdes, Kim and Liu, Lufei},
  title     = {{PatFig}: Generating Short and Long Captions for Patent Figures},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {2843--2849},
}

A Cross-Dataset Study on the Brazilian Sign Language Translation
Amanda Hellen de Avellar Sarmento,
Moacir Antonelli Ponti
[pdf]
[bibtex]
@InProceedings{de_Avellar_Sarmento_2023_ICCV,
  author    = {de Avellar Sarmento, Amanda Hellen and Ponti, Moacir Antonelli},
  title     = {A Cross-Dataset Study on the {Brazilian Sign Language} Translation},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {2816--2820},
}

ECO: Ensembling Context Optimization for Vision-Language Models
Lorenzo Agnolucci,
Alberto Baldrati,
Francesco Todino,
Federico Becattini,
Marco Bertini,
Alberto Del Bimbo
[pdf] [arXiv]
[bibtex]
@InProceedings{Agnolucci_2023_ICCV,
  author    = {Agnolucci, Lorenzo and Baldrati, Alberto and Todino, Francesco and Becattini, Federico and Bertini, Marco and Del Bimbo, Alberto},
  title     = {{ECO}: Ensembling Context Optimization for Vision-Language Models},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {2811--2815},
}

Cross-Modal Dense Passage Retrieval for Outside Knowledge Visual Question Answering
Benjamin Reichman,
Larry Heck
[pdf]
[bibtex]
@InProceedings{Reichman_2023_ICCV,
  author    = {Reichman, Benjamin and Heck, Larry},
  title     = {Cross-Modal Dense Passage Retrieval for Outside Knowledge Visual Question Answering},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {2837--2842},
}

Towards Vision-Language Mechanistic Interpretability: A Causal Tracing Tool for BLIP
Vedant Palit,
Rohan Pandey,
Aryaman Arora,
Paul Pu Liang
[pdf] [arXiv]
[bibtex]
@InProceedings{Palit_2023_ICCV,
  author    = {Palit, Vedant and Pandey, Rohan and Arora, Aryaman and Liang, Paul Pu},
  title     = {Towards Vision-Language Mechanistic Interpretability: A Causal Tracing Tool for {BLIP}},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {2856--2861},
}

Mapping Memes to Words for Multimodal Hateful Meme Classification
Giovanni Burbi,
Alberto Baldrati,
Lorenzo Agnolucci,
Marco Bertini,
Alberto Del Bimbo
[pdf]
[bibtex]
@InProceedings{Burbi_2023_ICCV,
  author    = {Burbi, Giovanni and Baldrati, Alberto and Agnolucci, Lorenzo and Bertini, Marco and Del Bimbo, Alberto},
  title     = {Mapping Memes to Words for Multimodal Hateful Meme Classification},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {2832--2836},
}

BiLMa: Bidirectional Local-Matching for Text-based Person Re-identification
Takuro Fujii,
Shuhei Tarashima
[pdf] [arXiv]
[bibtex]
@InProceedings{Fujii_2023_ICCV,
  author    = {Fujii, Takuro and Tarashima, Shuhei},
  title     = {{BiLMa}: Bidirectional Local-Matching for Text-based Person Re-identification},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {2786--2790},
}

ProVLA: Compositional Image Search with Progressive Vision-Language Alignment and Multimodal Fusion
Zhizhang Hu,
Xinliang Zhu,
Son Tran,
René Vidal,
Arnab Dhua
[pdf]
[bibtex]
% Note: the accent in "Vidal, Ren{\'e}" is written as a braced group so that
% BibTeX treats it as a single letter when sorting and building labels.
@InProceedings{Hu_2023_ICCV,
  author    = {Hu, Zhizhang and Zhu, Xinliang and Tran, Son and Vidal, Ren{\'e} and Dhua, Arnab},
  title     = {{ProVLA}: Compositional Image Search with Progressive Vision-Language Alignment and Multimodal Fusion},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {2772--2777},
}

Context-VQA: Towards Context-Aware and Purposeful Visual Question Answering
Nandita Naik,
Christopher Potts,
Elisa Kreiss
[pdf]
[bibtex]
@InProceedings{Naik_2023_ICCV,
  author    = {Naik, Nandita and Potts, Christopher and Kreiss, Elisa},
  title     = {{Context-VQA}: Towards Context-Aware and Purposeful Visual Question Answering},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {2821--2825},
}

Zero-Shot and Few-Shot Video Question Answering with Multi-Modal Prompts
Deniz Engin,
Yannis Avrithis
[pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Engin_2023_ICCV,
  author    = {Engin, Deniz and Avrithis, Yannis},
  title     = {Zero-Shot and Few-Shot Video Question Answering with Multi-Modal Prompts},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {2804--2810},
}

Alignment and Generation Adapter for Efficient Video-Text Understanding
Han Fang,
Zhifei Yang,
Yuhan Wei,
Xianghao Zang,
Chao Ban,
Zerun Feng,
Zhongjiang He,
Yongxiang Li,
Hao Sun
[pdf]
[bibtex]
@InProceedings{Fang_2023_ICCV,
  author    = {Fang, Han and Yang, Zhifei and Wei, Yuhan and Zang, Xianghao and Ban, Chao and Feng, Zerun and He, Zhongjiang and Li, Yongxiang and Sun, Hao},
  title     = {Alignment and Generation Adapter for Efficient Video-Text Understanding},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {2791--2797},
}