ICCV 2023 Open Access Repository

Vision-and-Language Algorithmic Reasoning Workshop

MMTF: Multi-Modal Temporal Fusion for Commonsense Video Question Answering: Mobeen Ahmad,

Geonwoo Park,

Dongchan Park,

Sanguk Park; [pdf] [supp]
[bibtex]
% ICCV 2023 Workshops paper. Month uses the built-in macro, pages use a "--" range,
% and acronyms are brace-protected so sentence-casing styles keep their capitals.
@InProceedings{Ahmad_2023_ICCV,
  author    = {Ahmad, Mobeen and Park, Geonwoo and Park, Dongchan and Park, Sanguk},
  title     = {{MMTF}: Multi-Modal Temporal Fusion for Commonsense Video Question Answering},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {4657--4662},
}
Understanding Video Scenes Through Text: Insights from Text-Based Video Question Answering: Soumya Jahagirdar,

Minesh Mathew,

Dimosthenis Karatzas,

C. V. Jawahar; [pdf] [supp] [arXiv]
[bibtex]
% ICCV 2023 Workshops paper. Month uses the built-in macro, pages use a "--" range,
% and booktitle acronyms are brace-protected against style recasing.
@InProceedings{Jahagirdar_2023_ICCV,
  author    = {Jahagirdar, Soumya and Mathew, Minesh and Karatzas, Dimosthenis and Jawahar, C. V.},
  title     = {Understanding Video Scenes Through Text: Insights from Text-Based Video Question Answering},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {4646--4650},
}
CLIP-Decoder : ZeroShot Multilabel Classification Using Multimodal CLIP Aligned Representations: Muhammad Ali,

Salman Khan; [pdf] [supp]
[bibtex]
% ICCV 2023 Workshops paper. Month uses the built-in macro, pages use a "--" range.
% "CLIP-Decoder", "ZeroShot" and "CLIP" are brace-protected so sentence-casing styles
% keep their capitals; the title wording and spacing are otherwise kept as given.
@InProceedings{Ali_2023_ICCV,
  author    = {Ali, Muhammad and Khan, Salman},
  title     = {{CLIP-Decoder} : {ZeroShot} Multilabel Classification Using Multimodal {CLIP} Aligned Representations},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {4675--4679},
}
Uni-NLX: Unifying Textual Explanations for Vision and Vision-Language Tasks: Fawaz Sammani,

Nikos Deligiannis; [pdf] [supp]
[bibtex]
% ICCV 2023 Workshops paper. Month uses the built-in macro, pages use a "--" range,
% and the method name "Uni-NLX" is brace-protected against sentence-casing styles.
@InProceedings{Sammani_2023_ICCV,
  author    = {Sammani, Fawaz and Deligiannis, Nikos},
  title     = {{Uni-NLX}: Unifying Textual Explanations for Vision and Vision-Language Tasks},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {4634--4639},
}
Language-Enhanced RNR-Map: Querying Renderable Neural Radiance Field Maps with Natural Language: Francesco Taioli,

Federico Cunico,

Federico Girella,

Riccardo Bologna,

Alessandro Farinelli,

Marco Cristani; [pdf] [supp]
[bibtex]
% ICCV 2023 Workshops paper. Month uses the built-in macro, pages use a "--" range,
% and the name "RNR-Map" is brace-protected against sentence-casing styles.
@InProceedings{Taioli_2023_ICCV,
  author    = {Taioli, Francesco and Cunico, Federico and Girella, Federico and Bologna, Riccardo and Farinelli, Alessandro and Cristani, Marco},
  title     = {Language-Enhanced {RNR-Map}: Querying Renderable Neural Radiance Field Maps with Natural Language},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {4669--4674},
}
SelfGraphVQA: A Self-Supervised Graph Neural Network for Scene-Based Question Answering: Bruno Cesar de Oliveira Souza,

Marius Aasan,

Helio Pedrini,

Adin Ramirez Rivera; [pdf] [supp]
[bibtex]
% ICCV 2023 Workshops paper. Month uses the built-in macro, pages use a "--" range,
% and the camelCase name "SelfGraphVQA" is brace-protected against sentence-casing styles.
% NOTE(review): the last author is stored as "Rivera, Adin Ramirez"; this may be a
% compound surname ("Ramirez Rivera, Adin") -- confirm against the paper before changing.
@InProceedings{de_Oliveira_Souza_2023_ICCV,
  author    = {de Oliveira Souza, Bruno Cesar and Aasan, Marius and Pedrini, Helio and Rivera, Adin Ramirez},
  title     = {{SelfGraphVQA}: A Self-Supervised Graph Neural Network for Scene-Based Question Answering},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {4640--4645},
}
Iterative Robust Visual Grounding with Masked Reference Based Centerpoint Supervision: Menghao Li,

Chunlei Wang,

Wenquan Feng,

Shuchang Lyu,

Guangliang Cheng,

Xiangtai Li,

Binghao Liu,

Qi Zhao; [pdf] [supp] [arXiv]
[bibtex]
% ICCV 2023 Workshops paper. Month uses the built-in macro, pages use a "--" range,
% and booktitle acronyms are brace-protected against style recasing.
@InProceedings{Li_2023_ICCV,
  author    = {Li, Menghao and Wang, Chunlei and Feng, Wenquan and Lyu, Shuchang and Cheng, Guangliang and Li, Xiangtai and Liu, Binghao and Zhao, Qi},
  title     = {Iterative Robust Visual Grounding with Masked Reference Based Centerpoint Supervision},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {4651--4656},
}
Pointing out Human Answer Mistakes in a Goal-Oriented Visual Dialogue: Ryosuke Oshima,

Seitaro Shinagawa,

Hideki Tsunashima,

Qi Feng,

Shigeo Morishima; [pdf] [supp] [arXiv]
[bibtex]
% ICCV 2023 Workshops paper. Month uses the built-in macro, pages use a "--" range,
% and booktitle acronyms are brace-protected against style recasing.
@InProceedings{Oshima_2023_ICCV,
  author    = {Oshima, Ryosuke and Shinagawa, Seitaro and Tsunashima, Hideki and Feng, Qi and Morishima, Shigeo},
  title     = {Pointing out Human Answer Mistakes in a Goal-Oriented Visual Dialogue},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {4663--4668},
}
What If the TV Was Off? Examining Counterfactual Reasoning Abilities of Multi-Modal Language Models: Letian Zhang,

Xiaotong Zhai,

Zhongkai Zhao,

Xin Wen,

Bingchen Zhao; [pdf]
[bibtex]
% ICCV 2023 Workshops paper. Month uses the built-in macro, pages use a "--" range.
% "TV" is brace-protected as an acronym; "Examining" is protected because it starts a
% new sentence after the question mark, which sentence-casing styles would lowercase.
@InProceedings{Zhang_2023_ICCV,
  author    = {Zhang, Letian and Zhai, Xiaotong and Zhao, Zhongkai and Wen, Xin and Zhao, Bingchen},
  title     = {What If the {TV} Was Off? {Examining} Counterfactual Reasoning Abilities of Multi-Modal Language Models},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {4629--4633},
}