Vision-and-Language Algorithmic Reasoning Workshop


MMTF: Multi-Modal Temporal Fusion for Commonsense Video Question Answering
Mobeen Ahmad,
Geonwoo Park,
Dongchan Park,
Sanguk Park
[pdf] [supp]
[bibtex]
@InProceedings{Ahmad_2023_ICCV,
  author    = {Ahmad, Mobeen and Park, Geonwoo and Park, Dongchan and Park, Sanguk},
  title     = {{MMTF}: Multi-Modal Temporal Fusion for Commonsense Video Question Answering},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {4657--4662},
}

Understanding Video Scenes Through Text: Insights from Text-Based Video Question Answering
Soumya Jahagirdar,
Minesh Mathew,
Dimosthenis Karatzas,
C. V. Jawahar
[pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Jahagirdar_2023_ICCV,
  author    = {Jahagirdar, Soumya and Mathew, Minesh and Karatzas, Dimosthenis and Jawahar, C. V.},
  title     = {Understanding Video Scenes Through Text: Insights from Text-Based Video Question Answering},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {4646--4650},
}

CLIP-Decoder : ZeroShot Multilabel Classification Using Multimodal CLIP Aligned Representations
Muhammad Ali,
Salman Khan
[pdf] [supp]
[bibtex]
@InProceedings{Ali_2023_ICCV,
  author    = {Ali, Muhammad and Khan, Salman},
  title     = {{CLIP-Decoder} : {ZeroShot} Multilabel Classification Using Multimodal {CLIP} Aligned Representations},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {4675--4679},
}

Uni-NLX: Unifying Textual Explanations for Vision and Vision-Language Tasks
Fawaz Sammani,
Nikos Deligiannis
[pdf] [supp]
[bibtex]
@InProceedings{Sammani_2023_ICCV,
  author    = {Sammani, Fawaz and Deligiannis, Nikos},
  title     = {{Uni-NLX}: Unifying Textual Explanations for Vision and Vision-Language Tasks},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {4634--4639},
}

Language-Enhanced RNR-Map: Querying Renderable Neural Radiance Field Maps with Natural Language
Francesco Taioli,
Federico Cunico,
Federico Girella,
Riccardo Bologna,
Alessandro Farinelli,
Marco Cristani
[pdf] [supp]
[bibtex]
@InProceedings{Taioli_2023_ICCV,
  author    = {Taioli, Francesco and Cunico, Federico and Girella, Federico and Bologna, Riccardo and Farinelli, Alessandro and Cristani, Marco},
  title     = {Language-Enhanced {RNR-Map}: Querying Renderable Neural Radiance Field Maps with Natural Language},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {4669--4674},
}

SelfGraphVQA: A Self-Supervised Graph Neural Network for Scene-Based Question Answering
Bruno Cesar de Oliveira Souza,
Marius Aasan,
Helio Pedrini,
Adin Ramirez Rivera
[pdf] [supp]
[bibtex]
@InProceedings{de_Oliveira_Souza_2023_ICCV,
  author    = {de Oliveira Souza, Bruno Cesar and Aasan, Marius and Pedrini, Helio and Rivera, Adin Ramirez},
  title     = {{SelfGraphVQA}: A Self-Supervised Graph Neural Network for Scene-Based Question Answering},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {4640--4645},
}

Iterative Robust Visual Grounding with Masked Reference Based Centerpoint Supervision
Menghao Li,
Chunlei Wang,
Wenquan Feng,
Shuchang Lyu,
Guangliang Cheng,
Xiangtai Li,
Binghao Liu,
Qi Zhao
[pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Li_2023_ICCV,
  author    = {Li, Menghao and Wang, Chunlei and Feng, Wenquan and Lyu, Shuchang and Cheng, Guangliang and Li, Xiangtai and Liu, Binghao and Zhao, Qi},
  title     = {Iterative Robust Visual Grounding with Masked Reference Based Centerpoint Supervision},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {4651--4656},
}

Pointing out Human Answer Mistakes in a Goal-Oriented Visual Dialogue
Ryosuke Oshima,
Seitaro Shinagawa,
Hideki Tsunashima,
Qi Feng,
Shigeo Morishima
[pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Oshima_2023_ICCV,
  author    = {Oshima, Ryosuke and Shinagawa, Seitaro and Tsunashima, Hideki and Feng, Qi and Morishima, Shigeo},
  title     = {Pointing out Human Answer Mistakes in a Goal-Oriented Visual Dialogue},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {4663--4668},
}

What If the TV Was Off? Examining Counterfactual Reasoning Abilities of Multi-Modal Language Models
Letian Zhang,
Xiaotong Zhai,
Zhongkai Zhao,
Xin Wen,
Bingchen Zhao
[pdf]
[bibtex]
@InProceedings{Zhang_2023_ICCV,
  author    = {Zhang, Letian and Zhai, Xiaotong and Zhao, Zhongkai and Wen, Xin and Zhao, Bingchen},
  title     = {What If the {TV} Was Off? {Examining} Counterfactual Reasoning Abilities of Multi-Modal Language Models},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {4629--4633},
}