ICCV 2023 Open Access Repository

What is Next in Multimodal Foundation Models?

Coarse to Fine Frame Selection for Online Open-Ended Video Question Answering: Vidyaranya Nuthalapati,

Anirudh Tunga; [pdf]
[bibtex]
@InProceedings{Nuthalapati_2023_ICCV,
  author    = {Nuthalapati, Vidyaranya and Tunga, Anirudh},
  title     = {Coarse to Fine Frame Selection for Online Open-Ended Video Question Answering},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {353--361},
}
Retrieving-to-Answer: Zero-Shot Video Question Answering with Frozen Large Language Models: Junting Pan,

Ziyi Lin,

Yuying Ge,

Xiatian Zhu,

Renrui Zhang,

Yi Wang,

Yu Qiao,

Hongsheng Li; [pdf]
[bibtex]
@InProceedings{Pan_2023_ICCV,
  author    = {Pan, Junting and Lin, Ziyi and Ge, Yuying and Zhu, Xiatian and Zhang, Renrui and Wang, Yi and Qiao, Yu and Li, Hongsheng},
  title     = {Retrieving-to-Answer: Zero-Shot Video Question Answering with Frozen Large Language Models},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {272--283},
}
Video-and-Language (VidL) models and their cognitive relevance: Anne Zonneveld,

Albert Gatt,

Iacer Calixto; [pdf] [supp]
[bibtex]
@InProceedings{Zonneveld_2023_ICCV,
  author    = {Zonneveld, Anne and Gatt, Albert and Calixto, Iacer},
  title     = {Video-and-Language ({VidL}) models and their cognitive relevance},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {325--338},
}
Video Attribute Prototype Network: A New Perspective for Zero-Shot Video Classification: Bo Wang,

Kaili Zhao,

Hongyang Zhao,

Shi Pu,

Bo Xiao,

Jun Guo; [pdf]
[bibtex]
@InProceedings{Wang_2023_ICCV,
  author    = {Wang, Bo and Zhao, Kaili and Zhao, Hongyang and Pu, Shi and Xiao, Bo and Guo, Jun},
  title     = {Video Attribute Prototype Network: A New Perspective for Zero-Shot Video Classification},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {315--324},
}
Interaction-Aware Prompting for Zero-Shot Spatio-Temporal Action Detection: Wei-Jhe Huang,

Jheng-Hsien Yeh,

Min-Hung Chen,

Gueter Josmy Faure,

Shang-Hong Lai; [pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Huang_2023_ICCV,
  author    = {Huang, Wei-Jhe and Yeh, Jheng-Hsien and Chen, Min-Hung and Faure, Gueter Josmy and Lai, Shang-Hong},
  title     = {Interaction-Aware Prompting for Zero-Shot Spatio-Temporal Action Detection},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {284--293},
}
ClipCrop: Conditioned Cropping Driven by Vision-Language Model: Zhihang Zhong,

Mingxi Cheng,

Zhirong Wu,

Yuhui Yuan,

Yinqiang Zheng,

Ji Li,

Han Hu,

Stephen Lin,

Yoichi Sato,

Imari Sato; [pdf] [arXiv]
[bibtex]
@InProceedings{Zhong_2023_ICCV,
  author    = {Zhong, Zhihang and Cheng, Mingxi and Wu, Zhirong and Yuan, Yuhui and Zheng, Yinqiang and Li, Ji and Hu, Han and Lin, Stephen and Sato, Yoichi and Sato, Imari},
  title     = {{ClipCrop}: Conditioned Cropping Driven by Vision-Language Model},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {294--304},
}
Towards an Exhaustive Evaluation of Vision-Language Foundation Models: Emmanuelle Salin,

Stéphane Ayache,

Benoit Favre; [pdf] [supp]
[bibtex]
@InProceedings{Salin_2023_ICCV,
  author    = {Salin, Emmanuelle and Ayache, St{\'e}phane and Favre, Benoit},
  title     = {Towards an Exhaustive Evaluation of Vision-Language Foundation Models},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {339--352},
}
Enhancing CLIP with GPT-4: Harnessing Visual Descriptions as Prompts: Mayug Maniparambil,

Chris Vorster,

Derek Molloy,

Noel Murphy,

Kevin McGuinness,

Noel E. O'Connor; [pdf] [supp]
[bibtex]
@InProceedings{Maniparambil_2023_ICCV,
  author    = {Maniparambil, Mayug and Vorster, Chris and Molloy, Derek and Murphy, Noel and McGuinness, Kevin and O'Connor, Noel E.},
  title     = {Enhancing {CLIP} with {GPT-4}: Harnessing Visual Descriptions as Prompts},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {262--271},
}
Painter: Teaching Auto-Regressive Language Models to Draw Sketches: Reza Pourreza,

Apratim Bhattacharyya,

Sunny Panchal,

Mingu Lee,

Pulkit Madan,

Roland Memisevic; [pdf] [arXiv]
[bibtex]
@InProceedings{Pourreza_2023_ICCV,
  author    = {Pourreza, Reza and Bhattacharyya, Apratim and Panchal, Sunny and Lee, Mingu and Madan, Pulkit and Memisevic, Roland},
  title     = {Painter: Teaching Auto-Regressive Language Models to Draw Sketches},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {305--314},
}