What is Next in Multimodal Foundation Models?


Coarse to Fine Frame Selection for Online Open-Ended Video Question Answering
Vidyaranya Nuthalapati,
Anirudh Tunga
[pdf]
[bibtex]
@InProceedings{Nuthalapati_2023_ICCV,
  author    = {Nuthalapati, Vidyaranya and Tunga, Anirudh},
  title     = {Coarse to Fine Frame Selection for Online Open-Ended Video Question Answering},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {353--361}
}

Retrieving-to-Answer: Zero-Shot Video Question Answering with Frozen Large Language Models
Junting Pan,
Ziyi Lin,
Yuying Ge,
Xiatian Zhu,
Renrui Zhang,
Yi Wang,
Yu Qiao,
Hongsheng Li
[pdf]
[bibtex]
@InProceedings{Pan_2023_ICCV,
  author    = {Pan, Junting and Lin, Ziyi and Ge, Yuying and Zhu, Xiatian and Zhang, Renrui and Wang, Yi and Qiao, Yu and Li, Hongsheng},
  title     = {Retrieving-to-Answer: Zero-Shot Video Question Answering with Frozen Large Language Models},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {272--283}
}

Video-and-Language (VidL) models and their cognitive relevance
Anne Zonneveld,
Albert Gatt,
Iacer Calixto
[pdf] [supp]
[bibtex]
@InProceedings{Zonneveld_2023_ICCV,
  author    = {Zonneveld, Anne and Gatt, Albert and Calixto, Iacer},
  title     = {Video-and-Language ({VidL}) models and their cognitive relevance},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {325--338}
}

Video Attribute Prototype Network: A New Perspective for Zero-Shot Video Classification
Bo Wang,
Kaili Zhao,
Hongyang Zhao,
Shi Pu,
Bo Xiao,
Jun Guo
[pdf]
[bibtex]
@InProceedings{Wang_2023_ICCV,
  author    = {Wang, Bo and Zhao, Kaili and Zhao, Hongyang and Pu, Shi and Xiao, Bo and Guo, Jun},
  title     = {Video Attribute Prototype Network: A New Perspective for Zero-Shot Video Classification},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {315--324}
}

Interaction-Aware Prompting for Zero-Shot Spatio-Temporal Action Detection
Wei-Jhe Huang,
Jheng-Hsien Yeh,
Min-Hung Chen,
Gueter Josmy Faure,
Shang-Hong Lai
[pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Huang_2023_ICCV,
  author    = {Huang, Wei-Jhe and Yeh, Jheng-Hsien and Chen, Min-Hung and Faure, Gueter Josmy and Lai, Shang-Hong},
  title     = {Interaction-Aware Prompting for Zero-Shot Spatio-Temporal Action Detection},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {284--293}
}

ClipCrop: Conditioned Cropping Driven by Vision-Language Model
Zhihang Zhong,
Mingxi Cheng,
Zhirong Wu,
Yuhui Yuan,
Yinqiang Zheng,
Ji Li,
Han Hu,
Stephen Lin,
Yoichi Sato,
Imari Sato
[pdf] [arXiv]
[bibtex]
@InProceedings{Zhong_2023_ICCV,
  author    = {Zhong, Zhihang and Cheng, Mingxi and Wu, Zhirong and Yuan, Yuhui and Zheng, Yinqiang and Li, Ji and Hu, Han and Lin, Stephen and Sato, Yoichi and Sato, Imari},
  title     = {{ClipCrop}: Conditioned Cropping Driven by Vision-Language Model},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {294--304}
}

Towards an Exhaustive Evaluation of Vision-Language Foundation Models
Emmanuelle Salin,
Stéphane Ayache,
Benoit Favre
[pdf] [supp]
[bibtex]
@InProceedings{Salin_2023_ICCV,
  author    = {Salin, Emmanuelle and Ayache, St{\'e}phane and Favre, Benoit},
  title     = {Towards an Exhaustive Evaluation of Vision-Language Foundation Models},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {339--352}
}

Enhancing CLIP with GPT-4: Harnessing Visual Descriptions as Prompts
Mayug Maniparambil,
Chris Vorster,
Derek Molloy,
Noel Murphy,
Kevin McGuinness,
Noel E. O'Connor
[pdf] [supp]
[bibtex]
@InProceedings{Maniparambil_2023_ICCV,
  author    = {Maniparambil, Mayug and Vorster, Chris and Molloy, Derek and Murphy, Noel and McGuinness, Kevin and O'Connor, Noel E.},
  title     = {Enhancing {CLIP} with {GPT-4}: Harnessing Visual Descriptions as Prompts},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {262--271}
}

Painter: Teaching Auto-Regressive Language Models to Draw Sketches
Reza Pourreza,
Apratim Bhattacharyya,
Sunny Panchal,
Mingu Lee,
Pulkit Madan,
Roland Memisevic
[pdf] [arXiv]
[bibtex]
@InProceedings{Pourreza_2023_ICCV,
  author    = {Pourreza, Reza and Bhattacharyya, Apratim and Panchal, Sunny and Lee, Mingu and Madan, Pulkit and Memisevic, Roland},
  title     = {Painter: Teaching Auto-Regressive Language Models to Draw Sketches},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops},
  month     = oct,
  year      = {2023},
  pages     = {305--314}
}