2nd Workshop on ``What is Next in Multimodal Foundation Models?''


ICSVR: Investigating Compositional and Syntactic Understanding in Video Retrieval Models
Avinash Madasu,
Vasudev Lal
[pdf] [arXiv]
[bibtex]
% CVPR 2024 Workshops paper. Braced words keep their capitals under
% sentence-casing styles; month uses the standard macro; pages use an en-dash range.
@InProceedings{Madasu_2024_CVPR,
  author    = {Madasu, Avinash and Lal, Vasudev},
  title     = {{ICSVR}: Investigating Compositional and Syntactic Understanding in Video Retrieval Models},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {1733--1743},
}

Show Think and Tell: Thought-Augmented Fine-Tuning of Large Language Models for Video Captioning
Byoungjip Kim,
Dasol Hwang,
Sungjun Cho,
Youngsoo Jang,
Honglak Lee,
Moontae Lee
[pdf] [supp]
[bibtex]
% CVPR 2024 Workshops paper. Month uses the standard macro; pages use an
% en-dash range; venue acronyms are braced so no style can recase them.
@InProceedings{Kim_2024_CVPR,
  author    = {Kim, Byoungjip and Hwang, Dasol and Cho, Sungjun and Jang, Youngsoo and Lee, Honglak and Lee, Moontae},
  title     = {Show Think and Tell: Thought-Augmented Fine-Tuning of Large Language Models for Video Captioning},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {1808--1817},
}

LLM-Seg: Bridging Image Segmentation and Large Language Model Reasoning
Junchi Wang,
Lei Ke
[pdf]
[bibtex]
% CVPR 2024 Workshops paper. Braced words keep their capitals under
% sentence-casing styles; month uses the standard macro; pages use an en-dash range.
% NOTE(review): the key Wang_2024_CVPR is also used in this listing by the entry
% "Towards Efficient Audio-Visual Learners ..." (Wang, Tian, Hatzinakos). The key
% is left unchanged here so existing citations keep working; rename one of the
% two locally before loading both into the same bibliography.
@InProceedings{Wang_2024_CVPR,
  author    = {Wang, Junchi and Ke, Lei},
  title     = {{LLM-Seg}: Bridging Image Segmentation and Large Language Model Reasoning},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {1765--1774},
}

Wiki-LLaVA: Hierarchical Retrieval-Augmented Generation for Multimodal LLMs
Davide Caffagni,
Federico Cocchi,
Nicholas Moratelli,
Sara Sarto,
Marcella Cornia,
Lorenzo Baraldi,
Rita Cucchiara
[pdf]
[bibtex]
% CVPR 2024 Workshops paper. Braced words keep their capitals under
% sentence-casing styles; month uses the standard macro; pages use an en-dash range.
@InProceedings{Caffagni_2024_CVPR,
  author    = {Caffagni, Davide and Cocchi, Federico and Moratelli, Nicholas and Sarto, Sara and Cornia, Marcella and Baraldi, Lorenzo and Cucchiara, Rita},
  title     = {{Wiki-LLaVA}: Hierarchical Retrieval-Augmented Generation for Multimodal {LLMs}},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {1818--1826},
}

Probing Conceptual Understanding of Large Visual-Language Models
Madeline Schiappa,
Raiyaan Abdullah,
Shehreen Azad,
Jared Claypoole,
Michael Cogswell,
Ajay Divakaran,
Yogesh Rawat
[pdf] [supp] [arXiv]
[bibtex]
% CVPR 2024 Workshops paper. Month uses the standard macro; pages use an
% en-dash range; venue acronyms are braced so no style can recase them.
% NOTE(review): the key Schiappa_2024_CVPR is also used in this listing by the
% entry "Robustness Analysis on Foundational Segmentation Models". The key is
% left unchanged here so existing citations keep working; rename one of the two
% locally before loading both into the same bibliography.
@InProceedings{Schiappa_2024_CVPR,
  author    = {Schiappa, Madeline and Abdullah, Raiyaan and Azad, Shehreen and Claypoole, Jared and Cogswell, Michael and Divakaran, Ajay and Rawat, Yogesh},
  title     = {Probing Conceptual Understanding of Large Visual-Language Models},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {1797--1807},
}

Matting Anything
Jiachen Li,
Jitesh Jain,
Humphrey Shi
[pdf] [arXiv]
[bibtex]
% CVPR 2024 Workshops paper. Month uses the standard macro; pages use an
% en-dash range; venue acronyms are braced so no style can recase them.
@InProceedings{Li_2024_CVPR,
  author    = {Li, Jiachen and Jain, Jitesh and Shi, Humphrey},
  title     = {Matting Anything},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {1775--1785},
}

Continual Diffusion with STAMINA: STack-And-Mask INcremental Adapters
James Seale Smith,
Yen-Chang Hsu,
Zsolt Kira,
Yilin Shen,
Hongxia Jin
[pdf] [supp] [arXiv]
[bibtex]
% CVPR 2024 Workshops paper. The acronym and its deliberately mixed-case
% expansion are braced so sentence-casing styles cannot flatten them; month uses
% the standard macro; pages use an en-dash range.
@InProceedings{Smith_2024_CVPR,
  author    = {Smith, James Seale and Hsu, Yen-Chang and Kira, Zsolt and Shen, Yilin and Jin, Hongxia},
  title     = {Continual Diffusion with {STAMINA}: {STack-And-Mask} {INcremental} {Adapters}},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {1744--1754},
}

Forget-Me-Not: Learning to Forget in Text-to-Image Diffusion Models
Gong Zhang,
Kai Wang,
Xingqian Xu,
Zhangyang Wang,
Humphrey Shi
[pdf]
[bibtex]
% CVPR 2024 Workshops paper. The hyphenated name before the colon is braced so
% sentence-casing styles keep all three capitals; month uses the standard macro;
% pages use an en-dash range.
% NOTE(review): the key Zhang_2024_CVPR is also used in this listing by the entry
% "Recognize Anything: A Strong Image Tagging Model". The key is left unchanged
% here so existing citations keep working; rename one of the two locally before
% loading both into the same bibliography.
@InProceedings{Zhang_2024_CVPR,
  author    = {Zhang, Gong and Wang, Kai and Xu, Xingqian and Wang, Zhangyang and Shi, Humphrey},
  title     = {{Forget-Me-Not}: Learning to Forget in Text-to-Image Diffusion Models},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {1755--1764},
}

Robustness Analysis on Foundational Segmentation Models
Madeline Chantry Schiappa,
Shehreen Azad,
Sachidanand Vs,
Yunhao Ge,
Ondrej Miksik,
Yogesh S Rawat,
Vibhav Vineet
[pdf] [supp] [arXiv]
[bibtex]
% CVPR 2024 Workshops paper. Month uses the standard macro; pages use an
% en-dash range; venue acronyms are braced so no style can recase them.
% NOTE(review): the key Schiappa_2024_CVPR is also used in this listing by the
% entry "Probing Conceptual Understanding of Large Visual-Language Models". The
% key is left unchanged here so existing citations keep working; rename one of
% the two locally before loading both into the same bibliography.
@InProceedings{Schiappa_2024_CVPR,
  author    = {Schiappa, Madeline Chantry and Azad, Shehreen and Vs, Sachidanand and Ge, Yunhao and Miksik, Ondrej and Rawat, Yogesh S and Vineet, Vibhav},
  title     = {Robustness Analysis on Foundational Segmentation Models},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {1786--1796},
}

Strategies to Leverage Foundational Model Knowledge in Object Affordance Grounding
Arushi Rai,
Kyle Buettner,
Adriana Kovashka
[pdf]
[bibtex]
% CVPR 2024 Workshops paper. Month uses the standard macro; pages use an
% en-dash range; venue acronyms are braced so no style can recase them.
@InProceedings{Rai_2024_CVPR,
  author    = {Rai, Arushi and Buettner, Kyle and Kovashka, Adriana},
  title     = {Strategies to Leverage Foundational Model Knowledge in Object Affordance Grounding},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {1714--1723},
}

Recognize Anything: A Strong Image Tagging Model
Youcai Zhang,
Xinyu Huang,
Jinyu Ma,
Zhaoyang Li,
Zhaochuan Luo,
Yanchun Xie,
Yuzhuo Qin,
Tong Luo,
Yaqian Li,
Shilong Liu,
Yandong Guo,
Lei Zhang
[pdf] [arXiv]
[bibtex]
% CVPR 2024 Workshops paper. Month uses the standard macro; pages use an
% en-dash range; venue acronyms are braced so no style can recase them.
% NOTE(review): the key Zhang_2024_CVPR is also used in this listing by the entry
% "Forget-Me-Not: Learning to Forget in Text-to-Image Diffusion Models". The key
% is left unchanged here so existing citations keep working; rename one of the
% two locally before loading both into the same bibliography.
@InProceedings{Zhang_2024_CVPR,
  author    = {Zhang, Youcai and Huang, Xinyu and Ma, Jinyu and Li, Zhaoyang and Luo, Zhaochuan and Xie, Yanchun and Qin, Yuzhuo and Luo, Tong and Li, Yaqian and Liu, Shilong and Guo, Yandong and Zhang, Lei},
  title     = {Recognize Anything: A Strong Image Tagging Model},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {1724--1732},
}

Towards Efficient Audio-Visual Learners via Empowering Pre-trained Vision Transformers with Cross-Modal Adaptation
Kai Wang,
Yapeng Tian,
Dimitrios Hatzinakos
[pdf] [supp]
[bibtex]
% CVPR 2024 Workshops paper. Month uses the standard macro; pages use an
% en-dash range; venue acronyms are braced so no style can recase them.
% NOTE(review): the key Wang_2024_CVPR is also used in this listing by the entry
% "LLM-Seg: Bridging Image Segmentation and Large Language Model Reasoning".
% The key is left unchanged here so existing citations keep working; rename one
% of the two locally before loading both into the same bibliography.
@InProceedings{Wang_2024_CVPR,
  author    = {Wang, Kai and Tian, Yapeng and Hatzinakos, Dimitrios},
  title     = {Towards Efficient Audio-Visual Learners via Empowering Pre-trained Vision Transformers with Cross-Modal Adaptation},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {1837--1846},
}

Benchmarking Zero-Shot Recognition with Vision-Language Models: Challenges on Granularity and Specificity
Zhenlin Xu,
Yi Zhu,
Siqi Deng,
Abhay Mittal,
Yanbei Chen,
Manchen Wang,
Paolo Favaro,
Joseph Tighe,
Davide Modolo
[pdf] [supp] [arXiv]
[bibtex]
% CVPR 2024 Workshops paper. Month uses the standard macro; pages use an
% en-dash range; venue acronyms are braced so no style can recase them.
@InProceedings{Xu_2024_CVPR,
  author    = {Xu, Zhenlin and Zhu, Yi and Deng, Siqi and Mittal, Abhay and Chen, Yanbei and Wang, Manchen and Favaro, Paolo and Tighe, Joseph and Modolo, Davide},
  title     = {Benchmarking Zero-Shot Recognition with Vision-Language Models: Challenges on Granularity and Specificity},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {1827--1836},
}