2nd Workshop on ``What is Next in Multimodal Foundation Models?''
ICSVR: Investigating Compositional and Syntactic Understanding in Video Retrieval Models
[pdf]
[arXiv]
[bibtex]
@InProceedings{Madasu_2024_CVPR,
  author    = {Madasu, Avinash and Lal, Vasudev},
  title     = {{ICSVR}: Investigating Compositional and Syntactic Understanding in Video Retrieval Models},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {1733--1743},
}
Show Think and Tell: Thought-Augmented Fine-Tuning of Large Language Models for Video Captioning
[pdf]
[supp]
[bibtex]
@InProceedings{Kim_2024_CVPR,
  author    = {Kim, Byoungjip and Hwang, Dasol and Cho, Sungjun and Jang, Youngsoo and Lee, Honglak and Lee, Moontae},
  title     = {Show Think and Tell: Thought-Augmented Fine-Tuning of Large Language Models for Video Captioning},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {1808--1817},
}
LLM-Seg: Bridging Image Segmentation and Large Language Model Reasoning
[pdf]
[bibtex]
@InProceedings{Wang_2024_CVPR,
  author    = {Wang, Junchi and Ke, Lei},
  title     = {{LLM-Seg}: Bridging Image Segmentation and Large Language Model Reasoning},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {1765--1774},
}
Wiki-LLaVA: Hierarchical Retrieval-Augmented Generation for Multimodal LLMs
[pdf]
[bibtex]
@InProceedings{Caffagni_2024_CVPR,
  author    = {Caffagni, Davide and Cocchi, Federico and Moratelli, Nicholas and Sarto, Sara and Cornia, Marcella and Baraldi, Lorenzo and Cucchiara, Rita},
  title     = {{Wiki-LLaVA}: Hierarchical Retrieval-Augmented Generation for Multimodal {LLMs}},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {1818--1826},
}
Probing Conceptual Understanding of Large Visual-Language Models
[pdf]
[supp]
[arXiv]
[bibtex]
@InProceedings{Schiappa_2024_CVPR,
  author    = {Schiappa, Madeline and Abdullah, Raiyaan and Azad, Shehreen and Claypoole, Jared and Cogswell, Michael and Divakaran, Ajay and Rawat, Yogesh},
  title     = {Probing Conceptual Understanding of Large Visual-Language Models},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {1797--1807},
}
Matting Anything
[pdf]
[arXiv]
[bibtex]
@InProceedings{Li_2024_CVPR,
  author    = {Li, Jiachen and Jain, Jitesh and Shi, Humphrey},
  title     = {Matting Anything},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {1775--1785},
}
Continual Diffusion with STAMINA: STack-And-Mask INcremental Adapters
[pdf]
[supp]
[arXiv]
[bibtex]
@InProceedings{Smith_2024_CVPR,
  author    = {Smith, James Seale and Hsu, Yen-Chang and Kira, Zsolt and Shen, Yilin and Jin, Hongxia},
  title     = {Continual Diffusion with {STAMINA}: {STack-And-Mask} {INcremental} Adapters},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {1744--1754},
}
Forget-Me-Not: Learning to Forget in Text-to-Image Diffusion Models
[pdf]
[bibtex]
@InProceedings{Zhang_2024_CVPR,
  author    = {Zhang, Gong and Wang, Kai and Xu, Xingqian and Wang, Zhangyang and Shi, Humphrey},
  title     = {Forget-Me-Not: Learning to Forget in Text-to-Image Diffusion Models},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {1755--1764},
}
Robustness Analysis on Foundational Segmentation Models
[pdf]
[supp]
[arXiv]
[bibtex]
% Key carries a keyword suffix: the plain key Schiappa_2024_CVPR is already
% taken by the earlier "Probing Conceptual Understanding" entry, and a
% repeated key makes BibTeX/Biber skip this entry entirely.
@InProceedings{Schiappa_2024_CVPR_Robustness,
  author    = {Schiappa, Madeline Chantry and Azad, Shehreen and Vs, Sachidanand and Ge, Yunhao and Miksik, Ondrej and Rawat, Yogesh S and Vineet, Vibhav},
  title     = {Robustness Analysis on Foundational Segmentation Models},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {1786--1796},
}
Strategies to Leverage Foundational Model Knowledge in Object Affordance Grounding
[pdf]
[bibtex]
@InProceedings{Rai_2024_CVPR,
  author    = {Rai, Arushi and Buettner, Kyle and Kovashka, Adriana},
  title     = {Strategies to Leverage Foundational Model Knowledge in Object Affordance Grounding},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {1714--1723},
}
Recognize Anything: A Strong Image Tagging Model
[pdf]
[arXiv]
[bibtex]
% Key carries a keyword suffix: the plain key Zhang_2024_CVPR is already
% taken by the earlier "Forget-Me-Not" entry, and a repeated key makes
% BibTeX/Biber skip this entry entirely.
@InProceedings{Zhang_2024_CVPR_RecognizeAnything,
  author    = {Zhang, Youcai and Huang, Xinyu and Ma, Jinyu and Li, Zhaoyang and Luo, Zhaochuan and Xie, Yanchun and Qin, Yuzhuo and Luo, Tong and Li, Yaqian and Liu, Shilong and Guo, Yandong and Zhang, Lei},
  title     = {Recognize Anything: A Strong Image Tagging Model},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {1724--1732},
}
Towards Efficient Audio-Visual Learners via Empowering Pre-trained Vision Transformers with Cross-Modal Adaptation
[pdf]
[supp]
[bibtex]
% Key carries a keyword suffix: the plain key Wang_2024_CVPR is already
% taken by the earlier "LLM-Seg" entry, and a repeated key makes
% BibTeX/Biber skip this entry entirely.
@InProceedings{Wang_2024_CVPR_AudioVisual,
  author    = {Wang, Kai and Tian, Yapeng and Hatzinakos, Dimitrios},
  title     = {Towards Efficient Audio-Visual Learners via Empowering Pre-trained Vision Transformers with Cross-Modal Adaptation},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {1837--1846},
}
Benchmarking Zero-Shot Recognition with Vision-Language Models: Challenges on Granularity and Specificity
[pdf]
[supp]
[arXiv]
[bibtex]
@InProceedings{Xu_2024_CVPR,
  author    = {Xu, Zhenlin and Zhu, Yi and Deng, Siqi and Mittal, Abhay and Chen, Yanbei and Wang, Manchen and Favaro, Paolo and Tighe, Joseph and Modolo, Davide},
  title     = {Benchmarking Zero-Shot Recognition with Vision-Language Models: Challenges on Granularity and Specificity},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {1827--1836},
}