8th Multimodal Learning and Applications Workshop
Location-Free Scene Graph Generation
[pdf]
[supp]
[bibtex]@InProceedings{Ozsoy_2025_CVPR,
  author    = {{\"O}zsoy, Ege and Holm, Felix and Pellegrini, Chantal and Czempiel, Tobias and Saleh, Mahdi and Navab, Nassir and Busam, Benjamin},
  title     = {Location-Free Scene Graph Generation},
  booktitle = {Proceedings of the Computer Vision and Pattern Recognition Conference ({CVPR}) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {108--117},
}
Multimodal Rationales for Explainable Visual Question Answering
[pdf]
[supp]
[bibtex]@InProceedings{Li_2025_CVPR,
  author    = {Li, Kun and Vosselman, George and Yang, Michael Ying},
  title     = {Multimodal Rationales for Explainable Visual Question Answering},
  booktitle = {Proceedings of the Computer Vision and Pattern Recognition Conference ({CVPR}) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {191--201},
}
Pose-Aware Weakly-Supervised Action Segmentation
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Zhao_2025_CVPR,
  author    = {Zhao, Zhihao and Ghoddoosian, Reza and Dwivedi, Isht and Agarwal, Nakul and Dariush, Behzad},
  title     = {Pose-Aware Weakly-Supervised Action Segmentation},
  booktitle = {Proceedings of the Computer Vision and Pattern Recognition Conference ({CVPR}) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {97--107},
}
TLAC: Two-stage LMM Augmented CLIP for Zero-Shot Classification
[pdf]
[arXiv]
[bibtex]@InProceedings{Munir_2025_CVPR,
  author    = {Munir, Ans and Qureshi, Faisal Z. and Khan, Muhammad Haris and Ali, Mohsen},
  title     = {{TLAC}: Two-stage {LMM} Augmented {CLIP} for Zero-Shot Classification},
  booktitle = {Proceedings of the Computer Vision and Pattern Recognition Conference ({CVPR}) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {159--169},
}
Improving multimodal hateful meme detection exploiting LMM-generated knowledge
[pdf]
[arXiv]
[bibtex]@InProceedings{Tzelepi_2025_CVPR,
  author    = {Tzelepi, Maria and Mezaris, Vasileios},
  title     = {Improving multimodal hateful meme detection exploiting {LMM}-generated knowledge},
  booktitle = {Proceedings of the Computer Vision and Pattern Recognition Conference ({CVPR}) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {202--211},
}
Skin Lesion Classification Using Dermoscopic Images and Clinical Metadata: Insights from Multimodal Models
[pdf]
[bibtex]@InProceedings{Ahammed_2025_CVPR,
  author    = {Ahammed, Sakib and Cui, Xia and Lu, Wenqi and Yap, Moi Hoon},
  title     = {Skin Lesion Classification Using Dermoscopic Images and Clinical Metadata: Insights from Multimodal Models},
  booktitle = {Proceedings of the Computer Vision and Pattern Recognition Conference ({CVPR}) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {222--230},
}
MVCM: Enhancing Multi-View and Cross-Modality Alignment for Medical Visual Question Answering and Medical Image-Text Retrieval
[pdf]
[supp]
[bibtex]@InProceedings{Zou_2025_CVPR,
  author    = {Zou, Yuanhao and Yin, Zhaozheng},
  title     = {{MVCM}: Enhancing Multi-View and Cross-Modality Alignment for Medical Visual Question Answering and Medical Image-Text Retrieval},
  booktitle = {Proceedings of the Computer Vision and Pattern Recognition Conference ({CVPR}) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {180--190},
}
Transformer-Based Lung Infection Severity Prediction with Cross Attention and Conditional TransMix Augmentation
[pdf]
[bibtex]@InProceedings{Slika_2025_CVPR,
  author    = {Slika, Bouthaina and Dornaika, Fadi and Bougourzi, Fares and Hammoudi, Karim},
  title     = {Transformer-Based Lung Infection Severity Prediction with Cross Attention and Conditional {TransMix} Augmentation},
  booktitle = {Proceedings of the Computer Vision and Pattern Recognition Conference ({CVPR}) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {212--221},
}
QID: Efficient Query-Informed ViTs in Data-Scarce Regimes for OCR-free Document Understanding
[pdf]
[supp]
[bibtex]@InProceedings{Le_2025_CVPR,
  author    = {Le, Binh and Xu, Shaoyuan and Fu, Jinmiao and Huang, Zhishen and Li, Moyan and Guo, Yanhui and Li, Hongdong and Ramasinghe, Sameera and Wang, Bryan},
  title     = {{QID}: Efficient Query-Informed {ViTs} in Data-Scarce Regimes for {OCR}-free Document Understanding},
  booktitle = {Proceedings of the Computer Vision and Pattern Recognition Conference ({CVPR}) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {86--96},
}
TRISHUL: Towards Region Identification and Screen Hierarchy Understanding for Large VLM based GUI Agents
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Singh_2025_CVPR,
  author    = {Singh, Kunal and Singh, Shreyas and Khanna, Mukund},
  title     = {{TRISHUL}: Towards Region Identification and Screen Hierarchy Understanding for Large {VLM} based {GUI} Agents},
  booktitle = {Proceedings of the Computer Vision and Pattern Recognition Conference ({CVPR}) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {170--179},
}
Compositional Image-Text Matching and Retrieval by Grounding Entities
[pdf]
[arXiv]
[bibtex]@InProceedings{Vongala_2025_CVPR,
  author    = {Vongala, Madhukar Reddy and Srivastava, Saurabh and Kosecka, Jana},
  title     = {Compositional Image-Text Matching and Retrieval by Grounding Entities},
  booktitle = {Proceedings of the Computer Vision and Pattern Recognition Conference ({CVPR}) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {241--250},
}
Online Gaussian Test-Time Adaptation of Vision-Language Models.
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Fuchs_2025_CVPR,
  author    = {Fuchs, Cl{\'e}ment and Zanella, Maxime and De Vleeschouwer, Christophe},
  title     = {Online {Gaussian} Test-Time Adaptation of Vision-Language Models},
  booktitle = {Proceedings of the Computer Vision and Pattern Recognition Conference ({CVPR}) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {128--137},
}
Vocabulary-free few-shot learning for vision-language models
[pdf]
[arXiv]
[bibtex]@InProceedings{Zanella_2025_CVPR,
  author    = {Zanella, Maxime and Fuchs, Cl{\'e}ment and Ben Ayed, Ismail and De Vleeschouwer, Christophe},
  title     = {Vocabulary-free few-shot learning for vision-language models},
  booktitle = {Proceedings of the Computer Vision and Pattern Recognition Conference ({CVPR}) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {149--158},
}
Exploring Missing Modality in Multimodal Egocentric Datasets
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Ramazanova_2025_CVPR,
  author    = {Ramazanova, Merey and Pardo, Alejandro and Alwassel, Humam and Ghanem, Bernard},
  title     = {Exploring Missing Modality in Multimodal Egocentric Datasets},
  booktitle = {Proceedings of the Computer Vision and Pattern Recognition Conference ({CVPR}) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {75--85},
}
ICT-QA: Question Answering over Multi-modal Contexts including Image, Chart, and Text Modalities
[pdf]
[supp]
[bibtex]@InProceedings{Jang_2025_CVPR,
  author    = {Jang, Youngrok and Kong, Hyesoo and Kim, Gyeonghun and Lee, Yejin and Choi, Jungkyu and Bae, Kyunghoon},
  title     = {{ICT-QA}: Question Answering over Multi-modal Contexts including Image, Chart, and Text Modalities},
  booktitle = {Proceedings of the Computer Vision and Pattern Recognition Conference ({CVPR}) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {138--148},
}
SplatTouch: Explicit 3D Representation Binding Vision and Touch
[pdf]
[bibtex]@InProceedings{Stefani_2025_CVPR,
  author    = {Stefani, Antonio Luigi and Bisagno, Niccol{\`o} and Conci, Nicola and De Natale, Francesco},
  title     = {{SplatTouch}: Explicit {3D} Representation Binding Vision and Touch},
  booktitle = {Proceedings of the Computer Vision and Pattern Recognition Conference ({CVPR}) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {118--127},
}
LVP-CLIP: Revisiting CLIP for Continual Learning with Label Vector Pool
[pdf]
[supp]
[bibtex]@InProceedings{Ma_2025_CVPR,
  author    = {Ma, Yue and Ren, Huantao and Wang, Boyu and Jin, Jingang and Velipasalar, Senem and Qiu, Qinru},
  title     = {{LVP-CLIP}: Revisiting {CLIP} for Continual Learning with Label Vector Pool},
  booktitle = {Proceedings of the Computer Vision and Pattern Recognition Conference ({CVPR}) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {231--240},
}