8th Multimodal Learning and Applications Workshop


Location-Free Scene Graph Generation
Ege Özsoy,
Felix Holm,
Chantal Pellegrini,
Tobias Czempiel,
Mahdi Saleh,
Nassir Navab,
Benjamin Busam
[pdf] [supp]
[bibtex]
@InProceedings{Ozsoy_2025_CVPR,
  author    = {{\"O}zsoy, Ege and Holm, Felix and Pellegrini, Chantal and Czempiel, Tobias and Saleh, Mahdi and Navab, Nassir and Busam, Benjamin},
  title     = {Location-Free Scene Graph Generation},
  booktitle = {Proceedings of the Computer Vision and Pattern Recognition Conference (CVPR) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {108--117}
}

Multimodal Rationales for Explainable Visual Question Answering
Kun Li,
George Vosselman,
Michael Ying Yang
[pdf] [supp]
[bibtex]
@InProceedings{Li_2025_CVPR,
  author    = {Li, Kun and Vosselman, George and Yang, Michael Ying},
  title     = {Multimodal Rationales for Explainable Visual Question Answering},
  booktitle = {Proceedings of the Computer Vision and Pattern Recognition Conference (CVPR) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {191--201}
}

Pose-Aware Weakly-Supervised Action Segmentation
Zhihao Zhao,
Reza Ghoddoosian,
Isht Dwivedi,
Nakul Agarwal,
Behzad Dariush
[pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Zhao_2025_CVPR,
  author    = {Zhao, Zhihao and Ghoddoosian, Reza and Dwivedi, Isht and Agarwal, Nakul and Dariush, Behzad},
  title     = {Pose-Aware Weakly-Supervised Action Segmentation},
  booktitle = {Proceedings of the Computer Vision and Pattern Recognition Conference (CVPR) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {97--107}
}

TLAC: Two-stage LMM Augmented CLIP for Zero-Shot Classification
Ans Munir,
Faisal Z. Qureshi,
Muhammad Haris Khan,
Mohsen Ali
[pdf] [arXiv]
[bibtex]
@InProceedings{Munir_2025_CVPR,
  author    = {Munir, Ans and Qureshi, Faisal Z. and Khan, Muhammad Haris and Ali, Mohsen},
  title     = {{TLAC}: Two-stage {LMM} Augmented {CLIP} for Zero-Shot Classification},
  booktitle = {Proceedings of the Computer Vision and Pattern Recognition Conference (CVPR) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {159--169}
}

Improving multimodal hateful meme detection exploiting LMM-generated knowledge
Maria Tzelepi,
Vasileios Mezaris
[pdf] [arXiv]
[bibtex]
@InProceedings{Tzelepi_2025_CVPR,
  author    = {Tzelepi, Maria and Mezaris, Vasileios},
  title     = {Improving multimodal hateful meme detection exploiting {LMM}-generated knowledge},
  booktitle = {Proceedings of the Computer Vision and Pattern Recognition Conference (CVPR) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {202--211}
}

Skin Lesion Classification Using Dermoscopic Images and Clinical Metadata: Insights from Multimodal Models
Sakib Ahammed,
Xia Cui,
Wenqi Lu,
Moi Hoon Yap
[pdf]
[bibtex]
@InProceedings{Ahammed_2025_CVPR,
  author    = {Ahammed, Sakib and Cui, Xia and Lu, Wenqi and Yap, Moi Hoon},
  title     = {Skin Lesion Classification Using Dermoscopic Images and Clinical Metadata: Insights from Multimodal Models},
  booktitle = {Proceedings of the Computer Vision and Pattern Recognition Conference (CVPR) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {222--230}
}

MVCM: Enhancing Multi-View and Cross-Modality Alignment for Medical Visual Question Answering and Medical Image-Text Retrieval
Yuanhao Zou,
Zhaozheng Yin
[pdf] [supp]
[bibtex]
@InProceedings{Zou_2025_CVPR,
  author    = {Zou, Yuanhao and Yin, Zhaozheng},
  title     = {{MVCM}: Enhancing Multi-View and Cross-Modality Alignment for Medical Visual Question Answering and Medical Image-Text Retrieval},
  booktitle = {Proceedings of the Computer Vision and Pattern Recognition Conference (CVPR) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {180--190}
}

Transformer-Based Lung Infection Severity Prediction with Cross Attention and Conditional TransMix Augmentation
Bouthaina Slika,
Fadi Dornaika,
Fares Bougourzi,
Karim Hammoudi
[pdf]
[bibtex]
@InProceedings{Slika_2025_CVPR,
  author    = {Slika, Bouthaina and Dornaika, Fadi and Bougourzi, Fares and Hammoudi, Karim},
  title     = {Transformer-Based Lung Infection Severity Prediction with Cross Attention and Conditional {TransMix} Augmentation},
  booktitle = {Proceedings of the Computer Vision and Pattern Recognition Conference (CVPR) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {212--221}
}

QID: Efficient Query-Informed ViTs in Data-Scarce Regimes for OCR-free Document Understanding
Binh Le,
Shaoyuan Xu,
Jinmiao Fu,
Zhishen Huang,
Moyan Li,
Yanhui Guo,
Hongdong Li,
Sameera Ramasinghe,
Bryan Wang
[pdf] [supp]
[bibtex]
@InProceedings{Le_2025_CVPR,
  author    = {Le, Binh and Xu, Shaoyuan and Fu, Jinmiao and Huang, Zhishen and Li, Moyan and Guo, Yanhui and Li, Hongdong and Ramasinghe, Sameera and Wang, Bryan},
  title     = {{QID}: Efficient Query-Informed {ViTs} in Data-Scarce Regimes for {OCR}-free Document Understanding},
  booktitle = {Proceedings of the Computer Vision and Pattern Recognition Conference (CVPR) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {86--96}
}

TRISHUL: Towards Region Identification and Screen Hierarchy Understanding for Large VLM based GUI Agents
Kunal Singh,
Shreyas Singh,
Mukund Khanna
[pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Singh_2025_CVPR,
  author    = {Singh, Kunal and Singh, Shreyas and Khanna, Mukund},
  title     = {{TRISHUL}: Towards Region Identification and Screen Hierarchy Understanding for Large {VLM} based {GUI} Agents},
  booktitle = {Proceedings of the Computer Vision and Pattern Recognition Conference (CVPR) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {170--179}
}

Compositional Image-Text Matching and Retrieval by Grounding Entities
Madhukar Reddy Vongala,
Saurabh Srivastava,
Jana Kosecka
[pdf] [arXiv]
[bibtex]
@InProceedings{Vongala_2025_CVPR,
  author    = {Vongala, Madhukar Reddy and Srivastava, Saurabh and Kosecka, Jana},
  title     = {Compositional Image-Text Matching and Retrieval by Grounding Entities},
  booktitle = {Proceedings of the Computer Vision and Pattern Recognition Conference (CVPR) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {241--250}
}

Online Gaussian Test-Time Adaptation of Vision-Language Models
Clément Fuchs,
Maxime Zanella,
Christophe De Vleeschouwer
[pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Fuchs_2025_CVPR,
  author    = {Fuchs, Cl{\'e}ment and Zanella, Maxime and De Vleeschouwer, Christophe},
  title     = {Online {Gaussian} Test-Time Adaptation of Vision-Language Models},
  booktitle = {Proceedings of the Computer Vision and Pattern Recognition Conference (CVPR) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {128--137}
}

Vocabulary-free few-shot learning for vision-language models
Maxime Zanella,
Clément Fuchs,
Ismail Ben Ayed,
Christophe De Vleeschouwer
[pdf] [arXiv]
[bibtex]
@InProceedings{Zanella_2025_CVPR,
  author    = {Zanella, Maxime and Fuchs, Cl{\'e}ment and Ben Ayed, Ismail and De Vleeschouwer, Christophe},
  title     = {Vocabulary-free few-shot learning for vision-language models},
  booktitle = {Proceedings of the Computer Vision and Pattern Recognition Conference (CVPR) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {149--158}
}

Exploring Missing Modality in Multimodal Egocentric Datasets
Merey Ramazanova,
Alejandro Pardo,
Humam Alwassel,
Bernard Ghanem
[pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Ramazanova_2025_CVPR,
  author    = {Ramazanova, Merey and Pardo, Alejandro and Alwassel, Humam and Ghanem, Bernard},
  title     = {Exploring Missing Modality in Multimodal Egocentric Datasets},
  booktitle = {Proceedings of the Computer Vision and Pattern Recognition Conference (CVPR) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {75--85}
}

ICT-QA: Question Answering over Multi-modal Contexts including Image, Chart, and Text Modalities
Youngrok Jang,
Hyesoo Kong,
Gyeonghun Kim,
Yejin Lee,
Jungkyu Choi,
Kyunghoon Bae
[pdf] [supp]
[bibtex]
@InProceedings{Jang_2025_CVPR,
  author    = {Jang, Youngrok and Kong, Hyesoo and Kim, Gyeonghun and Lee, Yejin and Choi, Jungkyu and Bae, Kyunghoon},
  title     = {{ICT-QA}: Question Answering over Multi-modal Contexts including Image, Chart, and Text Modalities},
  booktitle = {Proceedings of the Computer Vision and Pattern Recognition Conference (CVPR) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {138--148}
}

SplatTouch: Explicit 3D Representation Binding Vision and Touch
Antonio Luigi Stefani,
Niccolò Bisagno,
Nicola Conci,
Francesco De Natale
[pdf]
[bibtex]
@InProceedings{Stefani_2025_CVPR,
  author    = {Stefani, Antonio Luigi and Bisagno, Niccol{\`o} and Conci, Nicola and De Natale, Francesco},
  title     = {{SplatTouch}: Explicit {3D} Representation Binding Vision and Touch},
  booktitle = {Proceedings of the Computer Vision and Pattern Recognition Conference (CVPR) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {118--127}
}

LVP-CLIP: Revisiting CLIP for Continual Learning with Label Vector Pool
Yue Ma,
Huantao Ren,
Boyu Wang,
Jingang Jin,
Senem Velipasalar,
Qinru Qiu
[pdf] [supp]
[bibtex]
@InProceedings{Ma_2025_CVPR,
  author    = {Ma, Yue and Ren, Huantao and Wang, Boyu and Jin, Jingang and Velipasalar, Senem and Qiu, Qinru},
  title     = {{LVP-CLIP}: Revisiting {CLIP} for Continual Learning with Label Vector Pool},
  booktitle = {Proceedings of the Computer Vision and Pattern Recognition Conference (CVPR) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {231--240}
}