CVPR 2025 Open Access Repository

8th Multimodal Learning and Applications Workshop

Location-Free Scene Graph Generation: Ege Özsoy,

Felix Holm,

Chantal Pellegrini,

Tobias Czempiel,

Mahdi Saleh,

Nassir Navab,

Benjamin Busam; [pdf] [supp]
[bibtex]
@InProceedings{Ozsoy_2025_CVPR,
  author    = {{\"O}zsoy, Ege and Holm, Felix and Pellegrini, Chantal and Czempiel, Tobias and Saleh, Mahdi and Navab, Nassir and Busam, Benjamin},
  title     = {Location-Free Scene Graph Generation},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {108--117},
}
Multimodal Rationales for Explainable Visual Question Answering: Kun Li,

George Vosselman,

Michael Ying Yang; [pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Li_2025_CVPR,
  author    = {Li, Kun and Vosselman, George and Yang, Michael Ying},
  title     = {Multimodal Rationales for Explainable Visual Question Answering},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {191--201},
}
Pose-Aware Weakly-Supervised Action Segmentation: Zhihao Zhao,

Reza Ghoddoosian,

Isht Dwivedi,

Nakul Agarwal,

Behzad Dariush; [pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Zhao_2025_CVPR,
  author    = {Zhao, Zhihao and Ghoddoosian, Reza and Dwivedi, Isht and Agarwal, Nakul and Dariush, Behzad},
  title     = {Pose-Aware Weakly-Supervised Action Segmentation},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {97--107},
}
TLAC: Two-stage LMM Augmented CLIP for Zero-Shot Classification: Ans Munir,

Faisal Z. Qureshi,

Muhammad Haris Khan,

Mohsen Ali; [pdf] [arXiv]
[bibtex]
@InProceedings{Munir_2025_CVPR,
  author    = {Munir, Ans and Qureshi, Faisal Z. and Khan, Muhammad Haris and Ali, Mohsen},
  title     = {{TLAC}: Two-stage {LMM} Augmented {CLIP} for Zero-Shot Classification},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {159--169},
}
Improving multimodal hateful meme detection exploiting LMM-generated knowledge: Maria Tzelepi,

Vasileios Mezaris; [pdf] [arXiv]
[bibtex]
@InProceedings{Tzelepi_2025_CVPR,
  author    = {Tzelepi, Maria and Mezaris, Vasileios},
  title     = {Improving multimodal hateful meme detection exploiting {LMM}-generated knowledge},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {202--211},
}
Skin Lesion Classification Using Dermoscopic Images and Clinical Metadata: Insights from Multimodal Models: Sakib Ahammed,

Xia Cui,

Wenqi Lu,

Moi Hoon Yap; [pdf]
[bibtex]
@InProceedings{Ahammed_2025_CVPR,
  author    = {Ahammed, Sakib and Cui, Xia and Lu, Wenqi and Yap, Moi Hoon},
  title     = {Skin Lesion Classification Using Dermoscopic Images and Clinical Metadata: Insights from Multimodal Models},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {222--230},
}
MVCM: Enhancing Multi-View and Cross-Modality Alignment for Medical Visual Question Answering and Medical Image-Text Retrieval: Yuanhao Zou,

Zhaozheng Yin; [pdf] [supp]
[bibtex]
@InProceedings{Zou_2025_CVPR,
  author    = {Zou, Yuanhao and Yin, Zhaozheng},
  title     = {{MVCM}: Enhancing Multi-View and Cross-Modality Alignment for Medical Visual Question Answering and Medical Image-Text Retrieval},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {180--190},
}
Transformer-Based Lung Infection Severity Prediction with Cross Attention and Conditional TransMix Augmentation: Bouthaina Slika,

Fadi Dornaika,

Fares Bougourzi,

Karim Hammoudi; [pdf]
[bibtex]
@InProceedings{Slika_2025_CVPR,
  author    = {Slika, Bouthaina and Dornaika, Fadi and Bougourzi, Fares and Hammoudi, Karim},
  title     = {Transformer-Based Lung Infection Severity Prediction with Cross Attention and Conditional {TransMix} Augmentation},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {212--221},
}
QID: Efficient Query-Informed ViTs in Data-Scarce Regimes for OCR-free Document Understanding: Binh Le,

Shaoyuan Xu,

Jinmiao Fu,

Zhishen Huang,

Moyan Li,

Yanhui Guo,

Hongdong Li,

Sameera Ramasinghe,

Bryan Wang; [pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Le_2025_CVPR,
  author    = {Le, Binh and Xu, Shaoyuan and Fu, Jinmiao and Huang, Zhishen and Li, Moyan and Guo, Yanhui and Li, Hongdong and Ramasinghe, Sameera and Wang, Bryan},
  title     = {{QID}: Efficient Query-Informed {ViTs} in Data-Scarce Regimes for {OCR}-free Document Understanding},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {86--96},
}
TRISHUL: Towards Region Identification and Screen Hierarchy Understanding for Large VLM based GUI Agents: Kunal Singh,

Shreyas Singh,

Mukund Khanna; [pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Singh_2025_CVPR,
  author    = {Singh, Kunal and Singh, Shreyas and Khanna, Mukund},
  title     = {{TRISHUL}: Towards Region Identification and Screen Hierarchy Understanding for Large {VLM} based {GUI} Agents},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {170--179},
}
Compositional Image-Text Matching and Retrieval by Grounding Entities: Madhukar Reddy Vongala,

Saurabh Srivastava,

Jana Kosecka; [pdf] [arXiv]
[bibtex]
@InProceedings{Vongala_2025_CVPR,
  author    = {Vongala, Madhukar Reddy and Srivastava, Saurabh and Kosecka, Jana},
  title     = {Compositional Image-Text Matching and Retrieval by Grounding Entities},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {241--250},
}
Online Gaussian Test-Time Adaptation of Vision-Language Models.: Clément Fuchs,

Maxime Zanella,

Christophe De Vleeschouwer; [pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Fuchs_2025_CVPR,
  author    = {Fuchs, Cl{\'e}ment and Zanella, Maxime and De Vleeschouwer, Christophe},
  title     = {Online {Gaussian} Test-Time Adaptation of Vision-Language Models},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {128--137},
}
Vocabulary-free few-shot learning for vision-language models: Maxime Zanella,

Clément Fuchs,

Ismail Ben Ayed,

Christophe De Vleeschouwer; [pdf] [arXiv]
[bibtex]
@InProceedings{Zanella_2025_CVPR,
  author    = {Zanella, Maxime and Fuchs, Cl{\'e}ment and Ben Ayed, Ismail and De Vleeschouwer, Christophe},
  title     = {Vocabulary-free few-shot learning for vision-language models},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {149--158},
}
Exploring Missing Modality in Multimodal Egocentric Datasets: Merey Ramazanova,

Alejandro Pardo,

Humam Alwassel,

Bernard Ghanem; [pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Ramazanova_2025_CVPR,
  author    = {Ramazanova, Merey and Pardo, Alejandro and Alwassel, Humam and Ghanem, Bernard},
  title     = {Exploring Missing Modality in Multimodal Egocentric Datasets},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {75--85},
}
ICT-QA: Question Answering over Multi-modal Contexts including Image, Chart, and Text Modalities: Youngrok Jang,

Hyesoo Kong,

Gyeonghun Kim,

Yejin Lee,

Jungkyu Choi,

Kyunghoon Bae; [pdf] [supp]
[bibtex]
@InProceedings{Jang_2025_CVPR,
  author    = {Jang, Youngrok and Kong, Hyesoo and Kim, Gyeonghun and Lee, Yejin and Choi, Jungkyu and Bae, Kyunghoon},
  title     = {{ICT-QA}: Question Answering over Multi-modal Contexts including Image, Chart, and Text Modalities},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {138--148},
}
SplatTouch: Explicit 3D Representation Binding Vision and Touch: Antonio Luigi Stefani,

Niccolò Bisagno,

Nicola Conci,

Francesco De Natale; [pdf]
[bibtex]
@InProceedings{Stefani_2025_CVPR,
  author    = {Stefani, Antonio Luigi and Bisagno, Niccol{\`o} and Conci, Nicola and De Natale, Francesco},
  title     = {{SplatTouch}: Explicit {3D} Representation Binding Vision and Touch},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {118--127},
}
LVP-CLIP: Revisiting CLIP for Continual Learning with Label Vector Pool: Yue Ma,

Huantao Ren,

Boyu Wang,

Jingang Jin,

Senem Velipasalar,

Qinru Qiu; [pdf] [supp]
[bibtex]
@InProceedings{Ma_2025_CVPR,
  author    = {Ma, Yue and Ren, Huantao and Wang, Boyu and Jin, Jingang and Velipasalar, Senem and Qiu, Qinru},
  title     = {{LVP-CLIP}: Revisiting {CLIP} for Continual Learning with Label Vector Pool},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {231--240},
}