9th Multimodal Learning and Applications Workshop
Non-Contrastive Vision-Language Learning with Predictive Embedding Alignment
[pdf]
[arXiv]
[bibtex]
% month uses the predefined macro and pages uses a "--" range; acronyms are braced to keep their case under sentence-casing styles.
@InProceedings{Kuhn_2026_CVPR,
  author    = {Kuhn, Lukas and Serra, Giuseppe and Buettner, Florian},
  title     = {Non-Contrastive Vision-Language Learning with Predictive Embedding Alignment},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {7494--7502}
}
Multi-Scale Encoding, Gated Fusion, and Temporal Context for Lightweight Wearable Stress Detection
[pdf]
[bibtex]
% month uses the predefined macro and pages uses a "--" range; acronyms are braced to keep their case under sentence-casing styles.
@InProceedings{Ali_2026_CVPR,
  author    = {Ali, Mukhtiar and Pack, Chulwoo},
  title     = {Multi-Scale Encoding, Gated Fusion, and Temporal Context for Lightweight Wearable Stress Detection},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {7454--7462}
}
Responses Fall Short of Understanding: Revealing the Gap between Internal Representations and Responses in Visual Document Understanding
[pdf]
[arXiv]
[bibtex]
% month uses the predefined macro and pages uses a "--" range; acronyms are braced to keep their case under sentence-casing styles.
@InProceedings{Kawasaki_2026_CVPR,
  author    = {Kawasaki, Haruka and Tanaka, Ryota and Nishida, Kyosuke},
  title     = {Responses Fall Short of Understanding: Revealing the Gap between Internal Representations and Responses in Visual Document Understanding},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {7483--7493}
}
Enhancing Vision Text Compression in Vision Language Models via Adaptive Rendering
[pdf]
[bibtex]
% month uses the predefined macro and pages uses a "--" range; acronyms are braced to keep their case under sentence-casing styles.
@InProceedings{Jenq_2026_CVPR,
  author    = {Jenq, Janet and Shen, Hongda},
  title     = {Enhancing Vision Text Compression in Vision Language Models via Adaptive Rendering},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {7473--7482}
}
ViFiCon: Vision and Wireless Association Via Self-Supervised Contrastive Learning
[pdf]
[arXiv]
[bibtex]
% month uses the predefined macro and pages uses a "--" range; the method name and acronyms are braced to keep their case under sentence-casing styles.
@InProceedings{Meegan_2026_CVPR,
  author    = {Meegan, Nicholas and Liu, Hansi and Cao, Bryan Bo and Alali, Abrar and Dana, Kristin and Gruteser, Marco and Jain, Shubham and Ashok, Ashwin},
  title     = {{ViFiCon}: Vision and Wireless Association Via Self-Supervised Contrastive Learning},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {7503--7513}
}
What Makes Multimodal Meme Classification Reliable? A Large-Scale Controlled Study of Fusion, Encoders, and Modality Reliance
[pdf]
[bibtex]
% month uses the predefined macro and pages uses a "--" range; acronyms are braced to keep their case under sentence-casing styles.
% The word after the question mark is braced because sentence-casing styles would otherwise lowercase it.
@InProceedings{Sharma_2026_CVPR,
  author    = {Sharma, Akshit and Patil, Prashant W.},
  title     = {What Makes Multimodal Meme Classification Reliable? {A} Large-Scale Controlled Study of Fusion, Encoders, and Modality Reliance},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {7556--7565}
}
Consistent Yet Wrong: Evidence Insensitivity in Spatial Vision-Language Models
[pdf]
[supp]
[bibtex]
% month uses the predefined macro and pages uses a "--" range; acronyms are braced to keep their case under sentence-casing styles.
@InProceedings{Bhat_2026_CVPR,
  author    = {Bhat, S Divakar and Yamasaki, Toshihiko},
  title     = {Consistent Yet Wrong: Evidence Insensitivity in Spatial Vision-Language Models},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {7463--7472}
}
LUNA-Seg: Learning Language-Guided Low-Rank Adaptation for Night-time Semantic Segmentation
[pdf]
[bibtex]
% month uses the predefined macro and pages uses a "--" range; the method name and acronyms are braced to keep their case under sentence-casing styles.
@InProceedings{Zheng_2026_CVPR,
  author    = {Zheng, Zhihao and Yao, Zhen and Chuah, Mooi Choo},
  title     = {{LUNA-Seg}: Learning Language-Guided Low-Rank Adaptation for Night-time Semantic Segmentation},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {7586--7595}
}
VidText: Towards Comprehensive Evaluation for Video Text Understanding
[pdf]
[supp]
[arXiv]
[bibtex]
% month uses the predefined macro and pages uses a "--" range; the benchmark name and acronyms are braced to keep their case under sentence-casing styles.
@InProceedings{Yang_2026_CVPR,
  author    = {Yang, Zhoufaran and Shu, Yan and Wang, Jing and Yang, Zhifei and Zhang, Yan and Yuan, Huaying and Li, Yu and Lu, Keyang and Zeng, Gangyan and Liu, Shaohui and Zhou, Yu and Sebe, Nicu},
  title     = {{VidText}: Towards Comprehensive Evaluation for Video Text Understanding},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {7575--7585}
}
Bridging the Granularity Gap: Object-Centric Masking for Contextual Visual Learning
[pdf]
[supp]
[bibtex]
% month uses the predefined macro and pages uses a "--" range; acronyms are braced to keep their case under sentence-casing styles.
@InProceedings{Zhong_2026_CVPR,
  author    = {Zhong, Jike and Lai, Yuxiang and Psounis, Konstantinos},
  title     = {Bridging the Granularity Gap: Object-Centric Masking for Contextual Visual Learning},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {7596--7606}
}
DGM4+: Benchmarking Global Foreground-Background Inconsistencies in Multi-modal Media
[pdf]
[bibtex]
% month uses the predefined macro and pages uses a "--" range; the benchmark name and acronyms are braced to keep their case under sentence-casing styles.
@InProceedings{Singh_2026_CVPR,
  author    = {Singh, Gagandeep and Amarasinghe, Samudi and Singh, Priyanka},
  title     = {{DGM4+}: Benchmarking Global Foreground-Background Inconsistencies in Multi-modal Media},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {7566--7574}
}
Probabilistic Multimodal Learning with Bayesian Disagreements
[pdf]
[supp]
[bibtex]
% month uses the predefined macro and pages uses a "--" range; the proper adjective and acronyms are braced to keep their case under sentence-casing styles.
@InProceedings{Sastry_2026_CVPR,
  author    = {Sastry, Srikumar and Choudhury, Anustup and Madala, Pavani and Su, Guan-Ming},
  title     = {Probabilistic Multimodal Learning with {Bayesian} Disagreements},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {7546--7555}
}
Segmenting Collision Sound Sources in Egocentric Video
[pdf]
[supp]
[arXiv]
[bibtex]
% month uses the predefined macro and pages uses a "--" range; acronyms are braced to keep their case under sentence-casing styles.
@InProceedings{Parida_2026_CVPR,
  author    = {Parida, Kranti Kumar and Emara, Omar and Doughty, Hazel and Damen, Dima},
  title     = {Segmenting Collision Sound Sources in Egocentric Video},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {7524--7534}
}
DocRevive: A Unified Pipeline for Document Text Restoration
[pdf]
[supp]
[arXiv]
[bibtex]
% month uses the predefined macro and pages uses a "--" range; the system name and acronyms are braced to keep their case under sentence-casing styles.
@InProceedings{Purkayastha_2026_CVPR,
  author    = {Purkayastha, Kunal and Banerjee, Ayan and Llados, Josep and Pal, Umapada},
  title     = {{DocRevive}: A Unified Pipeline for Document Text Restoration},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {7535--7545}
}
PRISM: Physics-Informed Multimodal Learning for Battery State-of-Health Estimation from Thermal Imaging and Time-Series Signals
[pdf]
[bibtex]
% month uses the predefined macro and pages uses a "--" range; the method name and acronyms are braced to keep their case under sentence-casing styles.
@InProceedings{Musallam_2026_CVPR,
  author    = {Musallam, Hazem Adel and Ghorbel, Enjie and Hicsonmez, Samet and Mamouri, Lakhdar and Mesbahi, Tedjani and Aouada, Djamila},
  title     = {{PRISM}: Physics-Informed Multimodal Learning for Battery State-of-Health Estimation from Thermal Imaging and Time-Series Signals},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {7514--7523}
}

