9th Multimodal Learning and Applications Workshop


Non-Contrastive Vision-Language Learning with Predictive Embedding Alignment
Lukas Kuhn,
Giuseppe Serra,
Florian Buettner
[pdf] [arXiv]
[bibtex]
@InProceedings{Kuhn_2026_CVPR,
  author    = {Kuhn, Lukas and Serra, Giuseppe and Buettner, Florian},
  title     = {Non-Contrastive Vision-Language Learning with Predictive Embedding Alignment},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {7494--7502},
}

Multi-Scale Encoding, Gated Fusion, and Temporal Context for Lightweight Wearable Stress Detection
Mukhtiar Ali,
Chulwoo Pack
[pdf]
[bibtex]
@InProceedings{Ali_2026_CVPR,
  author    = {Ali, Mukhtiar and Pack, Chulwoo},
  title     = {Multi-Scale Encoding, Gated Fusion, and Temporal Context for Lightweight Wearable Stress Detection},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {7454--7462},
}

Responses Fall Short of Understanding: Revealing the Gap between Internal Representations and Responses in Visual Document Understanding
Haruka Kawasaki,
Ryota Tanaka,
Kyosuke Nishida
[pdf] [arXiv]
[bibtex]
@InProceedings{Kawasaki_2026_CVPR,
  author    = {Kawasaki, Haruka and Tanaka, Ryota and Nishida, Kyosuke},
  title     = {Responses Fall Short of Understanding: Revealing the Gap between Internal Representations and Responses in Visual Document Understanding},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {7483--7493},
}

Enhancing Vision Text Compression in Vision Language Models via Adaptive Rendering
Janet Jenq,
Hongda Shen
[pdf]
[bibtex]
@InProceedings{Jenq_2026_CVPR,
  author    = {Jenq, Janet and Shen, Hongda},
  title     = {Enhancing Vision Text Compression in Vision Language Models via Adaptive Rendering},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {7473--7482},
}

ViFiCon: Vision and Wireless Association Via Self-Supervised Contrastive Learning
Nicholas Meegan,
Hansi Liu,
Bryan Bo Cao,
Abrar Alali,
Kristin Dana,
Marco Gruteser,
Shubham Jain,
Ashwin Ashok
[pdf] [arXiv]
[bibtex]
% Braces around ViFiCon keep its mixed case under styles that sentence-case titles.
@InProceedings{Meegan_2026_CVPR,
  author    = {Meegan, Nicholas and Liu, Hansi and Cao, Bryan Bo and Alali, Abrar and Dana, Kristin and Gruteser, Marco and Jain, Shubham and Ashok, Ashwin},
  title     = {{ViFiCon}: Vision and Wireless Association Via Self-Supervised Contrastive Learning},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {7503--7513},
}

What Makes Multimodal Meme Classification Reliable? A Large-Scale Controlled Study of Fusion, Encoders, and Modality Reliance
Akshit Sharma,
Prashant W. Patil
[pdf]
[bibtex]
% The braced A keeps the subtitle's first word capitalised after the question mark
% under styles that sentence-case titles.
@InProceedings{Sharma_2026_CVPR,
  author    = {Sharma, Akshit and Patil, Prashant W.},
  title     = {What Makes Multimodal Meme Classification Reliable? {A} Large-Scale Controlled Study of Fusion, Encoders, and Modality Reliance},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {7556--7565},
}

Consistent Yet Wrong: Evidence Insensitivity in Spatial Vision-Language Models
S Divakar Bhat,
Toshihiko Yamasaki
[pdf] [supp]
[bibtex]
@InProceedings{Bhat_2026_CVPR,
  author    = {Bhat, S Divakar and Yamasaki, Toshihiko},
  title     = {Consistent Yet Wrong: Evidence Insensitivity in Spatial Vision-Language Models},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {7463--7472},
}

LUNA-Seg: Learning Language-Guided Low-Rank Adaptation for Night-time Semantic Segmentation
Zhihao Zheng,
Zhen Yao,
Mooi Choo Chuah
[pdf]
[bibtex]
% Braces around LUNA-Seg keep its capitalisation under styles that sentence-case titles.
@InProceedings{Zheng_2026_CVPR,
  author    = {Zheng, Zhihao and Yao, Zhen and Chuah, Mooi Choo},
  title     = {{LUNA-Seg}: Learning Language-Guided Low-Rank Adaptation for Night-time Semantic Segmentation},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {7586--7595},
}

VidText: Towards Comprehensive Evaluation for Video Text Understanding
Zhoufaran Yang,
Yan Shu,
Jing Wang,
Zhifei Yang,
Yan Zhang,
Huaying Yuan,
Yu Li,
Keyang Lu,
Gangyan Zeng,
Shaohui Liu,
Yu Zhou,
Nicu Sebe
[pdf] [supp] [arXiv]
[bibtex]
% Braces around VidText keep its mixed case under styles that sentence-case titles.
@InProceedings{Yang_2026_CVPR,
  author    = {Yang, Zhoufaran and Shu, Yan and Wang, Jing and Yang, Zhifei and Zhang, Yan and Yuan, Huaying and Li, Yu and Lu, Keyang and Zeng, Gangyan and Liu, Shaohui and Zhou, Yu and Sebe, Nicu},
  title     = {{VidText}: Towards Comprehensive Evaluation for Video Text Understanding},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {7575--7585},
}

Bridging the Granularity Gap: Object-Centric Masking for Contextual Visual Learning
Jike Zhong,
Yuxiang Lai,
Konstantinos Psounis
[pdf] [supp]
[bibtex]
@InProceedings{Zhong_2026_CVPR,
  author    = {Zhong, Jike and Lai, Yuxiang and Psounis, Konstantinos},
  title     = {Bridging the Granularity Gap: Object-Centric Masking for Contextual Visual Learning},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {7596--7606},
}

DGM4+: Benchmarking Global Foreground-Background Inconsistencies in Multi-modal Media
Gagandeep Singh,
Samudi Amarasinghe,
Priyanka Singh
[pdf]
[bibtex]
% Braces around DGM4+ keep its capitalisation under styles that sentence-case titles.
@InProceedings{Singh_2026_CVPR,
  author    = {Singh, Gagandeep and Amarasinghe, Samudi and Singh, Priyanka},
  title     = {{DGM4+}: Benchmarking Global Foreground-Background Inconsistencies in Multi-modal Media},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {7566--7574},
}

Probabilistic Multimodal Learning with Bayesian Disagreements
Srikumar Sastry,
Anustup Choudhury,
Pavani Madala,
Guan-Ming Su
[pdf] [supp]
[bibtex]
% Braces around Bayesian keep the proper adjective capitalised under styles that
% sentence-case titles.
@InProceedings{Sastry_2026_CVPR,
  author    = {Sastry, Srikumar and Choudhury, Anustup and Madala, Pavani and Su, Guan-Ming},
  title     = {Probabilistic Multimodal Learning with {Bayesian} Disagreements},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {7546--7555},
}

Segmenting Collision Sound Sources in Egocentric Video
Kranti Kumar Parida,
Omar Emara,
Hazel Doughty,
Dima Damen
[pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Parida_2026_CVPR,
  author    = {Parida, Kranti Kumar and Emara, Omar and Doughty, Hazel and Damen, Dima},
  title     = {Segmenting Collision Sound Sources in Egocentric Video},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {7524--7534},
}

DocRevive: A Unified Pipeline for Document Text Restoration
Kunal Purkayastha,
Ayan Banerjee,
Josep Llados,
Umapada Pal
[pdf] [supp] [arXiv]
[bibtex]
% Braces around DocRevive keep its mixed case under styles that sentence-case titles.
@InProceedings{Purkayastha_2026_CVPR,
  author    = {Purkayastha, Kunal and Banerjee, Ayan and Llados, Josep and Pal, Umapada},
  title     = {{DocRevive}: A Unified Pipeline for Document Text Restoration},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {7535--7545},
}

PRISM: Physics-Informed Multimodal Learning for Battery State-of-Health Estimation from Thermal Imaging and Time-Series Signals
Hazem Adel Musallam,
Enjie Ghorbel,
Samet Hicsonmez,
Lakhdar Mamouri,
Tedjani Mesbahi,
Djamila Aouada
[pdf]
[bibtex]
% Braces around PRISM keep it upper-case under styles that sentence-case titles.
@InProceedings{Musallam_2026_CVPR,
  author    = {Musallam, Hazem Adel and Ghorbel, Enjie and Hicsonmez, Samet and Mamouri, Lakhdar and Mesbahi, Tedjani and Aouada, Djamila},
  title     = {{PRISM}: Physics-Informed Multimodal Learning for Battery State-of-Health Estimation from Thermal Imaging and Time-Series Signals},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {7514--7523},
}