6th Multi-Modal Learning and Applications Workshop


Multi Event Localization by Audio-Visual Fusion With Omnidirectional Camera and Microphone Array
Wenru Zheng,
Ryota Yoshihashi,
Rei Kawakami,
Ikuro Sato,
Asako Kanezaki
[pdf]
[bibtex]
@InProceedings{Zheng_2023_CVPR,
  author    = {Zheng, Wenru and Yoshihashi, Ryota and Kawakami, Rei and Sato, Ikuro and Kanezaki, Asako},
  title     = {Multi Event Localization by Audio-Visual Fusion With Omnidirectional Camera and Microphone Array},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2023},
  pages     = {2566--2574},
}

Exposing and Mitigating Spurious Correlations for Cross-Modal Retrieval
Jae Myung Kim,
A. Sophia Koepke,
Cordelia Schmid,
Zeynep Akata
[pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Kim_2023_CVPR,
  author    = {Kim, Jae Myung and Koepke, A. Sophia and Schmid, Cordelia and Akata, Zeynep},
  title     = {Exposing and Mitigating Spurious Correlations for Cross-Modal Retrieval},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2023},
  pages     = {2585--2595},
}

Adapting Grounded Visual Question Answering Models to Low Resource Languages
Ying Wang,
Jonas Pfeiffer,
Nicolas Carion,
Yann LeCun,
Aishwarya Kamath
[pdf]
[bibtex]
@InProceedings{Wang_2023_CVPR,
  author    = {Wang, Ying and Pfeiffer, Jonas and Carion, Nicolas and LeCun, Yann and Kamath, Aishwarya},
  title     = {Adapting Grounded Visual Question Answering Models to Low Resource Languages},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2023},
  pages     = {2596--2605},
}

The MONET Dataset: Multimodal Drone Thermal Dataset Recorded in Rural Scenarios
Luigi Riz,
Andrea Caraffa,
Matteo Bortolon,
Mohamed Lamine Mekhalfi,
Davide Boscaini,
André Moura,
José Antunes,
André Dias,
Hugo Silva,
Andreas Leonidou,
Christos Constantinides,
Christos Keleshis,
Dante Abate,
Fabio Poiesi
[pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Riz_2023_CVPR,
  author    = {Riz, Luigi and Caraffa, Andrea and Bortolon, Matteo and Mekhalfi, Mohamed Lamine and Boscaini, Davide and Moura, Andr{\'e} and Antunes, Jos{\'e} and Dias, Andr{\'e} and Silva, Hugo and Leonidou, Andreas and Constantinides, Christos and Keleshis, Christos and Abate, Dante and Poiesi, Fabio},
  title     = {The {MONET} Dataset: Multimodal Drone Thermal Dataset Recorded in Rural Scenarios},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2023},
  pages     = {2546--2554},
}

SSGVS: Semantic Scene Graph-to-Video Synthesis
Yuren Cong,
Jinhui Yi,
Bodo Rosenhahn,
Michael Ying Yang
[pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Cong_2023_CVPR,
  author    = {Cong, Yuren and Yi, Jinhui and Rosenhahn, Bodo and Yang, Michael Ying},
  title     = {{SSGVS}: Semantic Scene Graph-to-Video Synthesis},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2023},
  pages     = {2555--2565},
}

TFRGAN: Leveraging Text Information for Blind Face Restoration With Extreme Degradation
Chengxing Xie,
Qian Ning,
Weisheng Dong,
Guangming Shi
[pdf]
[bibtex]
@InProceedings{Xie_2023_CVPR,
  author    = {Xie, Chengxing and Ning, Qian and Dong, Weisheng and Shi, Guangming},
  title     = {{TFRGAN}: Leveraging Text Information for Blind Face Restoration With Extreme Degradation},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2023},
  pages     = {2535--2545},
}

Dynamic Multimodal Fusion
Zihui Xue,
Radu Marculescu
[pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Xue_2023_CVPR,
  author    = {Xue, Zihui and Marculescu, Radu},
  title     = {Dynamic Multimodal Fusion},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2023},
  pages     = {2575--2584},
}

SEM-POS: Grammatically and Semantically Correct Video Captioning
Asmar Nadeem,
Adrian Hilton,
Robert Dawes,
Graham Thomas,
Armin Mustafa
[pdf]
[bibtex]
@InProceedings{Nadeem_2023_CVPR,
  author    = {Nadeem, Asmar and Hilton, Adrian and Dawes, Robert and Thomas, Graham and Mustafa, Armin},
  title     = {{SEM-POS}: Grammatically and Semantically Correct Video Captioning},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2023},
  pages     = {2606--2616},
}

Robust Multiview Multimodal Driver Monitoring System Using Masked Multi-Head Self-Attention
Yiming Ma,
Victor Sanchez,
Soodeh Nikan,
Devesh Upadhyay,
Bhushan Atote,
Tanaya Guha
[pdf] [arXiv]
[bibtex]
@InProceedings{Ma_2023_CVPR,
  author    = {Ma, Yiming and Sanchez, Victor and Nikan, Soodeh and Upadhyay, Devesh and Atote, Bhushan and Guha, Tanaya},
  title     = {Robust Multiview Multimodal Driver Monitoring System Using Masked Multi-Head Self-Attention},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2023},
  pages     = {2617--2625},
}