7th MUltimodal Learning and Applications


InVERGe: Intelligent Visual Encoder for Bridging Modalities in Report Generation
Ankan Deria,
Komal Kumar,
Snehashis Chakraborty,
Dwarikanath Mahapatra,
Sudipta Roy
[pdf] [supp]
[bibtex]
@InProceedings{Deria_2024_CVPR,
  author    = {Deria, Ankan and Kumar, Komal and Chakraborty, Snehashis and Mahapatra, Dwarikanath and Roy, Sudipta},
  title     = {{InVERGe}: Intelligent Visual Encoder for Bridging Modalities in Report Generation},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {2028--2038},
}

ZInD-Tell: Towards Translating Indoor Panoramas into Descriptions
Tonmoay Deb,
Lichen Wang,
Zachary Bessinger,
Naji Khosravan,
Eric Penner,
Sing Bing Kang
[pdf] [supp]
[bibtex]
@InProceedings{Deb_2024_CVPR,
  author    = {Deb, Tonmoay and Wang, Lichen and Bessinger, Zachary and Khosravan, Naji and Penner, Eric and Kang, Sing Bing},
  title     = {{ZInD-Tell}: Towards Translating Indoor Panoramas into Descriptions},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {2050--2059},
}

Exploring the Role of Audio in Video Captioning
Yuhan Shen,
Linjie Yang,
Longyin Wen,
Haichao Yu,
Ehsan Elhamifar,
Heng Wang
[pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Shen_2024_CVPR,
  author    = {Shen, Yuhan and Yang, Linjie and Wen, Longyin and Yu, Haichao and Elhamifar, Ehsan and Wang, Heng},
  title     = {Exploring the Role of Audio in Video Captioning},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {2090--2100},
}

Multi-Modal Fusion of Event and RGB for Monocular Depth Estimation Using a Unified Transformer-based Architecture
Anusha Devulapally,
Md Fahim Faysal Khan,
Siddharth Advani,
Vijaykrishnan Narayanan
[pdf] [supp]
[bibtex]
@InProceedings{Devulapally_2024_CVPR,
  author    = {Devulapally, Anusha and Khan, Md Fahim Faysal and Advani, Siddharth and Narayanan, Vijaykrishnan},
  title     = {Multi-Modal Fusion of Event and {RGB} for Monocular Depth Estimation Using a Unified Transformer-based Architecture},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {2081--2089},
}

LAformer: Trajectory Prediction for Autonomous Driving with Lane-Aware Scene Constraints
Mengmeng Liu,
Hao Cheng,
Lin Chen,
Hellward Broszio,
Jiangtao Li,
Runjiang Zhao,
Monika Sester,
Michael Ying Yang
[pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Liu_2024_CVPR,
  author    = {Liu, Mengmeng and Cheng, Hao and Chen, Lin and Broszio, Hellward and Li, Jiangtao and Zhao, Runjiang and Sester, Monika and Yang, Michael Ying},
  title     = {{LAformer}: Trajectory Prediction for Autonomous Driving with Lane-Aware Scene Constraints},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {2039--2049},
}

Cross-Modal Fusion and Attention Mechanism for Weakly Supervised Video Anomaly Detection
Ayush Ghadiya,
Purbayan Kar,
Vishal Chudasama,
Pankaj Wasnik
[pdf]
[bibtex]
@InProceedings{Ghadiya_2024_CVPR,
  author    = {Ghadiya, Ayush and Kar, Purbayan and Chudasama, Vishal and Wasnik, Pankaj},
  title     = {Cross-Modal Fusion and Attention Mechanism for Weakly Supervised Video Anomaly Detection},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {1965--1974},
}

VMCML: Video and Music Matching via Cross-Modality Lifting
Yi-Shan Lee,
Wei-Cheng Tseng,
Fu-En Wang,
Min Sun
[pdf] [arXiv]
[bibtex]
@InProceedings{Lee_2024_CVPR,
  author    = {Lee, Yi-Shan and Tseng, Wei-Cheng and Wang, Fu-En and Sun, Min},
  title     = {{VMCML}: Video and Music Matching via Cross-Modality Lifting},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {2060--2069},
}

AIGeN: An Adversarial Approach for Instruction Generation in VLN
Niyati Rawal,
Roberto Bigazzi,
Lorenzo Baraldi,
Rita Cucchiara
[pdf] [arXiv]
[bibtex]
@InProceedings{Rawal_2024_CVPR,
  author    = {Rawal, Niyati and Bigazzi, Roberto and Baraldi, Lorenzo and Cucchiara, Rita},
  title     = {{AIGeN}: An Adversarial Approach for Instruction Generation in {VLN}},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {2070--2080},
}

De-noised Vision-language Fusion Guided by Visual Cues for E-commerce Product Search
Zhizhang Hu,
Shasha Li,
Ming Du,
Arnab Dhua,
Douglas Gray
[pdf]
[bibtex]
@InProceedings{Hu_2024_CVPR,
  author    = {Hu, Zhizhang and Li, Shasha and Du, Ming and Dhua, Arnab and Gray, Douglas},
  title     = {De-noised Vision-language Fusion Guided by Visual Cues for E-commerce Product Search},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {1986--1996},
}

Leveraging Generative Language Models for Weakly Supervised Sentence Component Analysis in Video-Language Joint Learning
Zaber Ibn Abdul Hakim,
Najibul Haque Sarker,
Rahul Pratap Singh,
Bishmoy Paul,
Ali Dabouei,
Min Xu
[pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Ibn_Abdul_Hakim_2024_CVPR,
  author    = {Ibn Abdul Hakim, Zaber and Sarker, Najibul Haque and Singh, Rahul Pratap and Paul, Bishmoy and Dabouei, Ali and Xu, Min},
  title     = {Leveraging Generative Language Models for Weakly Supervised Sentence Component Analysis in Video-Language Joint Learning},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {1975--1985},
}

RGB-D Cube R-CNN: 3D Object Detection with Selective Modality Dropout
Jens Piekenbrinck,
Alexander Hermans,
Narunas Vaskevicius,
Timm Linder,
Bastian Leibe
[pdf] [supp]
[bibtex]
@InProceedings{Piekenbrinck_2024_CVPR,
  author    = {Piekenbrinck, Jens and Hermans, Alexander and Vaskevicius, Narunas and Linder, Timm and Leibe, Bastian},
  title     = {{RGB-D} {Cube} {R-CNN}: {3D} Object Detection with Selective Modality Dropout},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {1997--2006},
}

Listen Then See: Video Alignment with Speaker Attention
Aviral Agrawal,
Carlos Mateo Samudio Lezcano,
Iqui Balam Heredia-Marin,
Prabhdeep Singh Sethi
[pdf] [arXiv]
[bibtex]
@InProceedings{Agrawal_2024_CVPR,
  author    = {Agrawal, Aviral and Lezcano, Carlos Mateo Samudio and Heredia-Marin, Iqui Balam and Sethi, Prabhdeep Singh},
  title     = {Listen Then See: Video Alignment with Speaker Attention},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {2018--2027},
}

Multimodal Understanding of Memes with Fair Explanations
Yang Zhong,
Bhiman Kumar Baghel
[pdf]
[bibtex]
@InProceedings{Zhong_2024_CVPR,
  author    = {Zhong, Yang and Baghel, Bhiman Kumar},
  title     = {Multimodal Understanding of Memes with Fair Explanations},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {2007--2017},
}