CVPR 2024 Open Access Repository

7th MUltimodal Learning and Applications

InVERGe: Intelligent Visual Encoder for Bridging Modalities in Report Generation: Ankan Deria,

Komal Kumar,

Snehashis Chakraborty,

Dwarikanath Mahapatra,

Sudipta Roy; [pdf] [supp]
[bibtex]
@InProceedings{Deria_2024_CVPR,
  author    = {Deria, Ankan and Kumar, Komal and Chakraborty, Snehashis and Mahapatra, Dwarikanath and Roy, Sudipta},
  title     = {{InVERGe}: Intelligent Visual Encoder for Bridging Modalities in Report Generation},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {2028--2038},
}
ZInD-Tell: Towards Translating Indoor Panoramas into Descriptions: Tonmoay Deb,

Lichen Wang,

Zachary Bessinger,

Naji Khosravan,

Eric Penner,

Sing Bing Kang; [pdf] [supp]
[bibtex]
@InProceedings{Deb_2024_CVPR,
  author    = {Deb, Tonmoay and Wang, Lichen and Bessinger, Zachary and Khosravan, Naji and Penner, Eric and Kang, Sing Bing},
  title     = {{ZInD-Tell}: Towards Translating Indoor Panoramas into Descriptions},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {2050--2059},
}
Exploring the Role of Audio in Video Captioning: Yuhan Shen,

Linjie Yang,

Longyin Wen,

Haichao Yu,

Ehsan Elhamifar,

Heng Wang; [pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Shen_2024_CVPR,
  author    = {Shen, Yuhan and Yang, Linjie and Wen, Longyin and Yu, Haichao and Elhamifar, Ehsan and Wang, Heng},
  title     = {Exploring the Role of Audio in Video Captioning},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {2090--2100},
}
Multi-Modal Fusion of Event and RGB for Monocular Depth Estimation Using a Unified Transformer-based Architecture: Anusha Devulapally,

Md Fahim Faysal Khan,

Siddharth Advani,

Vijaykrishnan Narayanan; [pdf] [supp]
[bibtex]
@InProceedings{Devulapally_2024_CVPR,
  author    = {Devulapally, Anusha and Khan, Md Fahim Faysal and Advani, Siddharth and Narayanan, Vijaykrishnan},
  title     = {Multi-Modal Fusion of Event and {RGB} for Monocular Depth Estimation Using a Unified Transformer-based Architecture},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {2081--2089},
}
LAformer: Trajectory Prediction for Autonomous Driving with Lane-Aware Scene Constraints: Mengmeng Liu,

Hao Cheng,

Lin Chen,

Hellward Broszio,

Jiangtao Li,

Runjiang Zhao,

Monika Sester,

Michael Ying Yang; [pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Liu_2024_CVPR,
  author    = {Liu, Mengmeng and Cheng, Hao and Chen, Lin and Broszio, Hellward and Li, Jiangtao and Zhao, Runjiang and Sester, Monika and Yang, Michael Ying},
  title     = {{LAformer}: Trajectory Prediction for Autonomous Driving with Lane-Aware Scene Constraints},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {2039--2049},
}
Cross-Modal Fusion and Attention Mechanism for Weakly Supervised Video Anomaly Detection: Ayush Ghadiya,

Purbayan Kar,

Vishal Chudasama,

Pankaj Wasnik; [pdf]
[bibtex]
@InProceedings{Ghadiya_2024_CVPR,
  author    = {Ghadiya, Ayush and Kar, Purbayan and Chudasama, Vishal and Wasnik, Pankaj},
  title     = {Cross-Modal Fusion and Attention Mechanism for Weakly Supervised Video Anomaly Detection},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {1965--1974},
}
VMCML: Video and Music Matching via Cross-Modality Lifting: Yi-Shan Lee,

Wei-Cheng Tseng,

Fu-En Wang,

Min Sun; [pdf] [arXiv]
[bibtex]
@InProceedings{Lee_2024_CVPR,
  author    = {Lee, Yi-Shan and Tseng, Wei-Cheng and Wang, Fu-En and Sun, Min},
  title     = {{VMCML}: Video and Music Matching via Cross-Modality Lifting},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {2060--2069},
}
AIGeN: An Adversarial Approach for Instruction Generation in VLN: Niyati Rawal,

Roberto Bigazzi,

Lorenzo Baraldi,

Rita Cucchiara; [pdf] [arXiv]
[bibtex]
@InProceedings{Rawal_2024_CVPR,
  author    = {Rawal, Niyati and Bigazzi, Roberto and Baraldi, Lorenzo and Cucchiara, Rita},
  title     = {{AIGeN}: An Adversarial Approach for Instruction Generation in {VLN}},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {2070--2080},
}
De-noised Vision-language Fusion Guided by Visual Cues for E-commerce Product Search: Zhizhang Hu,

Shasha Li,

Ming Du,

Arnab Dhua,

Douglas Gray; [pdf]
[bibtex]
@InProceedings{Hu_2024_CVPR,
  author    = {Hu, Zhizhang and Li, Shasha and Du, Ming and Dhua, Arnab and Gray, Douglas},
  title     = {De-noised Vision-language Fusion Guided by Visual Cues for E-commerce Product Search},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {1986--1996},
}
Leveraging Generative Language Models for Weakly Supervised Sentence Component Analysis in Video-Language Joint Learning: Zaber Ibn Abdul Hakim,

Najibul Haque Sarker,

Rahul Pratap Singh,

Bishmoy Paul,

Ali Dabouei,

Min Xu; [pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Ibn_Abdul_Hakim_2024_CVPR,
  author    = {Ibn Abdul Hakim, Zaber and Sarker, Najibul Haque and Singh, Rahul Pratap and Paul, Bishmoy and Dabouei, Ali and Xu, Min},
  title     = {Leveraging Generative Language Models for Weakly Supervised Sentence Component Analysis in Video-Language Joint Learning},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {1975--1985},
}
RGB-D Cube R-CNN: 3D Object Detection with Selective Modality Dropout: Jens Piekenbrinck,

Alexander Hermans,

Narunas Vaskevicius,

Timm Linder,

Bastian Leibe; [pdf] [supp]
[bibtex]
@InProceedings{Piekenbrinck_2024_CVPR,
  author    = {Piekenbrinck, Jens and Hermans, Alexander and Vaskevicius, Narunas and Linder, Timm and Leibe, Bastian},
  title     = {{RGB-D} {Cube} {R-CNN}: {3D} Object Detection with Selective Modality Dropout},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {1997--2006},
}
Listen Then See: Video Alignment with Speaker Attention: Aviral Agrawal,

Carlos Mateo Samudio Lezcano,

Iqui Balam Heredia-Marin,

Prabhdeep Singh Sethi; [pdf] [arXiv]
[bibtex]
@InProceedings{Agrawal_2024_CVPR,
  author    = {Agrawal, Aviral and Lezcano, Carlos Mateo Samudio and Heredia-Marin, Iqui Balam and Sethi, Prabhdeep Singh},
  title     = {Listen Then See: Video Alignment with Speaker Attention},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {2018--2027},
}
Multimodal Understanding of Memes with Fair Explanations: Yang Zhong,

Bhiman Kumar Baghel; [pdf]
[bibtex]
@InProceedings{Zhong_2024_CVPR,
  author    = {Zhong, Yang and Baghel, Bhiman Kumar},
  title     = {Multimodal Understanding of Memes with Fair Explanations},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {2007--2017},
}