7th MUltimodal Learning and Applications
InVERGe: Intelligent Visual Encoder for Bridging Modalities in Report Generation-
[pdf]
[supp]
[bibtex]@InProceedings{Deria_2024_CVPR,
    author    = {Deria, Ankan and Kumar, Komal and Chakraborty, Snehashis and Mahapatra, Dwarikanath and Roy, Sudipta},
    title     = {{InVERGe}: Intelligent Visual Encoder for Bridging Modalities in Report Generation},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
    month     = jun,
    year      = {2024},
    pages     = {2028--2038},
}
ZInD-Tell: Towards Translating Indoor Panoramas into Descriptions-
[pdf]
[supp]
[bibtex]@InProceedings{Deb_2024_CVPR,
    author    = {Deb, Tonmoay and Wang, Lichen and Bessinger, Zachary and Khosravan, Naji and Penner, Eric and Kang, Sing Bing},
    title     = {{ZInD-Tell}: Towards Translating Indoor Panoramas into Descriptions},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
    month     = jun,
    year      = {2024},
    pages     = {2050--2059},
}
Exploring the Role of Audio in Video Captioning-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Shen_2024_CVPR,
    author    = {Shen, Yuhan and Yang, Linjie and Wen, Longyin and Yu, Haichao and Elhamifar, Ehsan and Wang, Heng},
    title     = {Exploring the Role of Audio in Video Captioning},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
    month     = jun,
    year      = {2024},
    pages     = {2090--2100},
}
Multi-Modal Fusion of Event and RGB for Monocular Depth Estimation Using a Unified Transformer-based Architecture-
[pdf]
[supp]
[bibtex]@InProceedings{Devulapally_2024_CVPR,
    author    = {Devulapally, Anusha and Khan, Md Fahim Faysal and Advani, Siddharth and Narayanan, Vijaykrishnan},
    title     = {Multi-Modal Fusion of Event and {RGB} for Monocular Depth Estimation Using a Unified Transformer-based Architecture},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
    month     = jun,
    year      = {2024},
    pages     = {2081--2089},
}
LAformer: Trajectory Prediction for Autonomous Driving with Lane-Aware Scene Constraints-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Liu_2024_CVPR,
    author    = {Liu, Mengmeng and Cheng, Hao and Chen, Lin and Broszio, Hellward and Li, Jiangtao and Zhao, Runjiang and Sester, Monika and Yang, Michael Ying},
    title     = {{LAformer}: Trajectory Prediction for Autonomous Driving with Lane-Aware Scene Constraints},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
    month     = jun,
    year      = {2024},
    pages     = {2039--2049},
}
Cross-Modal Fusion and Attention Mechanism for Weakly Supervised Video Anomaly Detection-
[pdf]
[bibtex]@InProceedings{Ghadiya_2024_CVPR,
    author    = {Ghadiya, Ayush and Kar, Purbayan and Chudasama, Vishal and Wasnik, Pankaj},
    title     = {Cross-Modal Fusion and Attention Mechanism for Weakly Supervised Video Anomaly Detection},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
    month     = jun,
    year      = {2024},
    pages     = {1965--1974},
}
VMCML: Video and Music Matching via Cross-Modality Lifting-
[pdf]
[arXiv]
[bibtex]@InProceedings{Lee_2024_CVPR,
    author    = {Lee, Yi-Shan and Tseng, Wei-Cheng and Wang, Fu-En and Sun, Min},
    title     = {{VMCML}: Video and Music Matching via Cross-Modality Lifting},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
    month     = jun,
    year      = {2024},
    pages     = {2060--2069},
}
AIGeN: An Adversarial Approach for Instruction Generation in VLN-
[pdf]
[arXiv]
[bibtex]@InProceedings{Rawal_2024_CVPR,
    author    = {Rawal, Niyati and Bigazzi, Roberto and Baraldi, Lorenzo and Cucchiara, Rita},
    title     = {{AIGeN}: An Adversarial Approach for Instruction Generation in {VLN}},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
    month     = jun,
    year      = {2024},
    pages     = {2070--2080},
}
De-noised Vision-language Fusion Guided by Visual Cues for E-commerce Product Search-
[pdf]
[bibtex]@InProceedings{Hu_2024_CVPR,
    author    = {Hu, Zhizhang and Li, Shasha and Du, Ming and Dhua, Arnab and Gray, Douglas},
    title     = {De-noised Vision-language Fusion Guided by Visual Cues for E-commerce Product Search},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
    month     = jun,
    year      = {2024},
    pages     = {1986--1996},
}
Leveraging Generative Language Models for Weakly Supervised Sentence Component Analysis in Video-Language Joint Learning-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Ibn_Abdul_Hakim_2024_CVPR,
    author    = {Ibn Abdul Hakim, Zaber and Sarker, Najibul Haque and Singh, Rahul Pratap and Paul, Bishmoy and Dabouei, Ali and Xu, Min},
    title     = {Leveraging Generative Language Models for Weakly Supervised Sentence Component Analysis in Video-Language Joint Learning},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
    month     = jun,
    year      = {2024},
    pages     = {1975--1985},
}
RGB-D Cube R-CNN: 3D Object Detection with Selective Modality Dropout-
[pdf]
[supp]
[bibtex]@InProceedings{Piekenbrinck_2024_CVPR,
    author    = {Piekenbrinck, Jens and Hermans, Alexander and Vaskevicius, Narunas and Linder, Timm and Leibe, Bastian},
    title     = {{RGB-D} {Cube} {R-CNN}: {3D} Object Detection with Selective Modality Dropout},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
    month     = jun,
    year      = {2024},
    pages     = {1997--2006},
}
Listen Then See: Video Alignment with Speaker Attention-
[pdf]
[arXiv]
[bibtex]@InProceedings{Agrawal_2024_CVPR,
    author    = {Agrawal, Aviral and Lezcano, Carlos Mateo Samudio and Heredia-Marin, Iqui Balam and Sethi, Prabhdeep Singh},
    title     = {Listen Then See: Video Alignment with Speaker Attention},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
    month     = jun,
    year      = {2024},
    pages     = {2018--2027},
}
Multimodal Understanding of Memes with Fair Explanations-
[pdf]
[bibtex]@InProceedings{Zhong_2024_CVPR,
    author    = {Zhong, Yang and Baghel, Bhiman Kumar},
    title     = {Multimodal Understanding of Memes with Fair Explanations},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
    month     = jun,
    year      = {2024},
    pages     = {2007--2017},
}