Multimodal Learning and Applications


Radar Camera Fusion via Representation Learning in Autonomous Driving
Xu Dong,
Binnan Zhuang,
Yunxiang Mao,
Langechuan Liu
[pdf] [arXiv]
[bibtex]
@InProceedings{Dong_2021_CVPR,
  author    = {Dong, Xu and Zhuang, Binnan and Mao, Yunxiang and Liu, Langechuan},
  title     = {Radar Camera Fusion via Representation Learning in Autonomous Driving},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2021},
  pages     = {1672--1681},
}

An Improved Attention for Visual Question Answering
Tanzila Rahman,
Shih-Han Chou,
Leonid Sigal,
Giuseppe Carenini
[pdf] [arXiv]
[bibtex]
@InProceedings{Rahman_2021_CVPR,
  author    = {Rahman, Tanzila and Chou, Shih-Han and Sigal, Leonid and Carenini, Giuseppe},
  title     = {An Improved Attention for Visual Question Answering},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2021},
  pages     = {1653--1662},
}

Private-Shared Disentangled Multimodal VAE for Learning of Latent Representations
Mihee Lee,
Vladimir Pavlovic
[pdf] [supp]
[bibtex]
@InProceedings{Lee_2021_CVPR,
  author    = {Lee, Mihee and Pavlovic, Vladimir},
  title     = {Private-Shared Disentangled Multimodal {VAE} for Learning of Latent Representations},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2021},
  pages     = {1692--1700},
}

Dealing With Missing Modalities in the Visual Question Answer-Difference Prediction Task Through Knowledge Distillation
Jae Won Cho,
Dong-Jin Kim,
Jinsoo Choi,
Yunjae Jung,
In So Kweon
[pdf] [arXiv]
[bibtex]
@InProceedings{Cho_2021_CVPR,
  author    = {Cho, Jae Won and Kim, Dong-Jin and Choi, Jinsoo and Jung, Yunjae and Kweon, In So},
  title     = {Dealing With Missing Modalities in the Visual Question Answer-Difference Prediction Task Through Knowledge Distillation},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2021},
  pages     = {1592--1601},
}

Self-Supervised Feature Learning by Cross-Modality and Cross-View Correspondences
Longlong Jing,
Ling Zhang,
Yingli Tian
[pdf] [arXiv]
[bibtex]
@InProceedings{Jing_2021_CVPR,
  author    = {Jing, Longlong and Zhang, Ling and Tian, Yingli},
  title     = {Self-Supervised Feature Learning by Cross-Modality and Cross-View Correspondences},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2021},
  pages     = {1581--1591},
}

Target-Tailored Source-Transformation for Scene Graph Generation
Wentong Liao,
Cuiling Lan,
Michael Ying Yang,
Wenjun Zeng,
Bodo Rosenhahn
[pdf] [arXiv]
[bibtex]
@InProceedings{Liao_2021_CVPR,
  author    = {Liao, Wentong and Lan, Cuiling and Yang, Michael Ying and Zeng, Wenjun and Rosenhahn, Bodo},
  title     = {Target-Tailored Source-Transformation for Scene Graph Generation},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2021},
  pages     = {1663--1671},
}

Beyond VQA: Generating Multi-Word Answers and Rationales to Visual Questions
Radhika Dua,
Sai Srinivas Kancheti,
Vineeth N Balasubramanian
[pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Dua_2021_CVPR,
  author    = {Dua, Radhika and Kancheti, Sai Srinivas and Balasubramanian, Vineeth N},
  title     = {Beyond {VQA}: Generating Multi-Word Answers and Rationales to Visual Questions},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2021},
  pages     = {1623--1632},
}

Adaptive Intermediate Representations for Video Understanding
Juhana Kangaspunta,
AJ Piergiovanni,
Rico Jonschkowski,
Michael Ryoo,
Anelia Angelova
[pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Kangaspunta_2021_CVPR,
  author    = {Kangaspunta, Juhana and Piergiovanni, AJ and Jonschkowski, Rico and Ryoo, Michael and Angelova, Anelia},
  title     = {Adaptive Intermediate Representations for Video Understanding},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2021},
  pages     = {1602--1612},
}

Exploring the Limits of Zero-Shot Learning - How Low Can You Go?
Hemanth Dandu,
Karan Sharma,
Suchendra M. Bhandarkar
[pdf]
[bibtex]
@InProceedings{Dandu_2021_CVPR,
  author    = {Dandu, Hemanth and Sharma, Karan and Bhandarkar, Suchendra M.},
  title     = {Exploring the Limits of Zero-Shot Learning - How Low Can You Go?},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2021},
  pages     = {1710--1719},
}

Progressive Knowledge-Embedded Unified Perceptual Parsing for Scene Understanding
Wenbo Zheng,
Lan Yan,
Fei-Yue Wang,
Chao Gou
[pdf]
[bibtex]
@InProceedings{Zheng_2021_CVPR,
  author    = {Zheng, Wenbo and Yan, Lan and Wang, Fei-Yue and Gou, Chao},
  title     = {Progressive Knowledge-Embedded Unified Perceptual Parsing for Scene Understanding},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2021},
  pages     = {1633--1642},
}

APES: Audiovisual Person Search in Untrimmed Video
Juan Leon Alcazar,
Fabian Caba,
Long Mai,
Federico Perazzi,
Joon-Young Lee,
Pablo Arbelaez,
Bernard Ghanem
[pdf] [arXiv]
[bibtex]
@InProceedings{Alcazar_2021_CVPR,
  author    = {Alcazar, Juan Leon and Caba, Fabian and Mai, Long and Perazzi, Federico and Lee, Joon-Young and Arbelaez, Pablo and Ghanem, Bernard},
  title     = {{APES}: Audiovisual Person Search in Untrimmed Video},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2021},
  pages     = {1720--1729},
}

Practical Cross-Modal Manifold Alignment for Robotic Grounded Language Learning
Andre T. Nguyen,
Luke E. Richards,
Gaoussou Youssouf Kebe,
Edward Raff,
Kasra Darvish,
Frank Ferraro,
Cynthia Matuszek
[pdf]
[bibtex]
@InProceedings{Nguyen_2021_CVPR,
  author    = {Nguyen, Andre T. and Richards, Luke E. and Kebe, Gaoussou Youssouf and Raff, Edward and Darvish, Kasra and Ferraro, Frank and Matuszek, Cynthia},
  title     = {Practical Cross-Modal Manifold Alignment for Robotic Grounded Language Learning},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2021},
  pages     = {1613--1622},
}

Cross-Modal Speaker Verification and Recognition: A Multilingual Perspective
Shah Nawaz,
Muhammad Saad Saeed,
Pietro Morerio,
Arif Mahmood,
Ignazio Gallo,
Muhammad Haroon Yousaf,
Alessio Del Bue
[pdf] [arXiv]
[bibtex]
@InProceedings{Nawaz_2021_CVPR,
  author    = {Nawaz, Shah and Saeed, Muhammad Saad and Morerio, Pietro and Mahmood, Arif and Gallo, Ignazio and Yousaf, Muhammad Haroon and Del Bue, Alessio},
  title     = {Cross-Modal Speaker Verification and Recognition: A Multilingual Perspective},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2021},
  pages     = {1682--1691},
}

Using Text To Teach Image Retrieval
Haoyu Dong,
Ze Wang,
Qiang Qiu,
Guillermo Sapiro
[pdf] [arXiv]
[bibtex]
@comment{Key carries a "_b" suffix because the unsuffixed key Dong_2021_CVPR is already used by the entry for "Radar Camera Fusion via Representation Learning in Autonomous Driving"; BibTeX rejects repeated keys.}
@InProceedings{Dong_2021_CVPR_b,
  author    = {Dong, Haoyu and Wang, Ze and Qiu, Qiang and Sapiro, Guillermo},
  title     = {Using Text To Teach Image Retrieval},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2021},
  pages     = {1643--1652},
}

Editing Like Humans: A Contextual, Multimodal Framework for Automated Video Editing
Sharath Koorathota,
Patrick Adelman,
Kelly Cotton,
Paul Sajda
[pdf] [supp]
[bibtex]
@InProceedings{Koorathota_2021_CVPR,
  author    = {Koorathota, Sharath and Adelman, Patrick and Cotton, Kelly and Sajda, Paul},
  title     = {Editing Like Humans: A Contextual, Multimodal Framework for Automated Video Editing},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2021},
  pages     = {1701--1709},
}

3D Hand Pose Estimation via Aligned Latent Space Injection and Kinematic Losses
Andreas Stergioulas,
Theocharis Chatzis,
Dimitrios Konstantinidis,
Kosmas Dimitropoulos,
Petros Daras
[pdf]
[bibtex]
@InProceedings{Stergioulas_2021_CVPR,
  author    = {Stergioulas, Andreas and Chatzis, Theocharis and Konstantinidis, Dimitrios and Dimitropoulos, Kosmas and Daras, Petros},
  title     = {{3D} Hand Pose Estimation via Aligned Latent Space Injection and Kinematic Losses},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2021},
  pages     = {1730--1739},
}