CVPR 2021 Open Access Repository

Multimodal Learning and Applications

Radar Camera Fusion via Representation Learning in Autonomous Driving: Xu Dong,

Binnan Zhuang,

Yunxiang Mao,

Langechuan Liu; [pdf] [arXiv]
[bibtex]
@InProceedings{Dong_2021_CVPR,
  author    = {Dong, Xu and Zhuang, Binnan and Mao, Yunxiang and Liu, Langechuan},
  title     = {Radar Camera Fusion via Representation Learning in Autonomous Driving},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2021},
  pages     = {1672--1681},
}
An Improved Attention for Visual Question Answering: Tanzila Rahman,

Shih-Han Chou,

Leonid Sigal,

Giuseppe Carenini; [pdf] [arXiv]
[bibtex]
@InProceedings{Rahman_2021_CVPR,
  author    = {Rahman, Tanzila and Chou, Shih-Han and Sigal, Leonid and Carenini, Giuseppe},
  title     = {An Improved Attention for Visual Question Answering},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2021},
  pages     = {1653--1662},
}
Private-Shared Disentangled Multimodal VAE for Learning of Latent Representations: Mihee Lee,

Vladimir Pavlovic; [pdf] [supp]
[bibtex]
@InProceedings{Lee_2021_CVPR,
  author    = {Lee, Mihee and Pavlovic, Vladimir},
  title     = {Private-Shared Disentangled Multimodal {VAE} for Learning of Latent Representations},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2021},
  pages     = {1692--1700},
}
Dealing With Missing Modalities in the Visual Question Answer-Difference Prediction Task Through Knowledge Distillation: Jae Won Cho,

Dong-Jin Kim,

Jinsoo Choi,

Yunjae Jung,

In So Kweon; [pdf] [arXiv]
[bibtex]
@InProceedings{Cho_2021_CVPR,
  author    = {Cho, Jae Won and Kim, Dong-Jin and Choi, Jinsoo and Jung, Yunjae and Kweon, In So},
  title     = {Dealing With Missing Modalities in the Visual Question Answer-Difference Prediction Task Through Knowledge Distillation},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2021},
  pages     = {1592--1601},
}
Self-Supervised Feature Learning by Cross-Modality and Cross-View Correspondences: Longlong Jing,

Ling Zhang,

Yingli Tian; [pdf] [arXiv]
[bibtex]
@InProceedings{Jing_2021_CVPR,
  author    = {Jing, Longlong and Zhang, Ling and Tian, Yingli},
  title     = {Self-Supervised Feature Learning by Cross-Modality and Cross-View Correspondences},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2021},
  pages     = {1581--1591},
}
Target-Tailored Source-Transformation for Scene Graph Generation: Wentong Liao,

Cuiling Lan,

Michael Ying Yang,

Wenjun Zeng,

Bodo Rosenhahn; [pdf] [arXiv]
[bibtex]
@InProceedings{Liao_2021_CVPR,
  author    = {Liao, Wentong and Lan, Cuiling and Yang, Michael Ying and Zeng, Wenjun and Rosenhahn, Bodo},
  title     = {Target-Tailored Source-Transformation for Scene Graph Generation},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2021},
  pages     = {1663--1671},
}
Beyond VQA: Generating Multi-Word Answers and Rationales to Visual Questions: Radhika Dua,

Sai Srinivas Kancheti,

Vineeth N Balasubramanian; [pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Dua_2021_CVPR,
  author    = {Dua, Radhika and Kancheti, Sai Srinivas and Balasubramanian, Vineeth N},
  title     = {Beyond {VQA}: Generating Multi-Word Answers and Rationales to Visual Questions},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2021},
  pages     = {1623--1632},
}
Adaptive Intermediate Representations for Video Understanding: Juhana Kangaspunta,

AJ Piergiovanni,

Rico Jonschkowski,

Michael Ryoo,

Anelia Angelova; [pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Kangaspunta_2021_CVPR,
  author    = {Kangaspunta, Juhana and Piergiovanni, AJ and Jonschkowski, Rico and Ryoo, Michael and Angelova, Anelia},
  title     = {Adaptive Intermediate Representations for Video Understanding},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2021},
  pages     = {1602--1612},
}
Exploring the Limits of Zero-Shot Learning - How Low Can You Go?: Hemanth Dandu,

Karan Sharma,

Suchendra M. Bhandarkar; [pdf]
[bibtex]
@InProceedings{Dandu_2021_CVPR,
  author    = {Dandu, Hemanth and Sharma, Karan and Bhandarkar, Suchendra M.},
  title     = {Exploring the Limits of Zero-Shot Learning - How Low Can You Go?},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2021},
  pages     = {1710--1719},
}
Progressive Knowledge-Embedded Unified Perceptual Parsing for Scene Understanding: Wenbo Zheng,

Lan Yan,

Fei-Yue Wang,

Chao Gou; [pdf]
[bibtex]
@InProceedings{Zheng_2021_CVPR,
  author    = {Zheng, Wenbo and Yan, Lan and Wang, Fei-Yue and Gou, Chao},
  title     = {Progressive Knowledge-Embedded Unified Perceptual Parsing for Scene Understanding},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2021},
  pages     = {1633--1642},
}
APES: Audiovisual Person Search in Untrimmed Video: Juan Leon Alcazar,

Fabian Caba,

Long Mai,

Federico Perazzi,

Joon-Young Lee,

Pablo Arbelaez,

Bernard Ghanem; [pdf] [arXiv]
[bibtex]
@InProceedings{Alcazar_2021_CVPR,
  author    = {Alcazar, Juan Leon and Caba, Fabian and Mai, Long and Perazzi, Federico and Lee, Joon-Young and Arbelaez, Pablo and Ghanem, Bernard},
  title     = {{APES}: Audiovisual Person Search in Untrimmed Video},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2021},
  pages     = {1720--1729},
}
Practical Cross-Modal Manifold Alignment for Robotic Grounded Language Learning: Andre T. Nguyen,

Luke E. Richards,

Gaoussou Youssouf Kebe,

Edward Raff,

Kasra Darvish,

Frank Ferraro,

Cynthia Matuszek; [pdf]
[bibtex]
@InProceedings{Nguyen_2021_CVPR,
  author    = {Nguyen, Andre T. and Richards, Luke E. and Kebe, Gaoussou Youssouf and Raff, Edward and Darvish, Kasra and Ferraro, Frank and Matuszek, Cynthia},
  title     = {Practical Cross-Modal Manifold Alignment for Robotic Grounded Language Learning},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2021},
  pages     = {1613--1622},
}
Cross-Modal Speaker Verification and Recognition: A Multilingual Perspective: Shah Nawaz,

Muhammad Saad Saeed,

Pietro Morerio,

Arif Mahmood,

Ignazio Gallo,

Muhammad Haroon Yousaf,

Alessio Del Bue; [pdf] [arXiv]
[bibtex]
@InProceedings{Nawaz_2021_CVPR,
  author    = {Nawaz, Shah and Saeed, Muhammad Saad and Morerio, Pietro and Mahmood, Arif and Gallo, Ignazio and Yousaf, Muhammad Haroon and Del Bue, Alessio},
  title     = {Cross-Modal Speaker Verification and Recognition: A Multilingual Perspective},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2021},
  pages     = {1682--1691},
}
Using Text To Teach Image Retrieval: Haoyu Dong,

Ze Wang,

Qiang Qiu,

Guillermo Sapiro; [pdf] [arXiv]
[bibtex]
% Key carries a "_b" suffix: the key Dong_2021_CVPR is already taken by an
% earlier entry in this file (first author Dong, Xu), and BibTeX reports a
% repeated key as an error. The earlier entry keeps the unsuffixed key.
@InProceedings{Dong_2021_CVPR_b,
  author    = {Dong, Haoyu and Wang, Ze and Qiu, Qiang and Sapiro, Guillermo},
  title     = {Using Text To Teach Image Retrieval},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2021},
  pages     = {1643--1652},
}
Editing Like Humans: A Contextual, Multimodal Framework for Automated Video Editing: Sharath Koorathota,

Patrick Adelman,

Kelly Cotton,

Paul Sajda; [pdf] [supp]
[bibtex]
@InProceedings{Koorathota_2021_CVPR,
  author    = {Koorathota, Sharath and Adelman, Patrick and Cotton, Kelly and Sajda, Paul},
  title     = {Editing Like Humans: A Contextual, Multimodal Framework for Automated Video Editing},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2021},
  pages     = {1701--1709},
}
3D Hand Pose Estimation via Aligned Latent Space Injection and Kinematic Losses: Andreas Stergioulas,

Theocharis Chatzis,

Dimitrios Konstantinidis,

Kosmas Dimitropoulos,

Petros Daras; [pdf]
[bibtex]
@InProceedings{Stergioulas_2021_CVPR,
  author    = {Stergioulas, Andreas and Chatzis, Theocharis and Konstantinidis, Dimitrios and Dimitropoulos, Kosmas and Daras, Petros},
  title     = {{3D} Hand Pose Estimation via Aligned Latent Space Injection and Kinematic Losses},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2021},
  pages     = {1730--1739},
}