CVPR 2022 Open Access Repository

Multimodal Learning and Applications

Learning To Ask Informative Sub-Questions for Visual Question Answering: Kohei Uehara,

Nan Duan,

Tatsuya Harada; [pdf]
[bibtex]
@InProceedings{Uehara_2022_CVPR,
  author    = {Uehara, Kohei and Duan, Nan and Harada, Tatsuya},
  title     = {Learning To Ask Informative Sub-Questions for Visual Question Answering},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2022},
  pages     = {4681--4690},
}
Modulating Bottom-Up and Top-Down Visual Processing via Language-Conditional Filters: Ilker Kesen,

Ozan Arkan Can,

Erkut Erdem,

Aykut Erdem,

Deniz Yüret; [pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Kesen_2022_CVPR,
  author    = {Kesen, Ilker and Can, Ozan Arkan and Erdem, Erkut and Erdem, Aykut and Y{\"u}ret, Deniz},
  title     = {Modulating Bottom-Up and Top-Down Visual Processing via Language-Conditional Filters},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2022},
  pages     = {4610--4620},
}
Reasoning With Multi-Structure Commonsense Knowledge in Visual Dialog: Shunyu Zhang,

Xiaoze Jiang,

Zequn Yang,

Tao Wan,

Zengchang Qin; [pdf] [arXiv]
[bibtex]
@InProceedings{Zhang_2022_CVPR,
  author    = {Zhang, Shunyu and Jiang, Xiaoze and Yang, Zequn and Wan, Tao and Qin, Zengchang},
  title     = {Reasoning With Multi-Structure Commonsense Knowledge in Visual Dialog},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2022},
  pages     = {4600--4609},
}
Emphasizing Complementary Samples for Non-Literal Cross-Modal Retrieval: Christopher Thomas,

Adriana Kovashka; [pdf] [supp]
[bibtex]
@InProceedings{Thomas_2022_CVPR,
  author    = {Thomas, Christopher and Kovashka, Adriana},
  title     = {Emphasizing Complementary Samples for Non-Literal Cross-Modal Retrieval},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2022},
  pages     = {4632--4641},
}
Transformer Decoders With MultiModal Regularization for Cross-Modal Food Retrieval: Mustafa Shukor,

Guillaume Couairon,

Asya Grechka,

Matthieu Cord; [pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Shukor_2022_CVPR,
  author    = {Shukor, Mustafa and Couairon, Guillaume and Grechka, Asya and Cord, Matthieu},
  title     = {Transformer Decoders With {MultiModal} Regularization for Cross-Modal Food Retrieval},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2022},
  pages     = {4567--4578},
}
Doubling Down: Sparse Grounding With an Additional, Almost-Matching Caption for Detection-Oriented Multimodal Pretraining: Giacomo Nebbia,

Adriana Kovashka; [pdf]
[bibtex]
@InProceedings{Nebbia_2022_CVPR,
  author    = {Nebbia, Giacomo and Kovashka, Adriana},
  title     = {Doubling Down: Sparse Grounding With an Additional, Almost-Matching Caption for Detection-Oriented Multimodal Pretraining},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2022},
  pages     = {4642--4651},
}
Improving Multimodal Speech Recognition by Data Augmentation and Speech Representations: Dan Oneață,

Horia Cucu; [pdf]
[bibtex]
@InProceedings{Oneata_2022_CVPR,
  author    = {Onea{\textcommabelow{t}}{\u{a}}, Dan and Cucu, Horia},
  title     = {Improving Multimodal Speech Recognition by Data Augmentation and Speech Representations},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2022},
  pages     = {4579--4588},
}
The Unreasonable Effectiveness of CLIP Features for Image Captioning: An Experimental Analysis: Manuele Barraco,

Marcella Cornia,

Silvia Cascianelli,

Lorenzo Baraldi,

Rita Cucchiara; [pdf]
[bibtex]
@InProceedings{Barraco_2022_CVPR,
  author    = {Barraco, Manuele and Cornia, Marcella and Cascianelli, Silvia and Baraldi, Lorenzo and Cucchiara, Rita},
  title     = {The Unreasonable Effectiveness of {CLIP} Features for Image Captioning: An Experimental Analysis},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2022},
  pages     = {4662--4670},
}
M2FNet: Multi-Modal Fusion Network for Emotion Recognition in Conversation: Vishal Chudasama,

Purbayan Kar,

Ashish Gudmalwar,

Nirmesh Shah,

Pankaj Wasnik,

Naoyuki Onoe; [pdf] [supp]
[bibtex]
@InProceedings{Chudasama_2022_CVPR,
  author    = {Chudasama, Vishal and Kar, Purbayan and Gudmalwar, Ashish and Shah, Nirmesh and Wasnik, Pankaj and Onoe, Naoyuki},
  title     = {{M2FNet}: Multi-Modal Fusion Network for Emotion Recognition in Conversation},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2022},
  pages     = {4652--4661},
}
Semantically Grounded Visual Embeddings for Zero-Shot Learning: Shah Nawaz,

Jacopo Cavazza,

Alessio Del Bue; [pdf] [arXiv]
[bibtex]
@InProceedings{Nawaz_2022_CVPR,
  author    = {Nawaz, Shah and Cavazza, Jacopo and Del Bue, Alessio},
  title     = {Semantically Grounded Visual Embeddings for Zero-Shot Learning},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2022},
  pages     = {4589--4599},
}
Coarse-To-Fine Reasoning for Visual Question Answering: Binh X. Nguyen,

Tuong Do,

Huy Tran,

Erman Tjiputra,

Quang D. Tran,

Anh Nguyen; [pdf] [arXiv]
[bibtex]
@InProceedings{Nguyen_2022_CVPR,
  author    = {Nguyen, Binh X. and Do, Tuong and Tran, Huy and Tjiputra, Erman and Tran, Quang D. and Nguyen, Anh},
  title     = {Coarse-To-Fine Reasoning for Visual Question Answering},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2022},
  pages     = {4558--4566},
}
Cascaded Siamese Self-Supervised Audio to Video GAN: Nuha Aldausari,

Arcot Sowmya,

Nadine Marcus,

Gelareh Mohammadi; [pdf]
[bibtex]
@InProceedings{Aldausari_2022_CVPR,
  author    = {Aldausari, Nuha and Sowmya, Arcot and Marcus, Nadine and Mohammadi, Gelareh},
  title     = {Cascaded {Siamese} Self-Supervised Audio to Video {GAN}},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2022},
  pages     = {4691--4700},
}
Probabilistic Compositional Embeddings for Multimodal Image Retrieval: Andrei Neculai,

Yanbei Chen,

Zeynep Akata; [pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Neculai_2022_CVPR,
  author    = {Neculai, Andrei and Chen, Yanbei and Akata, Zeynep},
  title     = {Probabilistic Compositional Embeddings for Multimodal Image Retrieval},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2022},
  pages     = {4547--4557},
}
Guiding Attention Using Partial-Order Relationships for Image Captioning: Murad Popattia,

Muhammad Rafi,

Rizwan Qureshi,

Shah Nawaz; [pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Popattia_2022_CVPR,
  author    = {Popattia, Murad and Rafi, Muhammad and Qureshi, Rizwan and Nawaz, Shah},
  title     = {Guiding Attention Using Partial-Order Relationships for Image Captioning},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2022},
  pages     = {4671--4680},
}
Coupling Vision and Proprioception for Navigation of Legged Robots: Zipeng Fu,

Ashish Kumar,

Ananye Agarwal,

Haozhi Qi,

Jitendra Malik,

Deepak Pathak; [pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Fu_2022_CVPR,
  author    = {Fu, Zipeng and Kumar, Ashish and Agarwal, Ananye and Qi, Haozhi and Malik, Jitendra and Pathak, Deepak},
  title     = {Coupling Vision and Proprioception for Navigation of Legged Robots},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2022},
  pages     = {4621--4631},
}
Multi-View Multi-Label Canonical Correlation Analysis for Cross-Modal Matching and Retrieval: Rushil Sanghavi,

Yashaswi Verma; [pdf]
[bibtex]
@InProceedings{Sanghavi_2022_CVPR,
  author    = {Sanghavi, Rushil and Verma, Yashaswi},
  title     = {Multi-View Multi-Label Canonical Correlation Analysis for Cross-Modal Matching and Retrieval},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2022},
  pages     = {4701--4710},
}