Multimodal Learning and Applications


Learning To Ask Informative Sub-Questions for Visual Question Answering
Kohei Uehara,
Nan Duan,
Tatsuya Harada
[pdf]
[bibtex]
@InProceedings{Uehara_2022_CVPR,
    author    = {Uehara, Kohei and Duan, Nan and Harada, Tatsuya},
    title     = {Learning To Ask Informative Sub-Questions for Visual Question Answering},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
    month     = jun,
    year      = {2022},
    pages     = {4681--4690}
}

Modulating Bottom-Up and Top-Down Visual Processing via Language-Conditional Filters
Ilker Kesen,
Ozan Arkan Can,
Erkut Erdem,
Aykut Erdem,
Deniz Yüret
[pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Kesen_2022_CVPR,
    author    = {Kesen, Ilker and Can, Ozan Arkan and Erdem, Erkut and Erdem, Aykut and Y{\"u}ret, Deniz},
    title     = {Modulating Bottom-Up and Top-Down Visual Processing via Language-Conditional Filters},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
    month     = jun,
    year      = {2022},
    pages     = {4610--4620}
}

Reasoning With Multi-Structure Commonsense Knowledge in Visual Dialog
Shunyu Zhang,
Xiaoze Jiang,
Zequn Yang,
Tao Wan,
Zengchang Qin
[pdf] [arXiv]
[bibtex]
@InProceedings{Zhang_2022_CVPR,
    author    = {Zhang, Shunyu and Jiang, Xiaoze and Yang, Zequn and Wan, Tao and Qin, Zengchang},
    title     = {Reasoning With Multi-Structure Commonsense Knowledge in Visual Dialog},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
    month     = jun,
    year      = {2022},
    pages     = {4600--4609}
}

Emphasizing Complementary Samples for Non-Literal Cross-Modal Retrieval
Christopher Thomas,
Adriana Kovashka
[pdf] [supp]
[bibtex]
@InProceedings{Thomas_2022_CVPR,
    author    = {Thomas, Christopher and Kovashka, Adriana},
    title     = {Emphasizing Complementary Samples for Non-Literal Cross-Modal Retrieval},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
    month     = jun,
    year      = {2022},
    pages     = {4632--4641}
}

Transformer Decoders With MultiModal Regularization for Cross-Modal Food Retrieval
Mustafa Shukor,
Guillaume Couairon,
Asya Grechka,
Matthieu Cord
[pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Shukor_2022_CVPR,
    author    = {Shukor, Mustafa and Couairon, Guillaume and Grechka, Asya and Cord, Matthieu},
    title     = {Transformer Decoders With MultiModal Regularization for Cross-Modal Food Retrieval},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
    month     = jun,
    year      = {2022},
    pages     = {4567--4578}
}

Doubling Down: Sparse Grounding With an Additional, Almost-Matching Caption for Detection-Oriented Multimodal Pretraining
Giacomo Nebbia,
Adriana Kovashka
[pdf]
[bibtex]
@InProceedings{Nebbia_2022_CVPR,
    author    = {Nebbia, Giacomo and Kovashka, Adriana},
    title     = {Doubling Down: Sparse Grounding With an Additional, Almost-Matching Caption for Detection-Oriented Multimodal Pretraining},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
    month     = jun,
    year      = {2022},
    pages     = {4642--4651}
}

Improving Multimodal Speech Recognition by Data Augmentation and Speech Representations
Dan Oneață,
Horia Cucu
[pdf]
[bibtex]
@InProceedings{Oneata_2022_CVPR,
    author    = {Onea{\textcommabelow{t}}{\u{a}}, Dan and Cucu, Horia},
    title     = {Improving Multimodal Speech Recognition by Data Augmentation and Speech Representations},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
    month     = jun,
    year      = {2022},
    pages     = {4579--4588}
}

The Unreasonable Effectiveness of CLIP Features for Image Captioning: An Experimental Analysis
Manuele Barraco,
Marcella Cornia,
Silvia Cascianelli,
Lorenzo Baraldi,
Rita Cucchiara
[pdf]
[bibtex]
@InProceedings{Barraco_2022_CVPR,
    author    = {Barraco, Manuele and Cornia, Marcella and Cascianelli, Silvia and Baraldi, Lorenzo and Cucchiara, Rita},
    title     = {The Unreasonable Effectiveness of {CLIP} Features for Image Captioning: An Experimental Analysis},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
    month     = jun,
    year      = {2022},
    pages     = {4662--4670}
}

M2FNet: Multi-Modal Fusion Network for Emotion Recognition in Conversation
Vishal Chudasama,
Purbayan Kar,
Ashish Gudmalwar,
Nirmesh Shah,
Pankaj Wasnik,
Naoyuki Onoe
[pdf] [supp]
[bibtex]
@InProceedings{Chudasama_2022_CVPR,
    author    = {Chudasama, Vishal and Kar, Purbayan and Gudmalwar, Ashish and Shah, Nirmesh and Wasnik, Pankaj and Onoe, Naoyuki},
    title     = {{M2FNet}: Multi-Modal Fusion Network for Emotion Recognition in Conversation},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
    month     = jun,
    year      = {2022},
    pages     = {4652--4661}
}

Semantically Grounded Visual Embeddings for Zero-Shot Learning
Shah Nawaz,
Jacopo Cavazza,
Alessio Del Bue
[pdf] [arXiv]
[bibtex]
@InProceedings{Nawaz_2022_CVPR,
    author    = {Nawaz, Shah and Cavazza, Jacopo and Del Bue, Alessio},
    title     = {Semantically Grounded Visual Embeddings for Zero-Shot Learning},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
    month     = jun,
    year      = {2022},
    pages     = {4589--4599}
}

Coarse-To-Fine Reasoning for Visual Question Answering
Binh X. Nguyen,
Tuong Do,
Huy Tran,
Erman Tjiputra,
Quang D. Tran,
Anh Nguyen
[pdf] [arXiv]
[bibtex]
@InProceedings{Nguyen_2022_CVPR,
    author    = {Nguyen, Binh X. and Do, Tuong and Tran, Huy and Tjiputra, Erman and Tran, Quang D. and Nguyen, Anh},
    title     = {Coarse-To-Fine Reasoning for Visual Question Answering},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
    month     = jun,
    year      = {2022},
    pages     = {4558--4566}
}

Cascaded Siamese Self-Supervised Audio to Video GAN
Nuha Aldausari,
Arcot Sowmya,
Nadine Marcus,
Gelareh Mohammadi
[pdf]
[bibtex]
@InProceedings{Aldausari_2022_CVPR,
    author    = {Aldausari, Nuha and Sowmya, Arcot and Marcus, Nadine and Mohammadi, Gelareh},
    title     = {Cascaded {Siamese} Self-Supervised Audio to Video {GAN}},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
    month     = jun,
    year      = {2022},
    pages     = {4691--4700}
}

Probabilistic Compositional Embeddings for Multimodal Image Retrieval
Andrei Neculai,
Yanbei Chen,
Zeynep Akata
[pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Neculai_2022_CVPR,
    author    = {Neculai, Andrei and Chen, Yanbei and Akata, Zeynep},
    title     = {Probabilistic Compositional Embeddings for Multimodal Image Retrieval},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
    month     = jun,
    year      = {2022},
    pages     = {4547--4557}
}

Guiding Attention Using Partial-Order Relationships for Image Captioning
Murad Popattia,
Muhammad Rafi,
Rizwan Qureshi,
Shah Nawaz
[pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Popattia_2022_CVPR,
    author    = {Popattia, Murad and Rafi, Muhammad and Qureshi, Rizwan and Nawaz, Shah},
    title     = {Guiding Attention Using Partial-Order Relationships for Image Captioning},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
    month     = jun,
    year      = {2022},
    pages     = {4671--4680}
}

Coupling Vision and Proprioception for Navigation of Legged Robots
Zipeng Fu,
Ashish Kumar,
Ananye Agarwal,
Haozhi Qi,
Jitendra Malik,
Deepak Pathak
[pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Fu_2022_CVPR,
    author    = {Fu, Zipeng and Kumar, Ashish and Agarwal, Ananye and Qi, Haozhi and Malik, Jitendra and Pathak, Deepak},
    title     = {Coupling Vision and Proprioception for Navigation of Legged Robots},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
    month     = jun,
    year      = {2022},
    pages     = {4621--4631}
}

Multi-View Multi-Label Canonical Correlation Analysis for Cross-Modal Matching and Retrieval
Rushil Sanghavi,
Yashaswi Verma
[pdf]
[bibtex]
@InProceedings{Sanghavi_2022_CVPR,
    author    = {Sanghavi, Rushil and Verma, Yashaswi},
    title     = {Multi-View Multi-Label Canonical Correlation Analysis for Cross-Modal Matching and Retrieval},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
    month     = jun,
    year      = {2022},
    pages     = {4701--4710}
}