Multimodal Learning and Applications
Learning To Ask Informative Sub-Questions for Visual Question Answering-
[pdf]
[bibtex]@InProceedings{Uehara_2022_CVPR,
  author    = {Uehara, Kohei and Duan, Nan and Harada, Tatsuya},
  title     = {Learning To Ask Informative Sub-Questions for Visual Question Answering},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2022},
  pages     = {4681--4690},
}
Modulating Bottom-Up and Top-Down Visual Processing via Language-Conditional Filters-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Kesen_2022_CVPR,
  author    = {Kesen, Ilker and Can, Ozan Arkan and Erdem, Erkut and Erdem, Aykut and Y{\"u}ret, Deniz},
  title     = {Modulating Bottom-Up and Top-Down Visual Processing via Language-Conditional Filters},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2022},
  pages     = {4610--4620},
}
Reasoning With Multi-Structure Commonsense Knowledge in Visual Dialog-
[pdf]
[arXiv]
[bibtex]@InProceedings{Zhang_2022_CVPR,
  author    = {Zhang, Shunyu and Jiang, Xiaoze and Yang, Zequn and Wan, Tao and Qin, Zengchang},
  title     = {Reasoning With Multi-Structure Commonsense Knowledge in Visual Dialog},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2022},
  pages     = {4600--4609},
}
Emphasizing Complementary Samples for Non-Literal Cross-Modal Retrieval-
[pdf]
[supp]
[bibtex]@InProceedings{Thomas_2022_CVPR,
  author    = {Thomas, Christopher and Kovashka, Adriana},
  title     = {Emphasizing Complementary Samples for Non-Literal Cross-Modal Retrieval},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2022},
  pages     = {4632--4641},
}
Transformer Decoders With MultiModal Regularization for Cross-Modal Food Retrieval-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Shukor_2022_CVPR,
  author    = {Shukor, Mustafa and Couairon, Guillaume and Grechka, Asya and Cord, Matthieu},
  title     = {Transformer Decoders With {MultiModal} Regularization for Cross-Modal Food Retrieval},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2022},
  pages     = {4567--4578},
}
Doubling Down: Sparse Grounding With an Additional, Almost-Matching Caption for Detection-Oriented Multimodal Pretraining-
[pdf]
[bibtex]@InProceedings{Nebbia_2022_CVPR,
  author    = {Nebbia, Giacomo and Kovashka, Adriana},
  title     = {Doubling Down: Sparse Grounding With an Additional, Almost-Matching Caption for Detection-Oriented Multimodal Pretraining},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2022},
  pages     = {4642--4651},
}
Improving Multimodal Speech Recognition by Data Augmentation and Speech Representations-
[pdf]
[bibtex]@InProceedings{Oneata_2022_CVPR,
  author    = {Onea{\textcommabelow{t}}{\u{a}}, Dan and Cucu, Horia},
  title     = {Improving Multimodal Speech Recognition by Data Augmentation and Speech Representations},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2022},
  pages     = {4579--4588},
}
The Unreasonable Effectiveness of CLIP Features for Image Captioning: An Experimental Analysis-
[pdf]
[bibtex]@InProceedings{Barraco_2022_CVPR,
  author    = {Barraco, Manuele and Cornia, Marcella and Cascianelli, Silvia and Baraldi, Lorenzo and Cucchiara, Rita},
  title     = {The Unreasonable Effectiveness of {CLIP} Features for Image Captioning: An Experimental Analysis},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2022},
  pages     = {4662--4670},
}
M2FNet: Multi-Modal Fusion Network for Emotion Recognition in Conversation-
[pdf]
[supp]
[bibtex]@InProceedings{Chudasama_2022_CVPR,
  author    = {Chudasama, Vishal and Kar, Purbayan and Gudmalwar, Ashish and Shah, Nirmesh and Wasnik, Pankaj and Onoe, Naoyuki},
  title     = {{M2FNet}: Multi-Modal Fusion Network for Emotion Recognition in Conversation},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2022},
  pages     = {4652--4661},
}
Semantically Grounded Visual Embeddings for Zero-Shot Learning-
[pdf]
[arXiv]
[bibtex]@InProceedings{Nawaz_2022_CVPR,
  author    = {Nawaz, Shah and Cavazza, Jacopo and Del Bue, Alessio},
  title     = {Semantically Grounded Visual Embeddings for Zero-Shot Learning},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2022},
  pages     = {4589--4599},
}
Coarse-To-Fine Reasoning for Visual Question Answering-
[pdf]
[arXiv]
[bibtex]@InProceedings{Nguyen_2022_CVPR,
  author    = {Nguyen, Binh X. and Do, Tuong and Tran, Huy and Tjiputra, Erman and Tran, Quang D. and Nguyen, Anh},
  title     = {Coarse-To-Fine Reasoning for Visual Question Answering},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2022},
  pages     = {4558--4566},
}
Cascaded Siamese Self-Supervised Audio to Video GAN-
[pdf]
[bibtex]@InProceedings{Aldausari_2022_CVPR,
  author    = {Aldausari, Nuha and Sowmya, Arcot and Marcus, Nadine and Mohammadi, Gelareh},
  title     = {Cascaded {Siamese} Self-Supervised Audio to Video {GAN}},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2022},
  pages     = {4691--4700},
}
Probabilistic Compositional Embeddings for Multimodal Image Retrieval-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Neculai_2022_CVPR,
  author    = {Neculai, Andrei and Chen, Yanbei and Akata, Zeynep},
  title     = {Probabilistic Compositional Embeddings for Multimodal Image Retrieval},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2022},
  pages     = {4547--4557},
}
Guiding Attention Using Partial-Order Relationships for Image Captioning-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Popattia_2022_CVPR,
  author    = {Popattia, Murad and Rafi, Muhammad and Qureshi, Rizwan and Nawaz, Shah},
  title     = {Guiding Attention Using Partial-Order Relationships for Image Captioning},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2022},
  pages     = {4671--4680},
}
Coupling Vision and Proprioception for Navigation of Legged Robots-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Fu_2022_CVPR,
  author    = {Fu, Zipeng and Kumar, Ashish and Agarwal, Ananye and Qi, Haozhi and Malik, Jitendra and Pathak, Deepak},
  title     = {Coupling Vision and Proprioception for Navigation of Legged Robots},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2022},
  pages     = {4621--4631},
}
Multi-View Multi-Label Canonical Correlation Analysis for Cross-Modal Matching and Retrieval-
[pdf]
[bibtex]@InProceedings{Sanghavi_2022_CVPR,
  author    = {Sanghavi, Rushil and Verma, Yashaswi},
  title     = {Multi-View Multi-Label Canonical Correlation Analysis for Cross-Modal Matching and Retrieval},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2022},
  pages     = {4701--4710},
}