Closing the Loop Between Vision and Language
Visual Question Answering With Textual Representations for Images-
[pdf]
[supp]
[bibtex]@InProceedings{Hirota_2021_ICCV,
  author    = {Hirota, Yusuke and Garcia, Noa and Otani, Mayu and Chu, Chenhui and Nakashima, Yuta and Taniguchi, Ittetsu and Onoye, Takao},
  title     = {Visual Question Answering With Textual Representations for Images},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops},
  month     = oct,
  year      = {2021},
  pages     = {3154--3157}
}
Semi-Autoregressive Transformer for Image Captioning-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Zhou_2021_ICCV,
  author    = {Zhou, Yuanen and Zhang, Yong and Hu, Zhenzhen and Wang, Meng},
  title     = {Semi-Autoregressive Transformer for Image Captioning},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops},
  month     = oct,
  year      = {2021},
  pages     = {3139--3143}
}
What You Say Is Not What You Do: Studying Visio-Linguistic Models for TV Series Summarization-
[pdf]
[bibtex]@InProceedings{Reboud_2021_ICCV,
  author    = {Reboud, Alison and Troncy, Rapha{\"e}l},
  title     = {What You Say Is Not What You Do: Studying Visio-Linguistic Models for {TV} Series Summarization},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops},
  month     = oct,
  year      = {2021},
  pages     = {3149--3153}
}
Egocentric Biochemical Video-and-Language Dataset-
[pdf]
[bibtex]@InProceedings{Nishimura_2021_ICCV,
  author    = {Nishimura, Taichi and Sakoda, Kojiro and Hashimoto, Atsushi and Ushiku, Yoshitaka and Tanaka, Natsuko and Ono, Fumihito and Kameko, Hirotaka and Mori, Shinsuke},
  title     = {Egocentric Biochemical Video-and-Language Dataset},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops},
  month     = oct,
  year      = {2021},
  pages     = {3129--3133}
}
CIGLI: Conditional Image Generation From Language & Image-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Lu_2021_ICCV,
  author    = {Lu, Xiaopeng and Ng, Lynnette and Fernandez, Jared and Zhu, Hao},
  title     = {{CIGLI}: Conditional Image Generation From Language \& Image},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops},
  month     = oct,
  year      = {2021},
  pages     = {3134--3138}
}
Latent Variable Models for Visual Question Answering-
[pdf]
[arXiv]
[bibtex]@InProceedings{Wang_2021_ICCV,
  author    = {Wang, Zixu and Miao, Yishu and Specia, Lucia},
  title     = {Latent Variable Models for Visual Question Answering},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops},
  month     = oct,
  year      = {2021},
  pages     = {3144--3148}
}
Language-Guided Multi-Modal Fusion for Video Action Recognition-
[pdf]
[bibtex]@InProceedings{Hsiao_2021_ICCV,
  author    = {Hsiao, Jenhao and Li, Yikang and Ho, Chiuman},
  title     = {Language-Guided Multi-Modal Fusion for Video Action Recognition},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops},
  month     = oct,
  year      = {2021},
  pages     = {3158--3162}
}