Closing the Loop Between Vision and Language


Visual Question Answering With Textual Representations for Images
Yusuke Hirota,
Noa Garcia,
Mayu Otani,
Chenhui Chu,
Yuta Nakashima,
Ittetsu Taniguchi,
Takao Onoye
[pdf] [supp]
[bibtex]
% Hirota et al., ICCV 2021 Workshops paper.
% `month` is the bare predefined macro and `pages` uses `--`, so the active
% bibliography style decides how the month name and the range are rendered.
@InProceedings{Hirota_2021_ICCV,
  author    = {Hirota, Yusuke and Garcia, Noa and Otani, Mayu and Chu, Chenhui and Nakashima, Yuta and Taniguchi, Ittetsu and Onoye, Takao},
  title     = {Visual Question Answering With Textual Representations for Images},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops},
  month     = oct,
  year      = {2021},
  pages     = {3154--3157},
}

Semi-Autoregressive Transformer for Image Captioning
Yuanen Zhou,
Yong Zhang,
Zhenzhen Hu,
Meng Wang
[pdf] [supp] [arXiv]
[bibtex]
% Zhou et al., ICCV 2021 Workshops paper.
% `month` is the bare predefined macro and `pages` uses `--`, so the active
% bibliography style decides how the month name and the range are rendered.
@InProceedings{Zhou_2021_ICCV,
  author    = {Zhou, Yuanen and Zhang, Yong and Hu, Zhenzhen and Wang, Meng},
  title     = {Semi-Autoregressive Transformer for Image Captioning},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops},
  month     = oct,
  year      = {2021},
  pages     = {3139--3143},
}

CIGLI: Conditional Image Generation From Language & Image
Xiaopeng Lu,
Lynnette Ng,
Jared Fernandez,
Hao Zhu
[pdf] [supp] [arXiv]
[bibtex]
% Lu et al., ICCV 2021 Workshops paper.
% The ampersand in the title is escaped (a raw `&` is a LaTeX alignment
% character and errors when typeset); the acronym is brace-protected so
% sentence-casing styles do not lowercase it. `month` is the bare predefined
% macro and `pages` uses `--` for the range.
@InProceedings{Lu_2021_ICCV,
  author    = {Lu, Xiaopeng and Ng, Lynnette and Fernandez, Jared and Zhu, Hao},
  title     = {{CIGLI}: Conditional Image Generation From Language \& Image},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops},
  month     = oct,
  year      = {2021},
  pages     = {3134--3138},
}

Language-Guided Multi-Modal Fusion for Video Action Recognition
Jenhao Hsiao,
Yikang Li,
Chiuman Ho
[pdf]
[bibtex]
% Hsiao et al., ICCV 2021 Workshops paper.
% `month` is the bare predefined macro and `pages` uses `--`, so the active
% bibliography style decides how the month name and the range are rendered.
@InProceedings{Hsiao_2021_ICCV,
  author    = {Hsiao, Jenhao and Li, Yikang and Ho, Chiuman},
  title     = {Language-Guided Multi-Modal Fusion for Video Action Recognition},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops},
  month     = oct,
  year      = {2021},
  pages     = {3158--3162},
}

Egocentric Biochemical Video-and-Language Dataset
Taichi Nishimura,
Kojiro Sakoda,
Atsushi Hashimoto,
Yoshitaka Ushiku,
Natsuko Tanaka,
Fumihito Ono,
Hirotaka Kameko,
Shinsuke Mori
[pdf]
[bibtex]
% Nishimura et al., ICCV 2021 Workshops paper.
% `month` is the bare predefined macro and `pages` uses `--`, so the active
% bibliography style decides how the month name and the range are rendered.
@InProceedings{Nishimura_2021_ICCV,
  author    = {Nishimura, Taichi and Sakoda, Kojiro and Hashimoto, Atsushi and Ushiku, Yoshitaka and Tanaka, Natsuko and Ono, Fumihito and Kameko, Hirotaka and Mori, Shinsuke},
  title     = {Egocentric Biochemical Video-and-Language Dataset},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops},
  month     = oct,
  year      = {2021},
  pages     = {3129--3133},
}

What You Say Is Not What You Do: Studying Visio-Linguistic Models for TV Series Summarization
Alison Reboud,
Raphael Troncy
[pdf]
[bibtex]
% Reboud and Troncy, ICCV 2021 Workshops paper.
% The acronym in the title is brace-protected so sentence-casing styles do not
% lowercase it. `month` is the bare predefined macro and `pages` uses `--`
% for the range.
@InProceedings{Reboud_2021_ICCV,
  author    = {Reboud, Alison and Troncy, Raphael},
  title     = {What You Say Is Not What You Do: Studying Visio-Linguistic Models for {TV} Series Summarization},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops},
  month     = oct,
  year      = {2021},
  pages     = {3149--3153},
}

Latent Variable Models for Visual Question Answering
Zixu Wang,
Yishu Miao,
Lucia Specia
[pdf] [arXiv]
[bibtex]
% Wang et al., ICCV 2021 Workshops paper.
% `month` is the bare predefined macro and `pages` uses `--`, so the active
% bibliography style decides how the month name and the range are rendered.
@InProceedings{Wang_2021_ICCV,
  author    = {Wang, Zixu and Miao, Yishu and Specia, Lucia},
  title     = {Latent Variable Models for Visual Question Answering},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops},
  month     = oct,
  year      = {2021},
  pages     = {3144--3148},
}