ICCV 2021 Open Access Repository

Closing the Loop Between Vision and Language

Visual Question Answering With Textual Representations for Images: Yusuke Hirota,

Noa Garcia,

Mayu Otani,

Chenhui Chu,

Yuta Nakashima,

Ittetsu Taniguchi,

Takao Onoye; [pdf] [supp]
[bibtex]
@InProceedings{Hirota_2021_ICCV,
  author    = {Hirota, Yusuke and Garcia, Noa and Otani, Mayu and Chu, Chenhui and Nakashima, Yuta and Taniguchi, Ittetsu and Onoye, Takao},
  title     = {Visual Question Answering With Textual Representations for Images},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops},
  month     = oct,
  year      = {2021},
  pages     = {3154--3157},
}
Semi-Autoregressive Transformer for Image Captioning: Yuanen Zhou,

Yong Zhang,

Zhenzhen Hu,

Meng Wang; [pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Zhou_2021_ICCV,
  author    = {Zhou, Yuanen and Zhang, Yong and Hu, Zhenzhen and Wang, Meng},
  title     = {Semi-Autoregressive Transformer for Image Captioning},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops},
  month     = oct,
  year      = {2021},
  pages     = {3139--3143},
}
What You Say Is Not What You Do: Studying Visio-Linguistic Models for TV Series Summarization: Alison Reboud,

Raphaël Troncy; [pdf]
[bibtex]
@InProceedings{Reboud_2021_ICCV,
  author    = {Reboud, Alison and Troncy, Rapha{\"e}l},
  title     = {What You Say Is Not What You Do: Studying Visio-Linguistic Models for {TV} Series Summarization},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops},
  month     = oct,
  year      = {2021},
  pages     = {3149--3153},
}
Egocentric Biochemical Video-and-Language Dataset: Taichi Nishimura,

Kojiro Sakoda,

Atsushi Hashimoto,

Yoshitaka Ushiku,

Natsuko Tanaka,

Fumihito Ono,

Hirotaka Kameko,

Shinsuke Mori; [pdf]
[bibtex]
@InProceedings{Nishimura_2021_ICCV,
  author    = {Nishimura, Taichi and Sakoda, Kojiro and Hashimoto, Atsushi and Ushiku, Yoshitaka and Tanaka, Natsuko and Ono, Fumihito and Kameko, Hirotaka and Mori, Shinsuke},
  title     = {Egocentric Biochemical Video-and-Language Dataset},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops},
  month     = oct,
  year      = {2021},
  pages     = {3129--3133},
}
CIGLI: Conditional Image Generation From Language & Image: Xiaopeng Lu,

Lynnette Ng,

Jared Fernandez,

Hao Zhu; [pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Lu_2021_ICCV,
  author    = {Lu, Xiaopeng and Ng, Lynnette and Fernandez, Jared and Zhu, Hao},
  title     = {{CIGLI}: Conditional Image Generation From Language \& Image},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops},
  month     = oct,
  year      = {2021},
  pages     = {3134--3138},
}
Latent Variable Models for Visual Question Answering: Zixu Wang,

Yishu Miao,

Lucia Specia; [pdf] [arXiv]
[bibtex]
@InProceedings{Wang_2021_ICCV,
  author    = {Wang, Zixu and Miao, Yishu and Specia, Lucia},
  title     = {Latent Variable Models for Visual Question Answering},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops},
  month     = oct,
  year      = {2021},
  pages     = {3144--3148},
}
Language-Guided Multi-Modal Fusion for Video Action Recognition: Jenhao Hsiao,

Yikang Li,

Chiuman Ho; [pdf]
[bibtex]
@InProceedings{Hsiao_2021_ICCV,
  author    = {Hsiao, Jenhao and Li, Yikang and Ho, Chiuman},
  title     = {Language-Guided Multi-Modal Fusion for Video Action Recognition},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops},
  month     = oct,
  year      = {2021},
  pages     = {3158--3162},
}