CVPR 2024 Open Access Repository

Multimodal Algorithmic Reasoning Workshop

Using Language-Aligned Gesture Embeddings for Understanding Gestures Accompanying Math Terms: Tristan Maidment,

Purav J Patel,

Erin Walker,

Adriana Kovashka; [pdf]
[bibtex]
@comment{Paper listed under the Multimodal Algorithmic Reasoning Workshop, CVPR 2024 Workshops proceedings.}
@InProceedings{Maidment_2024_CVPR,
  author    = {Maidment, Tristan and Patel, Purav J and Walker, Erin and Kovashka, Adriana},
  title     = {Using Language-Aligned Gesture Embeddings for Understanding Gestures Accompanying Math Terms},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {2227--2237},
}
Task Navigator: Decomposing Complex Tasks for Multimodal Large Language Models: Feipeng Ma,

Yizhou Zhou,

Yueyi Zhang,

Siying Wu,

Zheyu Zhang,

Zilong He,

Fengyun Rao,

Xiaoyan Sun; [pdf] [supp]
[bibtex]
@comment{Paper listed under the Multimodal Algorithmic Reasoning Workshop, CVPR 2024 Workshops proceedings.}
@InProceedings{Ma_2024_CVPR,
  author    = {Ma, Feipeng and Zhou, Yizhou and Zhang, Yueyi and Wu, Siying and Zhang, Zheyu and He, Zilong and Rao, Fengyun and Sun, Xiaoyan},
  title     = {{Task Navigator}: Decomposing Complex Tasks for Multimodal Large Language Models},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {2248--2257},
}
What Does CLIP Know About Peeling a Banana?: Claudia Cuttano,

Gabriele Rosi,

Gabriele Trivigno,

Giuseppe Averta; [pdf] [arXiv]
[bibtex]
@comment{Paper listed under the Multimodal Algorithmic Reasoning Workshop, CVPR 2024 Workshops proceedings.}
@InProceedings{Cuttano_2024_CVPR,
  author    = {Cuttano, Claudia and Rosi, Gabriele and Trivigno, Gabriele and Averta, Giuseppe},
  title     = {What Does {CLIP} Know About Peeling a Banana?},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {2238--2247},
}
Multi-Explainable TemporalNet: An Interpretable Multimodal Approach using Temporal Convolutional Network for User-level Depression Detection: Anas Zafar,

Danyal Aftab,

Rizwan Qureshi,

Yaofeng Wang,

Hong Yan; [pdf] [supp]
[bibtex]
@comment{Paper listed under the Multimodal Algorithmic Reasoning Workshop, CVPR 2024 Workshops proceedings.}
@InProceedings{Zafar_2024_CVPR,
  author    = {Zafar, Anas and Aftab, Danyal and Qureshi, Rizwan and Wang, Yaofeng and Yan, Hong},
  title     = {{Multi-Explainable TemporalNet}: An Interpretable Multimodal Approach using Temporal Convolutional Network for User-level Depression Detection},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {2258--2265},
}
Spatial Representations in Multimodal AI Systems: Scott O. Murray,

Bridget Leonard; [pdf]
[bibtex]
@comment{Paper listed under the Multimodal Algorithmic Reasoning Workshop, CVPR 2024 Workshops proceedings.
NOTE(review): the page range 8256--8259 lies far outside the 2227--2274 span of the other entries in this listing; value kept as given, confirm against the published proceedings.}
@InProceedings{Murray_2024_CVPR,
  author    = {Murray, Scott O. and Leonard, Bridget},
  title     = {Spatial Representations in Multimodal {AI} Systems},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {8256--8259},
}
ViTA: An Efficient Video-to-Text Algorithm using VLM for RAG-based Video Analysis System: Md Adnan Arefeen,

Biplob Debnath,

Md Yusuf Sarwar Uddin,

Srimat Chakradhar; [pdf]
[bibtex]
@comment{Paper listed under the Multimodal Algorithmic Reasoning Workshop, CVPR 2024 Workshops proceedings.}
@InProceedings{Arefeen_2024_CVPR,
  author    = {Arefeen, Md Adnan and Debnath, Biplob and Uddin, Md Yusuf Sarwar and Chakradhar, Srimat},
  title     = {{ViTA}: An Efficient Video-to-Text Algorithm using {VLM} for {RAG}-based Video Analysis System},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {2266--2274},
}