Multimodal Algorithmic Reasoning Workshop


Using Language-Aligned Gesture Embeddings for Understanding Gestures Accompanying Math Terms
Tristan Maidment,
Purav J Patel,
Erin Walker,
Adriana Kovashka
[pdf]
[bibtex]
@InProceedings{Maidment_2024_CVPR,
  author    = {Maidment, Tristan and Patel, Purav J and Walker, Erin and Kovashka, Adriana},
  title     = {Using Language-Aligned Gesture Embeddings for Understanding Gestures Accompanying Math Terms},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {2227--2237},
}

Task Navigator: Decomposing Complex Tasks for Multimodal Large Language Models
Feipeng Ma,
Yizhou Zhou,
Yueyi Zhang,
Siying Wu,
Zheyu Zhang,
Zilong He,
Fengyun Rao,
Xiaoyan Sun
[pdf] [supp]
[bibtex]
@InProceedings{Ma_2024_CVPR,
  author    = {Ma, Feipeng and Zhou, Yizhou and Zhang, Yueyi and Wu, Siying and Zhang, Zheyu and He, Zilong and Rao, Fengyun and Sun, Xiaoyan},
  title     = {{Task Navigator}: Decomposing Complex Tasks for Multimodal Large Language Models},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {2248--2257},
}

What Does CLIP Know About Peeling a Banana?
Claudia Cuttano,
Gabriele Rosi,
Gabriele Trivigno,
Giuseppe Averta
[pdf] [arXiv]
[bibtex]
@InProceedings{Cuttano_2024_CVPR,
  author    = {Cuttano, Claudia and Rosi, Gabriele and Trivigno, Gabriele and Averta, Giuseppe},
  title     = {What Does {CLIP} Know About Peeling a Banana?},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {2238--2247},
}

Multi-Explainable TemporalNet: An Interpretable Multimodal Approach using Temporal Convolutional Network for User-level Depression Detection
Anas Zafar,
Danyal Aftab,
Rizwan Qureshi,
Yaofeng Wang,
Hong Yan
[pdf] [supp]
[bibtex]
@InProceedings{Zafar_2024_CVPR,
  author    = {Zafar, Anas and Aftab, Danyal and Qureshi, Rizwan and Wang, Yaofeng and Yan, Hong},
  title     = {{Multi-Explainable TemporalNet}: An Interpretable Multimodal Approach using Temporal Convolutional Network for User-level Depression Detection},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {2258--2265},
}

ViTA: An Efficient Video-to-Text Algorithm using VLM for RAG-based Video Analysis System
Md Adnan Arefeen,
Biplob Debnath,
Md Yusuf Sarwar Uddin,
Srimat Chakradhar
[pdf]
[bibtex]
@InProceedings{Arefeen_2024_CVPR,
  author    = {Arefeen, Md Adnan and Debnath, Biplob and Uddin, Md Yusuf Sarwar and Chakradhar, Srimat},
  title     = {{ViTA}: An Efficient Video-to-Text Algorithm using {VLM} for {RAG}-based Video Analysis System},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {2266--2274},
}