Multimodal Algorithmic Reasoning Workshop
Using Language-Aligned Gesture Embeddings for Understanding Gestures Accompanying Math Terms
[pdf]
[bibtex]@InProceedings{Maidment_2024_CVPR,
  author    = {Maidment, Tristan and Patel, Purav J and Walker, Erin and Kovashka, Adriana},
  title     = {Using Language-Aligned Gesture Embeddings for Understanding Gestures Accompanying Math Terms},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {2227--2237},
}
Task Navigator: Decomposing Complex Tasks for Multimodal Large Language Models
[pdf]
[supp]
[bibtex]@InProceedings{Ma_2024_CVPR,
  author    = {Ma, Feipeng and Zhou, Yizhou and Zhang, Yueyi and Wu, Siying and Zhang, Zheyu and He, Zilong and Rao, Fengyun and Sun, Xiaoyan},
  title     = {{Task Navigator}: Decomposing Complex Tasks for Multimodal Large Language Models},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {2248--2257},
}
What Does CLIP Know About Peeling a Banana?
[pdf]
[arXiv]
[bibtex]@InProceedings{Cuttano_2024_CVPR,
  author    = {Cuttano, Claudia and Rosi, Gabriele and Trivigno, Gabriele and Averta, Giuseppe},
  title     = {What Does {CLIP} Know About Peeling a Banana?},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {2238--2247},
}
Multi-Explainable TemporalNet: An Interpretable Multimodal Approach using Temporal Convolutional Network for User-level Depression Detection
[pdf]
[supp]
[bibtex]@InProceedings{Zafar_2024_CVPR,
  author    = {Zafar, Anas and Aftab, Danyal and Qureshi, Rizwan and Wang, Yaofeng and Yan, Hong},
  title     = {{Multi-Explainable TemporalNet}: An Interpretable Multimodal Approach using Temporal Convolutional Network for User-level Depression Detection},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {2258--2265},
}
Spatial Representations in Multimodal AI Systems
[pdf]
[bibtex]@InProceedings{Murray_2024_CVPR,
  author    = {Murray, Scott O. and Leonard, Bridget},
  title     = {Spatial Representations in Multimodal {AI} Systems},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {8256--8259},
}
ViTA: An Efficient Video-to-Text Algorithm using VLM for RAG-based Video Analysis System
[pdf]
[bibtex]@InProceedings{Arefeen_2024_CVPR,
  author    = {Arefeen, Md Adnan and Debnath, Biplob and Uddin, Md Yusuf Sarwar and Chakradhar, Srimat},
  title     = {{ViTA}: An Efficient Video-to-Text Algorithm using {VLM} for {RAG}-based Video Analysis System},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {2266--2274},
}