GRAIL-V: Grounded Retrieval & Agentic Intelligence for Vision-Language
BRIDGE: Multimodal-to-Text Retrieval via Reinforcement-Learned Query Alignment-
[pdf]
[arXiv]
[bibtex]@InProceedings{Mounis_2026_CVPR,
    author    = {Mounis, Mohamed Darwish and Mahmoud, Mohamed and Sedek, Shaimaa and Abdalla, Mahmoud and Kasem, Mahmoud SalahEldin and Abdallah, Abdelrahman and Kang, Hyun-Soo},
    title     = {{BRIDGE}: Multimodal-to-Text Retrieval via Reinforcement-Learned Query Alignment},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
    month     = jun,
    year      = {2026},
    pages     = {11311--11320},
}
CALIBRA: Calibration-Aware Multi-Agent Verification for Contactless Physiological Monitoring-
[pdf]
[bibtex]@InProceedings{Sakib_2026_CVPR,
    author    = {Sakib, Shadman and Shinde, Gaurav and Roy, Nirmalya},
    title     = {{CALIBRA}: Calibration-Aware Multi-Agent Verification for Contactless Physiological Monitoring},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
    month     = jun,
    year      = {2026},
    pages     = {11372--11382},
}
Learning to Mix Flat and Curved Representations for Vision-Language Retrieval-
[pdf]
[bibtex]@InProceedings{Srivastava_2026_CVPR,
    author    = {Srivastava, Sarthak and Wu, Kathy},
    title     = {Learning to Mix Flat and Curved Representations for Vision-Language Retrieval},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
    month     = jun,
    year      = {2026},
    pages     = {11383--11387},
}
CMAG: Concept-Scaffolded Retrieval for Marketplace Avatar Generation-
[pdf]
[arXiv]
[bibtex]@InProceedings{Goel_2026_CVPR,
    author    = {Goel, Rajeev and Ding, Jason and Wajjala, Phani Harish and Turaga, Pavan and Gowda, Tejaswi and Garikipati, Krishna C.},
    title     = {{CMAG}: Concept-Scaffolded Retrieval for Marketplace Avatar Generation},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
    month     = jun,
    year      = {2026},
    pages     = {11211--11219},
}
ViSS-R1: Self-Supervised Reinforcement Video Reasoning-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Fang_2026_CVPR,
    author    = {Fang, Bo and Song, YuXin and Sun, Haoyuan and Zhang, Xinyao and Wu, Qiangqiang and Wu, Wenhao and Chan, Antoni B.},
    title     = {{ViSS-R1}: Self-Supervised Reinforcement Video Reasoning},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
    month     = jun,
    year      = {2026},
    pages     = {11190--11200},
}
CoCoA-DVC: Consistency and Concept Aware Training for Dense Video Captioning-
[pdf]
[supp]
[bibtex]@InProceedings{Paranjape_2026_CVPR,
    author    = {Paranjape, Jay and Guo, Yue and Venkataraman, Sankar and Patel, Vishal M. and Jammalamadaka, Nataraj},
    title     = {{CoCoA-DVC}: Consistency and Concept Aware Training for Dense Video Captioning},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
    month     = jun,
    year      = {2026},
    pages     = {11343--11352},
}
SlotVTG: Object-Centric Adapter for Generalizable Video Temporal Grounding-
[pdf]
[arXiv]
[bibtex]@InProceedings{Han_2026_CVPR,
    author    = {Han, Jiwook and Ahn, Geo and Kim, Youngrae and Choi, Jinwoo},
    title     = {{SlotVTG}: Object-Centric Adapter for Generalizable Video Temporal Grounding},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
    month     = jun,
    year      = {2026},
    pages     = {11230--11240},
}
A Sanity Check on Composed Image Retrieval-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Liu_2026_CVPR,
    author    = {Liu, Yikun and Yao, Jiangchao and Xie, Weidi and Wang, Yanfeng},
    title     = {A Sanity Check on Composed Image Retrieval},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
    month     = jun,
    year      = {2026},
    pages     = {11292--11301},
}
Towards Robust Zero-Shot Video Temporal Grounding-
[pdf]
[bibtex]@InProceedings{Banditakkarakul_2026_CVPR,
    author    = {Banditakkarakul, Nutthadech and Chen, Bo and Gould, Stephen},
    title     = {Towards Robust Zero-Shot Video Temporal Grounding},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
    month     = jun,
    year      = {2026},
    pages     = {11143--11152},
}
Gaze-Regularized Vision-Language-Action Models for Robotic Manipulation-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Pani_2026_CVPR,
    author    = {Pani, Anupam and Yang, Yanchao},
    title     = {Gaze-Regularized Vision-Language-Action Models for Robotic Manipulation},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
    month     = jun,
    year      = {2026},
    pages     = {11332--11342},
}
RAGENT: Robust Optimization for Grounded Vision-Language Retrieval-
[pdf]
[bibtex]@InProceedings{Srivastava_2026_CVPR_b,
    author    = {Srivastava, Sarthak and Wu, Kathy},
    title     = {{RAGENT}: Robust Optimization for Grounded Vision-Language Retrieval},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
    month     = jun,
    year      = {2026},
    pages     = {11388--11398},
}
Investigating VLM Hallucination from a Cognitive Psychology Perspective: A First Step Toward Interpretation with Intriguing Observations-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Liu_2026_CVPR_b,
    author    = {Liu, Xiangrui and Luo, Man and Chatterjee, Agneet and Wei, Hua and Baral, Chitta and Yang, Yezhou},
    title     = {Investigating {VLM} Hallucination from a Cognitive Psychology Perspective: A First Step Toward Interpretation with Intriguing Observations},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
    month     = jun,
    year      = {2026},
    pages     = {11274--11283},
}
EFSA: Episodic Few-Shot Adaptation for Text-to-Image Retrieval-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Huzaifa_2026_CVPR,
    author    = {Huzaifa, Muhammad and Kementchedjhieva, Yova},
    title     = {{EFSA}: Episodic Few-Shot Adaptation for Text-to-Image Retrieval},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
    month     = jun,
    year      = {2026},
    pages     = {11241--11250},
}
Negation Matters: Training-Free Negation-Aware Image Retrieval-
[pdf]
[bibtex]@InProceedings{Pokhrel_2026_CVPR,
    author    = {Pokhrel, Aashish and Sheshappanavar, Shivanand Venkanna},
    title     = {Negation Matters: Training-Free Negation-Aware Image Retrieval},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
    month     = jun,
    year      = {2026},
    pages     = {11353--11362},
}
Neural-Symbolic Intention Refinement with User Feedback for Text-to-Image Retrieval-
[pdf]
[bibtex]@InProceedings{Bai_2026_CVPR,
    author    = {Bai, Yu and Zhang, Lei and Hu, Xiaoyan and Zhu, Feng and Zhao, Rui},
    title     = {Neural-Symbolic Intention Refinement with User Feedback for Text-to-Image Retrieval},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
    month     = jun,
    year      = {2026},
    pages     = {11133--11142},
}
EVICT: Evidence-Sufficiency Verification via Counterfactual Dropout for Visually-Grounded Selective Question Answering-
[pdf]
[bibtex]@InProceedings{Kotte_2026_CVPR,
    author    = {Kotte, Varun},
    title     = {{EVICT}: Evidence-Sufficiency Verification via Counterfactual Dropout for Visually-Grounded Selective Question Answering},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
    month     = jun,
    year      = {2026},
    pages     = {11251--11259},
}
LLM-Guided Agentic Floor Plan Parsing for Accessible Indoor Navigation of Blind and Low-Vision People-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Ayanzadeh_2026_CVPR,
    author    = {Ayanzadeh, Aydin and Oates, Tim},
    title     = {{LLM}-Guided Agentic Floor Plan Parsing for Accessible Indoor Navigation of Blind and Low-Vision People},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
    month     = jun,
    year      = {2026},
    pages     = {11124--11132},
}
DualProc: Dual-Process Prompting Reduces Confident Errors in Vision-Language Models for Grounded Retrieval and Agentic Pipelines-
[pdf]
[bibtex]@InProceedings{Bansal_2026_CVPR,
    author    = {Bansal, Aayam and Gangwani, Ishaan},
    title     = {{DualProc}: Dual-Process Prompting Reduces Confident Errors in Vision-Language Models for Grounded Retrieval and Agentic Pipelines},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
    month     = jun,
    year      = {2026},
    pages     = {11153--11160},
}
ChatUMM: Robust Context Tracking for Conversational Interleaved Generation-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Dai_2026_CVPR,
    author    = {Dai, Wenxun and Zhao, Zhiyuan and Zhong, Yule and Cheng, Yiji and Zhang, Jianwei and Wang, Linqing and Zhang, Shiyi and Lin, Yunlong and He, Runze and Song, Fellix and Zhuang, Wayne and Liu, Yong and Zhang, Haoji and Tang, Yansong and Wang, Chunyu},
    title     = {{ChatUMM}: Robust Context Tracking for Conversational Interleaved Generation},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
    month     = jun,
    year      = {2026},
    pages     = {11180--11189},
}
CropVLM: Learning to Zoom for Fine-Grained Vision-Language Perception-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Carvalho_2026_CVPR,
    author    = {Carvalho, Miguel and Dias, H{\'e}lder and Martins, Bruno},
    title     = {{CropVLM}: Learning to Zoom for Fine-Grained Vision-Language Perception},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
    month     = jun,
    year      = {2026},
    pages     = {11170--11179},
}
HTEF: Holistic Brand-Theme Alignment Scoring as a Catalog Gate for Grounded Conversational Recommendation-
[pdf]
[bibtex]@InProceedings{Rahman_2026_CVPR,
    author    = {Rahman, Mahmudur and Garg, Dhruv and Rathod, Rishabh and Bindle, Sanket},
    title     = {{HTEF}: Holistic Brand-Theme Alignment Scoring as a Catalog Gate for Grounded Conversational Recommendation},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
    month     = jun,
    year      = {2026},
    pages     = {11363--11371},
}
SHOE - Semantic HOI Open-vocabulary Evaluation metric-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Noack_2026_CVPR,
    author    = {Noack, Maja and Lei, Qinqian and Tian, Taipeng and Dong, Bihan and Tan, Robby T. and Chen, Yixin and Young, John and Zhang, Saijun and Wang, Bo},
    title     = {{SHOE} - Semantic {HOI} Open-vocabulary Evaluation metric},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
    month     = jun,
    year      = {2026},
    pages     = {11321--11331},
}
Emotional Vocabulary as Semantic Grounding: How Language Register Affects Diffusion Efficiency in Image-to-Video Generation-
[pdf]
[supp]
[bibtex]@InProceedings{Boudreaux_2026_CVPR,
    author    = {Boudreaux, Scott},
    title     = {Emotional Vocabulary as Semantic Grounding: How Language Register Affects Diffusion Efficiency in Image-to-Video Generation},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
    month     = jun,
    year      = {2026},
    pages     = {11161--11169},
}
A Multi-Agent Framework for Grounding Medical AI in Expert Clinical Knowledge beyond Textual Retrieval-
[pdf]
[bibtex]@InProceedings{Urooj_2026_CVPR,
    author    = {Urooj, Midhat and Banerjee, Ayan and Gupta, Sandeep},
    title     = {A Multi-Agent Framework for Grounding Medical {AI} in Expert Clinical Knowledge beyond Textual Retrieval},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
    month     = jun,
    year      = {2026},
    pages     = {11399--11409},
}
Seeing without Looking: Do Vision-Language Benchmarks Really Test Vision?-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Lan_2026_CVPR,
    author    = {Lan, Zixuan and Sun, Luzhe and Walter, Matthew R. and Zhou, Jiawei},
    title     = {Seeing without Looking: Do Vision-Language Benchmarks Really Test Vision?},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
    month     = jun,
    year      = {2026},
    pages     = {11260--11273},
}
RePlan-Bot: Multi-Level Replanning for Embodied Instruction Following-
[pdf]
[supp]
[bibtex]@InProceedings{Gong_2026_CVPR,
    author    = {Gong, Xicheng and Sun, Guozheng and Xu, Peiran and Mu, Yadong},
    title     = {{RePlan-Bot}: Multi-Level Replanning for Embodied Instruction Following},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
    month     = jun,
    year      = {2026},
    pages     = {11220--11229},
}
Knowledge or Action? Automation Boundary Prediction with Intent Discovery and Knowledge Use-Case Enablement for Agentic Enterprise Support-
[pdf]
[supp]
[bibtex]@InProceedings{Mayank_2026_CVPR,
    author    = {Mayank, Kumar and Sahu, Ipseeta and Jaganathan, Sajeetha},
    title     = {Knowledge or Action? Automation Boundary Prediction with Intent Discovery and Knowledge Use-Case Enablement for Agentic Enterprise Support},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
    month     = jun,
    year      = {2026},
    pages     = {11302--11310},
}
Towards Context-Aware Image Anonymization with Multi-Agent Reasoning-
[pdf]
[supp]
[bibtex]@InProceedings{Aufschlager_2026_CVPR,
    author    = {Aufschl{\"a}ger, Robert and Folz, Jakob and Savaliya, Gautam and Vidanalage, Manjitha D. and Heigl, Michael and Schramm, Martin},
    title     = {Towards Context-Aware Image Anonymization with Multi-Agent Reasoning},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
    month     = jun,
    year      = {2026},
    pages     = {11114--11123},
}
CompAgent: An Agentic Framework for Visual Compliance Verification-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Ghosh_2026_CVPR,
    author    = {Ghosh, Rahul and Chaudhury, Baishali and Das, Hari Prasanna and Ashok, Meghana and Razkenari, Ryan and Chen, Long and Hong, Sungmin and Liu, Chun-Hao},
    title     = {{CompAgent}: An Agentic Framework for Visual Compliance Verification},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
    month     = jun,
    year      = {2026},
    pages     = {11201--11210},
}
Lightweight and Production-Ready PDF Visual Element Parsing-
[pdf]
[arXiv]
[bibtex]@InProceedings{Liu_2026_CVPR_c,
    author    = {Liu, Meizhu and Abbasi, Yassi and Rowe, Matthew and Avendi, Michael and Li, Paul},
    title     = {Lightweight and Production-Ready {PDF} Visual Element Parsing},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
    month     = jun,
    year      = {2026},
    pages     = {11284--11291},
}
HIVE: Query, Hypothesize, Verify -- A LLM Framework for Multimodal Reasoning-Intensive Retrieval-
[pdf]
[arXiv]
[bibtex]@InProceedings{Abdalla_2026_CVPR,
    author    = {Abdalla, Mahmoud and Kasem, Mahmoud SalahEldin and Mahmoud, Mohamed and Senussi, Mostafa Farouk and Abdallah, Abdelrahman and Kang, Hyun-Soo},
    title     = {{HIVE}: Query, Hypothesize, Verify -- A {LLM} Framework for Multimodal Reasoning-Intensive Retrieval},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
    month     = jun,
    year      = {2026},
    pages     = {11104--11113},
}

