CVPR 2026 Open Access Repository

GRAIL-V: Grounded Retrieval & Agentic Intelligence for Vision-Language

BRIDGE: Multimodal-to-Text Retrieval via Reinforcement-Learned Query Alignment: Mohamed Darwish Mounis,

Mohamed Mahmoud,

Shaimaa Sedek,

Mahmoud Abdalla,

Mahmoud SalahEldin Kasem,

Abdelrahman Abdallah,

Hyun-Soo Kang; [pdf] [arXiv]
[bibtex]
@InProceedings{Mounis_2026_CVPR,
  author    = {Mounis, Mohamed Darwish and Mahmoud, Mohamed and Sedek, Shaimaa and Abdalla, Mahmoud and Kasem, Mahmoud SalahEldin and Abdallah, Abdelrahman and Kang, Hyun-Soo},
  title     = {{BRIDGE}: Multimodal-to-Text Retrieval via Reinforcement-Learned Query Alignment},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {11311--11320},
}
CALIBRA: Calibration-Aware Multi-Agent Verification for Contactless Physiological Monitoring: Shadman Sakib,

Gaurav Shinde,

Nirmalya Roy; [pdf]
[bibtex]
@InProceedings{Sakib_2026_CVPR,
  author    = {Sakib, Shadman and Shinde, Gaurav and Roy, Nirmalya},
  title     = {{CALIBRA}: Calibration-Aware Multi-Agent Verification for Contactless Physiological Monitoring},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {11372--11382},
}
Learning to Mix Flat and Curved Representations for Vision-Language Retrieval: Sarthak Srivastava,

Kathy Wu; [pdf]
[bibtex]
@InProceedings{Srivastava_2026_CVPR,
  author    = {Srivastava, Sarthak and Wu, Kathy},
  title     = {Learning to Mix Flat and Curved Representations for Vision-Language Retrieval},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {11383--11387},
}
CMAG: Concept-Scaffolded Retrieval for Marketplace Avatar Generation: Rajeev Goel,

Jason Ding,

Phani Harish Wajjala,

Pavan Turaga,

Tejaswi Gowda,

Krishna C. Garikipati; [pdf] [arXiv]
[bibtex]
@InProceedings{Goel_2026_CVPR,
  author    = {Goel, Rajeev and Ding, Jason and Wajjala, Phani Harish and Turaga, Pavan and Gowda, Tejaswi and Garikipati, Krishna C.},
  title     = {{CMAG}: Concept-Scaffolded Retrieval for Marketplace Avatar Generation},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {11211--11219},
}
ViSS-R1: Self-Supervised Reinforcement Video Reasoning: Bo Fang,

YuXin Song,

Haoyuan Sun,

Xinyao Zhang,

Qiangqiang Wu,

Wenhao Wu,

Antoni B. Chan; [pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Fang_2026_CVPR,
  author    = {Fang, Bo and Song, YuXin and Sun, Haoyuan and Zhang, Xinyao and Wu, Qiangqiang and Wu, Wenhao and Chan, Antoni B.},
  title     = {{ViSS-R1}: Self-Supervised Reinforcement Video Reasoning},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {11190--11200},
}
CoCoA-DVC: Consistency and Concept Aware Training for Dense Video Captioning: Jay Paranjape,

Yue Guo,

Sankar Venkataraman,

Vishal M. Patel,

Nataraj Jammalamadaka; [pdf] [supp]
[bibtex]
@InProceedings{Paranjape_2026_CVPR,
  author    = {Paranjape, Jay and Guo, Yue and Venkataraman, Sankar and Patel, Vishal M. and Jammalamadaka, Nataraj},
  title     = {{CoCoA-DVC}: Consistency and Concept Aware Training for Dense Video Captioning},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {11343--11352},
}
SlotVTG: Object-Centric Adapter for Generalizable Video Temporal Grounding: Jiwook Han,

Geo Ahn,

Youngrae Kim,

Jinwoo Choi; [pdf] [arXiv]
[bibtex]
@InProceedings{Han_2026_CVPR,
  author    = {Han, Jiwook and Ahn, Geo and Kim, Youngrae and Choi, Jinwoo},
  title     = {{SlotVTG}: Object-Centric Adapter for Generalizable Video Temporal Grounding},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {11230--11240},
}
A Sanity Check on Composed Image Retrieval: Yikun Liu,

Jiangchao Yao,

Weidi Xie,

Yanfeng Wang; [pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Liu_2026_CVPR,
  author    = {Liu, Yikun and Yao, Jiangchao and Xie, Weidi and Wang, Yanfeng},
  title     = {A Sanity Check on Composed Image Retrieval},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {11292--11301},
}
Towards Robust Zero-Shot Video Temporal Grounding: Nutthadech Banditakkarakul,

Bo Chen,

Stephen Gould; [pdf]
[bibtex]
@InProceedings{Banditakkarakul_2026_CVPR,
  author    = {Banditakkarakul, Nutthadech and Chen, Bo and Gould, Stephen},
  title     = {Towards Robust Zero-Shot Video Temporal Grounding},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {11143--11152},
}
Gaze-Regularized Vision-Language-Action Models for Robotic Manipulation: Anupam Pani,

Yanchao Yang; [pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Pani_2026_CVPR,
  author    = {Pani, Anupam and Yang, Yanchao},
  title     = {Gaze-Regularized Vision-Language-Action Models for Robotic Manipulation},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {11332--11342},
}
RAGENT: Robust Optimization for Grounded Vision-Language Retrieval: Sarthak Srivastava,

Kathy Wu; [pdf]
[bibtex]
@Comment{Key disambiguated: Srivastava_2026_CVPR is already taken by an earlier entry in this listing.}
@InProceedings{Srivastava_2026_CVPR_RAGENT,
  author    = {Srivastava, Sarthak and Wu, Kathy},
  title     = {{RAGENT}: Robust Optimization for Grounded Vision-Language Retrieval},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {11388--11398},
}
Investigating VLM Hallucination from a Cognitive Psychology Perspective: A First Step Toward Interpretation with Intriguing Observations: Xiangrui Liu,

Man Luo,

Agneet Chatterjee,

Hua Wei,

Chitta Baral,

Yezhou Yang; [pdf] [supp] [arXiv]
[bibtex]
@Comment{Key disambiguated: Liu_2026_CVPR is already taken by an earlier entry in this listing.}
@InProceedings{Liu_2026_CVPR_Hallucination,
  author    = {Liu, Xiangrui and Luo, Man and Chatterjee, Agneet and Wei, Hua and Baral, Chitta and Yang, Yezhou},
  title     = {Investigating {VLM} Hallucination from a Cognitive Psychology Perspective: A First Step Toward Interpretation with Intriguing Observations},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {11274--11283},
}
EFSA: Episodic Few-Shot Adaptation for Text-to-Image Retrieval: Muhammad Huzaifa,

Yova Kementchedjhieva; [pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Huzaifa_2026_CVPR,
  author    = {Huzaifa, Muhammad and Kementchedjhieva, Yova},
  title     = {{EFSA}: Episodic Few-Shot Adaptation for Text-to-Image Retrieval},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {11241--11250},
}
Negation Matters: Training-Free Negation-Aware Image Retrieval: Aashish Pokhrel,

Shivanand Venkanna Sheshappanavar; [pdf]
[bibtex]
@InProceedings{Pokhrel_2026_CVPR,
  author    = {Pokhrel, Aashish and Sheshappanavar, Shivanand Venkanna},
  title     = {Negation Matters: Training-Free Negation-Aware Image Retrieval},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {11353--11362},
}
Neural-Symbolic Intention Refinement with User Feedback for Text-to-Image Retrieval: Yu Bai,

Lei Zhang,

Xiaoyan Hu,

Feng Zhu,

Rui Zhao; [pdf]
[bibtex]
@InProceedings{Bai_2026_CVPR,
  author    = {Bai, Yu and Zhang, Lei and Hu, Xiaoyan and Zhu, Feng and Zhao, Rui},
  title     = {Neural-Symbolic Intention Refinement with User Feedback for Text-to-Image Retrieval},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {11133--11142},
}
EVICT: Evidence-Sufficiency Verification via Counterfactual Dropout for Visually-Grounded Selective Question Answering: Varun Kotte; [pdf]
[bibtex]
@InProceedings{Kotte_2026_CVPR,
  author    = {Kotte, Varun},
  title     = {{EVICT}: Evidence-Sufficiency Verification via Counterfactual Dropout for Visually-Grounded Selective Question Answering},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {11251--11259},
}
LLM-Guided Agentic Floor Plan Parsing for Accessible Indoor Navigation of Blind and Low-Vision People: Aydin Ayanzadeh,

Tim Oates; [pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Ayanzadeh_2026_CVPR,
  author    = {Ayanzadeh, Aydin and Oates, Tim},
  title     = {{LLM}-Guided Agentic Floor Plan Parsing for Accessible Indoor Navigation of Blind and Low-Vision People},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {11124--11132},
}
DualProc: Dual-Process Prompting Reduces Confident Errors in Vision-Language Models for Grounded Retrieval and Agentic Pipelines: Aayam Bansal,

Ishaan Gangwani; [pdf]
[bibtex]
@InProceedings{Bansal_2026_CVPR,
  author    = {Bansal, Aayam and Gangwani, Ishaan},
  title     = {{DualProc}: Dual-Process Prompting Reduces Confident Errors in Vision-Language Models for Grounded Retrieval and Agentic Pipelines},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {11153--11160},
}
ChatUMM: Robust Context Tracking for Conversational Interleaved Generation: Wenxun Dai,

Zhiyuan Zhao,

Yule Zhong,

Yiji Cheng,

Jianwei Zhang,

Linqing Wang,

Shiyi Zhang,

Yunlong Lin,

Runze He,

Fellix Song,

Wayne Zhuang,

Yong Liu,

Haoji Zhang,

Yansong Tang,

Chunyu Wang; [pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Dai_2026_CVPR,
  author    = {Dai, Wenxun and Zhao, Zhiyuan and Zhong, Yule and Cheng, Yiji and Zhang, Jianwei and Wang, Linqing and Zhang, Shiyi and Lin, Yunlong and He, Runze and Song, Fellix and Zhuang, Wayne and Liu, Yong and Zhang, Haoji and Tang, Yansong and Wang, Chunyu},
  title     = {{ChatUMM}: Robust Context Tracking for Conversational Interleaved Generation},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {11180--11189},
}
CropVLM: Learning to Zoom for Fine-Grained Vision-Language Perception: Miguel Carvalho,

Hélder Dias,

Bruno Martins; [pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Carvalho_2026_CVPR,
  author    = {Carvalho, Miguel and Dias, H{\'e}lder and Martins, Bruno},
  title     = {{CropVLM}: Learning to Zoom for Fine-Grained Vision-Language Perception},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {11170--11179},
}
HTEF: Holistic Brand-Theme Alignment Scoring as a Catalog Gate for Grounded Conversational Recommendation: Mahmudur Rahman,

Dhruv Garg,

Rishabh Rathod,

Sanket Bindle; [pdf]
[bibtex]
@InProceedings{Rahman_2026_CVPR,
  author    = {Rahman, Mahmudur and Garg, Dhruv and Rathod, Rishabh and Bindle, Sanket},
  title     = {{HTEF}: Holistic Brand-Theme Alignment Scoring as a Catalog Gate for Grounded Conversational Recommendation},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {11363--11371},
}
SHOE - Semantic HOI Open-vocabulary Evaluation metric: Maja Noack,

Qinqian Lei,

Taipeng Tian,

Bihan Dong,

Robby T. Tan,

Yixin Chen,

John Young,

Saijun Zhang,

Bo Wang; [pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Noack_2026_CVPR,
  author    = {Noack, Maja and Lei, Qinqian and Tian, Taipeng and Dong, Bihan and Tan, Robby T. and Chen, Yixin and Young, John and Zhang, Saijun and Wang, Bo},
  title     = {{SHOE} - Semantic {HOI} Open-vocabulary Evaluation metric},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {11321--11331},
}
Emotional Vocabulary as Semantic Grounding: How Language Register Affects Diffusion Efficiency in Image-to-Video Generation: Scott Boudreaux; [pdf] [supp]
[bibtex]
@InProceedings{Boudreaux_2026_CVPR,
  author    = {Boudreaux, Scott},
  title     = {Emotional Vocabulary as Semantic Grounding: How Language Register Affects Diffusion Efficiency in Image-to-Video Generation},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {11161--11169},
}
A Multi-Agent Framework for Grounding Medical AI in Expert Clinical Knowledge beyond Textual Retrieval: Midhat Urooj,

Ayan Banerjee,

Sandeep Gupta; [pdf]
[bibtex]
@InProceedings{Urooj_2026_CVPR,
  author    = {Urooj, Midhat and Banerjee, Ayan and Gupta, Sandeep},
  title     = {A Multi-Agent Framework for Grounding Medical {AI} in Expert Clinical Knowledge beyond Textual Retrieval},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {11399--11409},
}
Seeing without Looking: Do Vision-Language Benchmarks Really Test Vision?: Zixuan Lan,

Luzhe Sun,

Matthew R. Walter,

Jiawei Zhou; [pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Lan_2026_CVPR,
  author    = {Lan, Zixuan and Sun, Luzhe and Walter, Matthew R. and Zhou, Jiawei},
  title     = {Seeing without Looking: Do Vision-Language Benchmarks Really Test Vision?},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {11260--11273},
}
RePlan-Bot: Multi-Level Replanning for Embodied Instruction Following: Xicheng Gong,

Guozheng Sun,

Peiran Xu,

Yadong Mu; [pdf] [supp]
[bibtex]
@InProceedings{Gong_2026_CVPR,
  author    = {Gong, Xicheng and Sun, Guozheng and Xu, Peiran and Mu, Yadong},
  title     = {{RePlan-Bot}: Multi-Level Replanning for Embodied Instruction Following},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {11220--11229},
}
Knowledge or Action? Automation Boundary Prediction with Intent Discovery and Knowledge Use-Case Enablement for Agentic Enterprise Support: Kumar Mayank,

Ipseeta Sahu,

Sajeetha Jaganathan; [pdf] [supp]
[bibtex]
@InProceedings{Mayank_2026_CVPR,
  author    = {Mayank, Kumar and Sahu, Ipseeta and Jaganathan, Sajeetha},
  title     = {Knowledge or Action? Automation Boundary Prediction with Intent Discovery and Knowledge Use-Case Enablement for Agentic Enterprise Support},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {11302--11310},
}
Towards Context-Aware Image Anonymization with Multi-Agent Reasoning: Robert Aufschläger,

Jakob Folz,

Gautam Savaliya,

Manjitha D Vidanalage,

Michael Heigl,

Martin Schramm; [pdf] [supp]
[bibtex]
@InProceedings{Aufschlager_2026_CVPR,
  author    = {Aufschl{\"a}ger, Robert and Folz, Jakob and Savaliya, Gautam and Vidanalage, Manjitha D and Heigl, Michael and Schramm, Martin},
  title     = {Towards Context-Aware Image Anonymization with Multi-Agent Reasoning},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {11114--11123},
}
CompAgent: An Agentic Framework for Visual Compliance Verification: Rahul Ghosh,

Baishali Chaudhury,

Hari Prasanna Das,

Meghana Ashok,

Ryan Razkenari,

Long Chen,

Sungmin Hong,

Chun-Hao Liu; [pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Ghosh_2026_CVPR,
  author    = {Ghosh, Rahul and Chaudhury, Baishali and Das, Hari Prasanna and Ashok, Meghana and Razkenari, Ryan and Chen, Long and Hong, Sungmin and Liu, Chun-Hao},
  title     = {{CompAgent}: An Agentic Framework for Visual Compliance Verification},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {11201--11210},
}
Lightweight and Production-Ready PDF Visual Element Parsing: Meizhu Liu,

Yassi Abbasi,

Matthew Rowe,

Michael Avendi,

Paul Li; [pdf] [arXiv]
[bibtex]
@Comment{Key disambiguated: Liu_2026_CVPR is already taken by an earlier entry in this listing.}
@InProceedings{Liu_2026_CVPR_PDFParsing,
  author    = {Liu, Meizhu and Abbasi, Yassi and Rowe, Matthew and Avendi, Michael and Li, Paul},
  title     = {Lightweight and Production-Ready {PDF} Visual Element Parsing},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {11284--11291},
}
HIVE: Query, Hypothesize, Verify -- A LLM Framework for Multimodal Reasoning-Intensive Retrieval: Mahmoud Abdalla,

Mahmoud SalahEldin Kasem,

Mohamed Mahmoud,

Mostafa Farouk Senussi,

Abdelrahman Abdallah,

Hyun-Soo Kang; [pdf] [arXiv]
[bibtex]
@InProceedings{Abdalla_2026_CVPR,
  author    = {Abdalla, Mahmoud and Kasem, Mahmoud SalahEldin and Mahmoud, Mohamed and Senussi, Mostafa Farouk and Abdallah, Abdelrahman and Kang, Hyun-Soo},
  title     = {{HIVE}: Query, Hypothesize, Verify -- A {LLM} Framework for Multimodal Reasoning-Intensive Retrieval},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {11104--11113},
}