AI4RWC: The 2nd International Workshop on Vision Intelligence for Real-world Challenges
Event-Level Detection of Surgical Instrument Handovers in Videos with Interpretable Vision Model
[pdf]
[supp]
[bibtex]
% Workshop paper entry; month uses the jun macro and pages use a double-hyphen range.
@InProceedings{Katsarou_2026_CVPR,
  author    = {Katsarou, Katerina and Zountsas, George and Tomotaki-Dawoud, Karam and Ehrenhoefer, Alex and Chojecki, Paul and Przewozny, David and Runde, Detlef and Sauer, Igor Maximilian and Mouakher, Amira and Bosse, Sebastian},
  title     = {Event-Level Detection of Surgical Instrument Handovers in Videos with Interpretable Vision Model},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {9184--9193},
}
SinSEMI: A One-Shot Image Generation Model and Data-Efficient Evaluation Framework for Semiconductor Inspection Equipment
[pdf]
[arXiv]
[bibtex]
% Workshop paper entry; the coined name in the title is braced so styles keep its casing.
@InProceedings{Wu_2026_CVPR,
  author    = {Wu, ChunLiang and Li, Xiaochun},
  title     = {{SinSEMI}: A One-Shot Image Generation Model and Data-Efficient Evaluation Framework for Semiconductor Inspection Equipment},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {9383--9391},
}
How CT Window Configurations Affect the Grading Performance of 3D Radiology diagnostics
[pdf]
[bibtex]
% Workshop paper entry; CT and 3D are braced so sentence-casing styles do not lowercase them.
@InProceedings{Peng_2026_CVPR,
  author    = {Peng, Bo and Xu, Chao and Chen, Boyu and Shi, Daqian},
  title     = {How {CT} Window Configurations Affect the Grading Performance of {3D} Radiology diagnostics},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {9270--9279},
}
Improving Efficiency and Reliability of Computer Vision Models in Real-World Deployment with MX Quantization
[pdf]
[bibtex]
% Workshop paper entry; MX is braced so sentence-casing styles keep it uppercase.
@InProceedings{Wen_2026_CVPR,
  author    = {Wen, Jinghao and Wang, Ruixuan and Wang, Shaohuang and Jiao, Xun},
  title     = {Improving Efficiency and Reliability of Computer Vision Models in Real-World Deployment with {MX} Quantization},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {9373--9382},
}
SynthPID: P&ID digitization from Topology-Preserving Synthetic Data
[pdf]
[arXiv]
[bibtex]
% Workshop paper entry; SynthPID and P\&ID are braced to survive style recasing.
@InProceedings{Prasad_2026_CVPR,
  author    = {Prasad, Suraj and Mahapatra, Pinak},
  title     = {{SynthPID}: {P\&ID} digitization from Topology-Preserving Synthetic Data},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {9280--9286},
}
Boxes2Pixels: Learning Defect Segmentation from Noisy SAM Masks
[pdf]
[bibtex]
% Workshop paper entry; Boxes2Pixels and SAM are braced to survive style recasing.
@InProceedings{Lendering_2026_CVPR,
  author    = {Lendering, Camile and Akdag, Erkut and Bondarau, Egor},
  title     = {{Boxes2Pixels}: Learning Defect Segmentation from Noisy {SAM} Masks},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {9203--9212},
}
Towards AI Alignment for Medical Imaging: A Unified Framework for Cross-Modal Fairness, Uncertainty Quantification, and Robustness
[pdf]
[bibtex]
% Workshop paper entry; AI is braced so sentence-casing styles keep it uppercase.
@InProceedings{Patra_2026_CVPR,
  author    = {Patra, Arijit},
  title     = {Towards {AI} Alignment for Medical Imaging: A Unified Framework for Cross-Modal Fairness, Uncertainty Quantification, and Robustness},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {9261--9269},
}
Learning Adaptive Geometry for Robust Real-World Vision-Language Understanding
[pdf]
[bibtex]
% Workshop paper entry; month uses the jun macro and pages use a double-hyphen range.
@InProceedings{Srivastava_2026_CVPR,
  author    = {Srivastava, Sarthak and Wu, Kathy},
  title     = {Learning Adaptive Geometry for Robust Real-World Vision-Language Understanding},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {9357--9361},
}
Cross-Source Supervision for Bone Infection Segmentation in Dual-Modality PET-CT
[pdf]
[arXiv]
[bibtex]
% Workshop paper entry; PET-CT is braced so sentence-casing styles keep it uppercase.
@InProceedings{Yang_2026_CVPR,
  author    = {Yang, Zonglin and Diao, Xiaolei and Chen, Jishizhan and Xiaozhuang, Man and Kong, Wei and Wen, Gen and Cheng, Pengfei and Shi, Daqian},
  title     = {Cross-Source Supervision for Bone Infection Segmentation in Dual-Modality {PET-CT}},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {9411--9420},
}
Log-Gated Dual-Path Conditioning Framework for Multimodal Geological Core Image Generation
[pdf]
[bibtex]
% Workshop paper entry; month uses the jun macro and pages use a double-hyphen range.
@InProceedings{Shi_2026_CVPR,
  author    = {Shi, Daqian and John, Cedric M},
  title     = {Log-Gated Dual-Path Conditioning Framework for Multimodal Geological Core Image Generation},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {9315--9324},
}
Zero-Shot Polyp Detection in Open-World Endoscopy via A Cascaded Detector-Verifier Framework
[pdf]
[bibtex]
% Workshop paper entry; month uses the jun macro and pages use a double-hyphen range.
@InProceedings{Xu_2026_CVPR,
  author    = {Xu, Shengkai and Wang, Junqiao and Wu, Kunyu and Wan, Yimeng and Ouyang, Yuqi},
  title     = {Zero-Shot Polyp Detection in Open-World Endoscopy via A Cascaded Detector-Verifier Framework},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {9402--9410},
}
Z-IRIS: Zero-Shot Road Presence Detection in Aerial Tiles via Segment Proposals and Vision-Language Scoring
[pdf]
[bibtex]
% Workshop paper entry; Z-IRIS is braced so sentence-casing styles keep it uppercase.
@InProceedings{Horvath_2026_CVPR,
  author    = {Horvath, Janos},
  title     = {{Z-IRIS}: Zero-Shot Road Presence Detection in Aerial Tiles via Segment Proposals and Vision-Language Scoring},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {9155--9164},
}
Stop Position Ranking: Safety Benchmarking of Automated Drive Systems for Picking up Passengers
[pdf]
[bibtex]
% Workshop paper entry; the accent in the first author's given name is written as a braced special character.
@InProceedings{Margueritte_2026_CVPR,
  author    = {Margueritte, Ga{\"e}tan and Arpanantikul, Pacharapon and Kondapally, Anirudh and Runte, Benedict Magnus and Yamada, Kentaro and Yanaka, Hitomi},
  title     = {Stop Position Ranking: Safety Benchmarking of Automated Drive Systems for Picking up Passengers},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {9233--9240},
}
MOTOR-Bench: A Real-world Dataset and Multi-agent Framework for Zero-shot Human Mental State Understanding
[pdf]
[bibtex]
% Workshop paper entry; umlauts in author names are written as braced special characters.
@InProceedings{Yuan_2026_CVPR,
  author    = {Yuan, Xiaoyu and Heikkala, Niklas and T{\"o}rm{\"a}nen, Tiina and J{\"a}rvenoja, Hanna and Zhao, Guoying and Chen, Haoyu},
  title     = {{MOTOR-Bench}: A Real-world Dataset and Multi-agent Framework for Zero-shot Human Mental State Understanding},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {9431--9439},
}
VisChainBench: Benchmarking Multi-Image, Multi-Turn Visual Reasoning Beyond Language Priors for Real-World AI Challenges
[pdf]
[bibtex]
% Workshop paper entry; VisChainBench and AI are braced to survive style recasing.
@InProceedings{Lyu_2026_CVPR,
  author    = {Lyu, Wenbo and Du, Yingjun and Zhao, Jinglin and Zhen, Xiantong and Shao, Ling},
  title     = {{VisChainBench}: Benchmarking Multi-Image, Multi-Turn Visual Reasoning Beyond Language Priors for Real-World {AI} Challenges},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {9223--9232},
}
GazeVLM: A Vision-Language Model for Multi-Task Gaze Understanding
[pdf]
[bibtex]
% Workshop paper entry; GazeVLM is braced so styles keep its casing.
@InProceedings{Mathew_2026_CVPR,
  author    = {Mathew, Athul M and Hermassi, Haithem and Kadavil, Thariq and Khan, Arshad Ali},
  title     = {{GazeVLM}: A Vision-Language Model for Multi-Task Gaze Understanding},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {9241--9250},
}
Refining Image-to-3D Foundation Models via Geometric Supervision for Industrial Plant Reconstruction
[pdf]
[bibtex]
% Workshop paper entry; 3D is braced so sentence-casing styles do not lowercase the D.
@InProceedings{Lee_2026_CVPR,
  author    = {Lee, SangEun and Chae, Wonseok and Yoo, Hoyoung and Kim, Geunyong and Kim, NackWoo and Kim, Hyeonjin},
  title     = {Refining Image-to-{3D} Foundation Models via Geometric Supervision for Industrial Plant Reconstruction},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {9194--9202},
}
Opinion Mining and Dynamic Topic Modeling in Streaming Public Discourse with LLMs
[pdf]
[bibtex]
% Workshop paper entry; LLMs is braced so sentence-casing styles keep its capitals.
@InProceedings{Bo_2026_CVPR,
  author    = {Bo, Rite and Guo, Deming and Zhang, Hongda and Zhang, Zhiqi and Shi, Lida and Xu, Hao and Shi, Daqian},
  title     = {Opinion Mining and Dynamic Topic Modeling in Streaming Public Discourse with {LLMs}},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {9080--9089},
}
DRCoD: Toward Robust Continual Learning of Diffusion Models for Tire Manufacturing Prototyping
[pdf]
[supp]
[bibtex]
% Workshop paper entry; DRCoD is braced so styles keep its casing.
% NOTE(review): the author name "Kim, A young" is kept exactly as listed; confirm the intended spelling.
@InProceedings{Shin_2026_CVPR,
  author    = {Shin, Jisu and Lee, Sol and Hong, Sungrae and Kim, A young and You, Youngbin and Park, Jeongheon and Oh, Jungsoo and Yi, Mun},
  title     = {{DRCoD}: Toward Robust Continual Learning of Diffusion Models for Tire Manufacturing Prototyping},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {9335--9344},
}
From Pixels to Semantics: A Multi-Stage AI Framework for Structural Damage Detection in Satellite Imagery
[pdf]
[arXiv]
[bibtex]
% Workshop paper entry; AI is braced so sentence-casing styles keep it uppercase.
@InProceedings{Shakya_2026_CVPR,
  author    = {Shakya, Bijay and Hoier, Catherine and Ahmed, Khandaker Mamun},
  title     = {From Pixels to Semantics: A Multi-Stage {AI} Framework for Structural Damage Detection in Satellite Imagery},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {9305--9314},
}
HyperRealm: Hyperbolic Vision Language Models for Real-World Hierarchical Multimodal Understanding
[pdf]
[bibtex]
% Key carries a _b suffix: Srivastava_2026_CVPR is already taken by an earlier entry in this
% listing, and a repeated key makes BibTeX reject this entry.
@InProceedings{Srivastava_2026_CVPR_b,
  author    = {Srivastava, Sarthak and Wu, Kathy},
  title     = {{HyperRealm}: Hyperbolic Vision Language Models for Real-World Hierarchical Multimodal Understanding},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {9345--9356},
}
Omni-NegCLIP: Enhancing CLIP with Front-Layer Contrastive Fine-Tuning for Comprehensive Negation Understanding
[pdf]
[arXiv]
[bibtex]
% Key carries a _b suffix: Xu_2026_CVPR is already taken by an earlier entry in this
% listing, and a repeated key makes BibTeX reject this entry.
@InProceedings{Xu_2026_CVPR_b,
  author    = {Xu, Jingqi},
  title     = {{Omni-NegCLIP}: Enhancing {CLIP} with Front-Layer Contrastive Fine-Tuning for Comprehensive Negation Understanding},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {9392--9401},
}
RAG4Outcome: A Retrieval-Augmented Multimodal Framework for Prognostic Prediction in Chronic Osteomyelitis
[pdf]
[arXiv]
[bibtex]
% Key carries a _b suffix: Shi_2026_CVPR is already taken by an earlier entry in this
% listing, and a repeated key makes BibTeX reject this entry.
@InProceedings{Shi_2026_CVPR_b,
  author    = {Shi, Daqian and Han, Pei and Chen, Jishizhan and Wang, Yang and Diao, Xiaolei and Zheng, Xianyou and Cheng, Pengfei},
  title     = {{RAG4Outcome}: A Retrieval-Augmented Multimodal Framework for Prognostic Prediction in Chronic Osteomyelitis},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {9325--9334},
}
Ultrasound-led stratification of carpal tunnel syndrome reveals structure-function mismatch
[pdf]
[bibtex]
% Workshop paper entry; month uses the jun macro and pages use a double-hyphen range.
@InProceedings{Chen_2026_CVPR,
  author    = {Chen, Jishizhan and Shi, Daqian and Su, Jiaqi and Huang, Xiaohai and Qian, Yun},
  title     = {Ultrasound-led stratification of carpal tunnel syndrome reveals structure-function mismatch},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {9107--9112},
}
Crowdsourcing of Real-world Image Annotation via Visual Properties
[pdf]
[arXiv]
[bibtex]
% Workshop paper entry; month uses the jun macro and pages use a double-hyphen range.
@InProceedings{Diao_2026_CVPR,
  author    = {Diao, Xiaolei and Giunchiglia, Fausto},
  title     = {Crowdsourcing of Real-world Image Annotation via Visual Properties},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {9125--9134},
}
Layout-Aware Representation Learning for Open-Set ID Fraud Discovery
[pdf]
[supp]
[arXiv]
[bibtex]
% Workshop paper entry; ID is braced so sentence-casing styles keep it uppercase.
@InProceedings{Li_2026_CVPR,
  author    = {Li, Jinxing and Ren, Nicholas and Chang, Cathy and Pan, Hongkai and George, Daniel},
  title     = {Layout-Aware Representation Learning for Open-Set {ID} Fraud Discovery},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {9213--9222},
}
AOI-SSL: Self-Supervised Framework for Efficient Segmentation of Wire-bonded Semiconductors In Optical Inspection
[pdf]
[supp]
[bibtex]
% Workshop paper entry; AOI-SSL is braced so sentence-casing styles keep it uppercase.
@InProceedings{Figueira_2026_CVPR,
  author    = {Figueira, Joaqu{\'\i}n and van Gastel, Rob and D'Amicantonio, Giacomo and Liu, Zhuoran and Bucur, Ioan Gabriel and Boughorbel, Faysal and Bondarev, Egor},
  title     = {{AOI-SSL}: Self-Supervised Framework for Efficient Segmentation of Wire-bonded Semiconductors In Optical Inspection},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {9135--9144},
}
Zero-Shot Chinese Character Recognition via Global-Local Dual-Branch Alignment and Hierarchical Inference
[pdf]
[arXiv]
[bibtex]
% Workshop paper entry; the proper noun Chinese is braced so sentence-casing styles keep its capital.
@InProceedings{Cao_2026_CVPR,
  author    = {Cao, Wei and Xu, Hao and Diao, Xiaolei},
  title     = {Zero-Shot {Chinese} Character Recognition via Global-Local Dual-Branch Alignment and Hierarchical Inference},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {9090--9098},
}
Look, Reason, Defuse: Bridging Perception and Domain Knowledge for Real-World Unexploded Ordnance Identification
[pdf]
[supp]
[bibtex]
% Workshop paper entry; month uses the jun macro and pages use a double-hyphen range.
@InProceedings{Craioveanu_2026_CVPR,
  author    = {Craioveanu, Gheorghe Marian and Stamatescu, Grigore and Saukh, Olga},
  title     = {Look, Reason, Defuse: Bridging Perception and Domain Knowledge for Real-World Unexploded Ordnance Identification},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {9113--9124},
}
WildFireVQA: A Large-Scale Radiometric Thermal VQA Benchmark for Aerial Wildfire Monitoring
[pdf]
[supp]
[arXiv]
[bibtex]
% Workshop paper entry; WildFireVQA and VQA are braced to survive style recasing.
@InProceedings{Habibpour_2026_CVPR,
  author    = {Habibpour, Mobin and Talemi, Niloufar Alipour and Spodnik, John and Khoury, Camren J and Afghah, Fatemeh},
  title     = {{WildFireVQA}: A Large-Scale Radiometric Thermal {VQA} Benchmark for Aerial Wildfire Monitoring},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {9145--9154},
}
Rethinking Data Augmentation for Multi-View Underwater Species Dataset: A Data-Centric Analysis of Feature Space Study
[pdf]
[bibtex]
% Workshop paper entry; month uses the jun macro and pages use a double-hyphen range.
@InProceedings{Rahman_2026_CVPR,
  author    = {Rahman, Mushfika Sharmin and Hamerly, Greg},
  title     = {Rethinking Data Augmentation for Multi-View Underwater Species Dataset: A Data-Centric Analysis of Feature Space Study},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {9287--9293},
}
PathoFusion: Scalable Multimodal Foundations for Integrating Clinical and Preclinical Imaging Analyses in Real-World Pharmaceutical discovery workflows
[pdf]
[bibtex]
% Key carries a _b suffix: Patra_2026_CVPR is already taken by an earlier entry in this
% listing, and a repeated key makes BibTeX reject this entry.
@InProceedings{Patra_2026_CVPR_b,
  author    = {Patra, Arijit and Scordis, Phil},
  title     = {{PathoFusion}: Scalable Multimodal Foundations for Integrating Clinical and Preclinical Imaging Analyses in Real-World Pharmaceutical discovery workflows},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {9251--9260},
}
R3PM-Net: Real-time, Robust, Real-world Point Matching Network
[pdf]
[supp]
[arXiv]
[bibtex]
% Workshop paper entry; R3PM-Net is braced so styles keep its casing.
@InProceedings{Kashefbahrami_2026_CVPR,
  author    = {Kashefbahrami, Yasaman and Akdag, Erkut and Meletis, Panagiotis and Balmashnova, Evgeniya and Goswami, Dip and Bondarau, Egor},
  title     = {{R3PM-Net}: Real-time, Robust, Real-world Point Matching Network},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {9174--9183},
}
FruitEnsemble: MLLM-Guided Arbitration for Heterogeneous ensemble in Fine-Grained Fruit Recognition
[pdf]
[arXiv]
[bibtex]
% Workshop paper entry; FruitEnsemble and MLLM are braced to survive style recasing.
@InProceedings{Yu_2026_CVPR,
  author    = {Yu, Enhui and Li, Junhui and Lu, Ruitong and Li, Jialu and Zhang, Youshan},
  title     = {{FruitEnsemble}: {MLLM}-Guided Arbitration for Heterogeneous ensemble in Fine-Grained Fruit Recognition},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {9421--9430},
}
Can LLM-Generated Text Empower Surgical Vision-Language Pre-training?
[pdf]
[arXiv]
[bibtex]
% Workshop paper entry; LLM is braced so sentence-casing styles keep it uppercase.
@InProceedings{Che_2026_CVPR,
  author    = {Che, Chengan and Wang, Chao and Huang, Jiayuan and Chen, Xinyue and Garcia-Peraza-Herrera, Luis C.},
  title     = {Can {LLM}-Generated Text Empower Surgical Vision-Language Pre-training?},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {9099--9106},
}
Hyperbolic Twins: Efficient Finetuning Vision Language Model for Few-Shot Learning via Hyperbolic Geometry
[pdf]
[bibtex]
% Workshop paper entry; month uses the jun macro and pages use a double-hyphen range.
@InProceedings{See_2026_CVPR,
  author    = {See, Teng Jiek},
  title     = {Hyperbolic Twins: Efficient Finetuning Vision Language Model for Few-Shot Learning via Hyperbolic Geometry},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {9294--9304},
}
STONE: Stable Optimization in Noisy Environments for Robust Vision-Language Models
[pdf]
[bibtex]
% Key carries a _c suffix: Srivastava_2026_CVPR is already taken by earlier entries in this
% listing, and a repeated key makes BibTeX reject this entry.
@InProceedings{Srivastava_2026_CVPR_c,
  author    = {Srivastava, Sarthak and Wu, Kathy},
  title     = {{STONE}: Stable Optimization in Noisy Environments for Robust Vision-Language Models},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {9362--9372},
}
VisText-Mosquito: A Unified Multimodal Dataset for Visual Detection, Segmentation, and Textual Explanation on Mosquito Breeding Sites
[pdf]
[arXiv]
[bibtex]
% Workshop paper entry; VisText-Mosquito is braced so styles keep its casing.
% NOTE(review): "Md." and "Md" both appear in the author list as given; confirm which form is intended.
@InProceedings{Islam_2026_CVPR,
  author    = {Islam, Md. Adnanul and Sayeedi, Md. Faiyaz Abdullah and Shuvo, Md. Asaduzzaman and Bappy, Shahanur Rahman and Rahman, Muhammad Ziaur and Islam, Md Asiful and Shatabda, Swakkhar},
  title     = {{VisText-Mosquito}: A Unified Multimodal Dataset for Visual Detection, Segmentation, and Textual Explanation on Mosquito Breeding Sites},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {9165--9173},
}
HierSum: A Global and Local Attention Mechanism for Video Summarization
[pdf]
[arXiv]
[bibtex]
% Workshop paper entry; HierSum is braced so styles keep its casing.
@InProceedings{Beedu_2026_CVPR,
  author    = {Beedu, Apoorva and Essa, Irfan},
  title     = {{HierSum}: A Global and Local Attention Mechanism for Video Summarization},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {9069--9079},
}

