CVPR 2026 Open Access Repository

The 2nd Workshop on Test-time Scaling for Computer Vision

EmbodiedEval: Evaluate Multimodal LLMs as Embodied Agents: Zhili Cheng,

Ran Li,

Jinyi Hu,

Yuge Tu,

Shiqi Dai,

Shengding Hu,

Yang Shi,

Lei Shi,

Maosong Sun; [pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Cheng_2026_CVPR,
  author    = {Cheng, Zhili and Li, Ran and Hu, Jinyi and Tu, Yuge and Dai, Shiqi and Hu, Shengding and Shi, Yang and Shi, Lei and Sun, Maosong},
  title     = {{EmbodiedEval}: Evaluate Multimodal {LLMs} as Embodied Agents},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {11420--11432},
}
SA-TTS: Stress-Aware Test-Time Scaling for Vision Models: Youla Yang; [pdf]
[bibtex]
@InProceedings{Yang_2026_CVPR,
  author    = {Yang, Youla},
  title     = {{SA-TTS}: Stress-Aware Test-Time Scaling for Vision Models},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {11480--11487},
}
MetaWorld: Skill Transfer and Composition in a Hierarchical World Model for Grounding High-Level Instructions: Yutong Shen,

Hangxu Liu,

Kailin Pei,

Ruizhe Xia,

Tongtong Feng; [pdf] [arXiv]
[bibtex]
@InProceedings{Shen_2026_CVPR,
  author    = {Shen, Yutong and Liu, Hangxu and Pei, Kailin and Xia, Ruizhe and Feng, Tongtong},
  title     = {{MetaWorld}: Skill Transfer and Composition in a Hierarchical World Model for Grounding High-Level Instructions},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {11470--11479},
}
TreeReasoner: Reinforcing Tool-Augmented Tree-of-Videos Reasoning: Hongcheng Gao,

Jingyi Tang,

Zihao Huang,

Liang Li,

Li Su,

Qingming Huang; [pdf] [supp]
[bibtex]
@InProceedings{Gao_2026_CVPR,
  author    = {Gao, Hongcheng and Tang, Jingyi and Huang, Zihao and Li, Liang and Su, Li and Huang, Qingming},
  title     = {{TreeReasoner}: Reinforcing Tool-Augmented Tree-of-Videos Reasoning},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {11457--11469},
}
Understanding the Limits of Vision Test-Time Scaling: Path Redundancy, Instance Difficulty, and Adaptive Compute: Youla Yang; [pdf]
[bibtex]
% Key carries a "_b" suffix: the unsuffixed key Yang_2026_CVPR is already used by
% the SA-TTS entry by the same author, and BibTeX rejects repeated keys.
@InProceedings{Yang_2026_CVPR_b,
  author    = {Yang, Youla},
  title     = {Understanding the Limits of Vision Test-Time Scaling: Path Redundancy, Instance Difficulty, and Adaptive Compute},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {11450--11456},
}
Rethinking Dense Optical Flow without Test-Time Scaling: Praroop Chanda,

Suryansh Kumar; [pdf] [arXiv]
[bibtex]
@InProceedings{Chanda_2026_CVPR,
  author    = {Chanda, Praroop and Kumar, Suryansh},
  title     = {Rethinking Dense Optical Flow without Test-Time Scaling},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {11410--11419},
}
Mind over Space: Can Multimodal Large Language Models Mentally Navigate?: Qihui Zhu,

Shouwei Ruan,

Xiao Yang,

Hao Jiang,

Yao Huang,

Shiji Zhao,

Hanwei Fan,

Hang Su,

Xingxing Wei; [pdf] [arXiv]
[bibtex]
@InProceedings{Zhu_2026_CVPR,
  author    = {Zhu, Qihui and Ruan, Shouwei and Yang, Xiao and Jiang, Hao and Huang, Yao and Zhao, Shiji and Fan, Hanwei and Su, Hang and Wei, Xingxing},
  title     = {Mind over Space: Can Multimodal Large Language Models Mentally Navigate?},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {11488--11497},
}
IMA & TMA: Efficient Test-Time Adaptation for VLMs via Linear Transformation in Embedding Space: Rishik Vamshi Rohith Vempati,

Eswar Venkata Sai Kadava,

Konda Reddy Mopuri; [pdf] [supp]
[bibtex]
@InProceedings{Vempati_2026_CVPR,
  author    = {Vempati, Rishik Vamshi Rohith and Kadava, Eswar Venkata Sai and Mopuri, Konda Reddy},
  title     = {{IMA} \& {TMA}: Efficient Test-Time Adaptation for {VLMs} via Linear Transformation in Embedding Space},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {11439--11449},
}
ProFuse: Efficient Open-Vocabulary 3D Gaussian Splatting with Early-Saturating Semantic Uplifting: Yen-Jen Chiou; [pdf]
[bibtex]
@InProceedings{Chiou_2026_CVPR,
  author    = {Chiou, Yen-Jen},
  title     = {{ProFuse}: Efficient Open-Vocabulary {3D} {Gaussian} Splatting with Early-Saturating Semantic Uplifting},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {11433--11438},
}