The 2nd Workshop on Test-time Scaling for Computer Vision


EmbodiedEval: Evaluate Multimodal LLMs as Embodied Agents
Zhili Cheng,
Ran Li,
Jinyi Hu,
Yuge Tu,
Shiqi Dai,
Shengding Hu,
Yang Shi,
Lei Shi,
Maosong Sun
[pdf] [supp] [arXiv]
[bibtex]
% CVPR 2026 Workshops paper. Braced title words keep their capitalization under
% sentence-casing styles; pages is an en-dash range; month is the built-in macro.
@InProceedings{Cheng_2026_CVPR,
  author    = {Cheng, Zhili and Li, Ran and Hu, Jinyi and Tu, Yuge and Dai, Shiqi and Hu, Shengding and Shi, Yang and Shi, Lei and Sun, Maosong},
  title     = {{EmbodiedEval}: Evaluate Multimodal {LLMs} as Embodied Agents},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {11420--11432},
}

SA-TTS: Stress-Aware Test-Time Scaling for Vision Models
Youla Yang
[pdf]
[bibtex]
% CVPR 2026 Workshops paper. The braced acronym keeps its capitalization under
% sentence-casing styles; pages is an en-dash range; month is the built-in macro.
@InProceedings{Yang_2026_CVPR,
  author    = {Yang, Youla},
  title     = {{SA-TTS}: Stress-Aware Test-Time Scaling for Vision Models},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {11480--11487},
}

MetaWorld: Skill Transfer and Composition in a Hierarchical World Model for Grounding High-Level Instructions
Yutong Shen,
Hangxu Liu,
Kailin Pei,
Ruizhe Xia,
Tongtong Feng
[pdf] [arXiv]
[bibtex]
% CVPR 2026 Workshops paper. The braced system name keeps its capitalization under
% sentence-casing styles; pages is an en-dash range; month is the built-in macro.
@InProceedings{Shen_2026_CVPR,
  author    = {Shen, Yutong and Liu, Hangxu and Pei, Kailin and Xia, Ruizhe and Feng, Tongtong},
  title     = {{MetaWorld}: Skill Transfer and Composition in a Hierarchical World Model for Grounding High-Level Instructions},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {11470--11479},
}

TreeReasoner: Reinforcing Tool-Augmented Tree-of-Videos Reasoning
Hongcheng Gao,
Jingyi Tang,
Zihao Huang,
Liang Li,
Li Su,
Qingming Huang
[pdf] [supp]
[bibtex]
% CVPR 2026 Workshops paper. The braced system name keeps its capitalization under
% sentence-casing styles; pages is an en-dash range; month is the built-in macro.
@InProceedings{Gao_2026_CVPR,
  author    = {Gao, Hongcheng and Tang, Jingyi and Huang, Zihao and Li, Liang and Su, Li and Huang, Qingming},
  title     = {{TreeReasoner}: Reinforcing Tool-Augmented Tree-of-Videos Reasoning},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {11457--11469},
}

Understanding the Limits of Vision Test-Time Scaling: Path Redundancy, Instance Difficulty, and Adaptive Compute
Youla Yang
[pdf]
[bibtex]
% CVPR 2026 Workshops paper. The key carries a "_b" suffix because the unsuffixed
% key is already taken by the SA-TTS entry from the same author and year; with a
% repeated key this entry would be dropped and could not be cited.
% pages is an en-dash range; month is the built-in macro.
@InProceedings{Yang_2026_CVPR_b,
  author    = {Yang, Youla},
  title     = {Understanding the Limits of Vision Test-Time Scaling: Path Redundancy, Instance Difficulty, and Adaptive Compute},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {11450--11456},
}

Rethinking Dense Optical Flow without Test-Time Scaling
Praroop Chanda,
Suryansh Kumar
[pdf] [arXiv]
[bibtex]
% CVPR 2026 Workshops paper. pages is an en-dash range; month is the built-in macro.
@InProceedings{Chanda_2026_CVPR,
  author    = {Chanda, Praroop and Kumar, Suryansh},
  title     = {Rethinking Dense Optical Flow without Test-Time Scaling},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {11410--11419},
}

Mind over Space: Can Multimodal Large Language Models Mentally Navigate?
Qihui Zhu,
Shouwei Ruan,
Xiao Yang,
Hao Jiang,
Yao Huang,
Shiji Zhao,
Hanwei Fan,
Hang Su,
Xingxing Wei
[pdf] [arXiv]
[bibtex]
% CVPR 2026 Workshops paper. pages is an en-dash range; month is the built-in macro.
@InProceedings{Zhu_2026_CVPR,
  author    = {Zhu, Qihui and Ruan, Shouwei and Yang, Xiao and Jiang, Hao and Huang, Yao and Zhao, Shiji and Fan, Hanwei and Su, Hang and Wei, Xingxing},
  title     = {Mind over Space: Can Multimodal Large Language Models Mentally Navigate?},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {11488--11497},
}

IMA & TMA: Efficient Test-Time Adaptation for VLMs via Linear Transformation in Embedding Space
Rishik Vamshi Rohith Vempati,
Eswar Venkata Sai Kadava,
Konda Reddy Mopuri
[pdf] [supp]
[bibtex]
% CVPR 2026 Workshops paper. Braced acronyms keep their capitalization under
% sentence-casing styles; pages is an en-dash range; month is the built-in macro.
@InProceedings{Vempati_2026_CVPR,
  author    = {Vempati, Rishik Vamshi Rohith and Kadava, Eswar Venkata Sai and Mopuri, Konda Reddy},
  title     = {{IMA} \& {TMA}: Efficient Test-Time Adaptation for {VLMs} via Linear Transformation in Embedding Space},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {11439--11449},
}

ProFuse: Efficient Open-Vocabulary 3D Gaussian Splatting with Early-Saturating Semantic Uplifting
Yen-Jen Chiou
[pdf]
[bibtex]
% CVPR 2026 Workshops paper. Braced words (system name, "3D", "Gaussian") keep their
% capitalization under sentence-casing styles; pages is an en-dash range; month is
% the built-in macro.
@InProceedings{Chiou_2026_CVPR,
  author    = {Chiou, Yen-Jen},
  title     = {{ProFuse}: Efficient Open-Vocabulary {3D} {Gaussian} Splatting with Early-Saturating Semantic Uplifting},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {11433--11438},
}