Pixel-level Video Understanding in the Wild Challenge


Geometry-Guided Camera Motion Understanding in VideoLLMs
Haoan Feng,
Sri Harsha Musunuri,
Guan-Ming Su
[pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Feng_2026_CVPR,
  author    = {Feng, Haoan and Musunuri, Sri Harsha and Su, Guan-Ming},
  title     = {Geometry-Guided Camera Motion Understanding in {VideoLLMs}},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {8423--8433},
}

VISTA: A Benchmark for Spatio-Temporal Interaction Understanding in Videos
Alejandro Aparcedo,
Akash Kumar,
Aaryan Garg,
Dalton Pham,
Wen-Kai Chen,
Anirudh Bharadwaj,
Aman Chadha,
Yogesh Rawat
[pdf] [supp]
[bibtex]
@InProceedings{Aparcedo_2026_CVPR,
  author    = {Aparcedo, Alejandro and Kumar, Akash and Garg, Aaryan and Pham, Dalton and Chen, Wen-Kai and Bharadwaj, Anirudh and Chadha, Aman and Rawat, Yogesh},
  title     = {{VISTA}: A Benchmark for Spatio-Temporal Interaction Understanding in Videos},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {8411--8422},
}

A Hyperbolic Perspective on Hierarchical Structure in Object-Centric Scene Representations
Neelu Madan,
Alex Pujol Vidal,
Andreas Mogelmose,
Sergio Escalera,
Kamal Nasrollahi,
Graham W. Taylor,
Thomas B. Moeslund
[pdf]
[bibtex]
@InProceedings{Madan_2026_CVPR,
  author    = {Madan, Neelu and Vidal, Alex Pujol and Mogelmose, Andreas and Escalera, Sergio and Nasrollahi, Kamal and Taylor, Graham W. and Moeslund, Thomas B.},
  title     = {A Hyperbolic Perspective on Hierarchical Structure in Object-Centric Scene Representations},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {8476--8485},
}

VVitCutLER: Towards Unsupervised Object Detection and Segmentation in Videos
Zhijing Lu,
Khurram Azeem Hashmi,
Didier Stricker,
Muhammad Zeshan Afzal
[pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Lu_2026_CVPR,
  author    = {Lu, Zhijing and Hashmi, Khurram Azeem and Stricker, Didier and Afzal, Muhammad Zeshan},
  title     = {{VVitCutLER}: Towards Unsupervised Object Detection and Segmentation in Videos},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {8466--8475},
}

Phrase-Instance Alignment for Generalized Referring Segmentation
E-Ro Nguyen,
Hieu Le,
Dimitris Samaras,
Michael S. Ryoo
[pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Nguyen_2026_CVPR,
  author    = {Nguyen, E-Ro and Le, Hieu and Samaras, Dimitris and Ryoo, Michael S.},
  title     = {Phrase-Instance Alignment for Generalized Referring Segmentation},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {8486--8495},
}

Report of the 5th PVUW Challenge: Towards More Diverse Modalities in Pixel-Level Understanding
Chang Liu,
Henghui Ding,
Nikhila Ravi,
Yunchao Wei,
Shuting He,
Song Bai,
Philip Torr,
Leilei Cao,
Jinrong Zhang,
Deshui Miao,
Xusheng He,
Dengxian Gong,
Zhiyu Wang,
Mingqi Gao,
Jihwan Hong
[pdf] [arXiv]
[bibtex]
@InProceedings{Liu_2026_CVPR,
  author    = {Liu, Chang and Ding, Henghui and Ravi, Nikhila and Wei, Yunchao and He, Shuting and Bai, Song and Torr, Philip and Cao, Leilei and Zhang, Jinrong and Miao, Deshui and He, Xusheng and Gong, Dengxian and Wang, Zhiyu and Gao, Mingqi and Hong, Jihwan},
  title     = {Report of the 5th {PVUW} Challenge: Towards More Diverse Modalities in Pixel-Level Understanding},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {8456--8465},
}

A Paradigm Shift: Fully End-to-End Training for Temporal Sentence Grounding in Videos
Allen He,
Qi Liu,
Kun Liu,
Xinchen Liu,
Wu Liu
[pdf] [arXiv]
[bibtex]
@InProceedings{He_2026_CVPR,
  author    = {He, Allen and Liu, Qi and Liu, Kun and Liu, Xinchen and Liu, Wu},
  title     = {A Paradigm Shift: Fully End-to-End Training for Temporal Sentence Grounding in Videos},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {8434--8444},
}

Pixel-level Scene Understanding in One Token: Visual States Need What-is-Where Composition
Seokmin Lee,
Yunghee Lee,
Byeonghyun Pak,
Byeongju Woo
[pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Lee_2026_CVPR,
  author    = {Lee, Seokmin and Lee, Yunghee and Pak, Byeonghyun and Woo, Byeongju},
  title     = {Pixel-level Scene Understanding in One Token: Visual States Need What-is-Where Composition},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {8445--8455},
}

Semantic Alignment in Hyperbolic Space for Open-Vocabulary Semantic Segmentation
Hoang M. Truong,
Hai Nguyen-Truong,
Dang Huynh
[pdf]
[bibtex]
@InProceedings{Truong_2026_CVPR,
  author    = {Truong, Hoang M. and Nguyen-Truong, Hai and Huynh, Dang},
  title     = {Semantic Alignment in Hyperbolic Space for Open-Vocabulary Semantic Segmentation},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {8496--8505},
}

De-Supervision in Camouflaged Videos
Luca Alessandrini,
Antonino Maria Rizzo,
Luca Magri,
Giacomo Boracchi,
Federica Arrigoni
[pdf] [supp]
[bibtex]
@InProceedings{Alessandrini_2026_CVPR,
  author    = {Alessandrini, Luca and Rizzo, Antonino Maria and Magri, Luca and Boracchi, Giacomo and Arrigoni, Federica},
  title     = {De-Supervision in Camouflaged Videos},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {8401--8410},
}