9th Multimodal Learning and Applications Workshop


VidText: Towards Comprehensive Evaluation for Video Text Understanding
Zhoufaran Yang,
Yan Shu,
Jing Wang,
Zhifei Yang,
Yan Zhang,
Huaying Yuan,
Yu Li,
Keyang Lu,
Gangyan Zeng,
Shaohui Liu,
Yu Zhou,
Nicu Sebe
[pdf] [supp] [arXiv]
[bibtex]
% Workshop paper entry. The month uses the predefined macro, the page range
% uses a double hyphen, and case-sensitive names are brace-protected so that
% sentence-casing styles keep their capitalisation.
@InProceedings{Yang_2026_CVPR,
  author    = {Yang, Zhoufaran and Shu, Yan and Wang, Jing and Yang, Zhifei and Zhang, Yan and Yuan, Huaying and Li, Yu and Lu, Keyang and Zeng, Gangyan and Liu, Shaohui and Zhou, Yu and Sebe, Nicu},
  title     = {{VidText}: Towards Comprehensive Evaluation for Video Text Understanding},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2026},
  pages     = {7575--7585},
}