Expanding Horizons in AI Benchmarking: Multimodal Approaches


KOFFVQA: An Objectively Evaluated Free-form VQA Benchmark for Large Vision-Language Models in the Korean Language
Yoonshik Kim,
Jaeyoon Jung
[pdf] [arXiv]
[bibtex]
@InProceedings{Kim_2025_CVPR,
  author    = {Kim, Yoonshik and Jung, Jaeyoon},
  title     = {{KOFFVQA}: An Objectively Evaluated Free-form {VQA} Benchmark for Large Vision-Language Models in the {Korean} Language},
  booktitle = {Proceedings of the Computer Vision and Pattern Recognition Conference (CVPR) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {575--585},
}

Behind the Magic, MERLIM: Multi-modal Evaluation Benchmark for Large Image-Language Models
Andrés Villa,
Juan Léon,
Alvaro Soto,
Bernard Ghanem
[pdf] [supp]
[bibtex]
@InProceedings{Villa_2025_CVPR,
  author    = {Villa, Andr{\'e}s and L{\'e}on, Juan and Soto, Alvaro and Ghanem, Bernard},
  title     = {Behind the Magic, {MERLIM}: Multi-modal Evaluation Benchmark for Large Image-Language Models},
  booktitle = {Proceedings of the Computer Vision and Pattern Recognition Conference (CVPR) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {492--502},
}

Beyond Raw Videos: Understanding Edited Videos with Large Multimodal Model
Lu Xu,
Sijie Zhu,
Chunyuan Li,
Chia-Wen Kuo,
Fan Chen,
Xinyao Wang,
Guang Chen,
Dawei Du,
Ye Yuan,
Longyin Wen
[pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Xu_2025_CVPR,
  author    = {Xu, Lu and Zhu, Sijie and Li, Chunyuan and Kuo, Chia-Wen and Chen, Fan and Wang, Xinyao and Chen, Guang and Du, Dawei and Yuan, Ye and Wen, Longyin},
  title     = {Beyond Raw Videos: Understanding Edited Videos with Large Multimodal Model},
  booktitle = {Proceedings of the Computer Vision and Pattern Recognition Conference (CVPR) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {503--512},
}

Revisiting Multi-Modal LLM Evaluation
Jian Lu,
Shikhar Srivastava,
Junyu Chen,
Robik Shrestha,
Manoj Acharya,
Kushal Kafle,
Christopher Kanan
[pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Lu_2025_CVPR,
  author    = {Lu, Jian and Srivastava, Shikhar and Chen, Junyu and Shrestha, Robik and Acharya, Manoj and Kafle, Kushal and Kanan, Christopher},
  title     = {Revisiting Multi-Modal {LLM} Evaluation},
  booktitle = {Proceedings of the Computer Vision and Pattern Recognition Conference (CVPR) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {555--564},
}

Revisiting Referring Expression Comprehension Evaluation in the Era of Large Multimodal Models
Jierun Chen,
Fangyun Wei,
Jinjing Zhao,
Sizhe Song,
Bohuai Wu,
Zhuoxuan Peng,
S.-H. Gary Chan,
Hongyang Zhang
[pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Chen_2025_CVPR,
  author    = {Chen, Jierun and Wei, Fangyun and Zhao, Jinjing and Song, Sizhe and Wu, Bohuai and Peng, Zhuoxuan and Chan, S.-H. Gary and Zhang, Hongyang},
  title     = {Revisiting Referring Expression Comprehension Evaluation in the Era of Large Multimodal Models},
  booktitle = {Proceedings of the Computer Vision and Pattern Recognition Conference (CVPR) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {513--524},
}

Choosing 'Right' from Wrong: A Closer Look at Selection Bias in Spatial Multiple-Choice Questions in Large Multimodal Models
Giselle Zeno,
Nour Jedidi,
Steven Gomez
[pdf] [supp]
[bibtex]
@InProceedings{Zeno_2025_CVPR,
  author    = {Zeno, Giselle and Jedidi, Nour and Gomez, Steven},
  title     = {Choosing `Right' from Wrong: A Closer Look at Selection Bias in Spatial Multiple-Choice Questions in Large Multimodal Models},
  booktitle = {Proceedings of the Computer Vision and Pattern Recognition Conference (CVPR) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {535--544},
}

Quantum Federated Learning for Multimodal Data: A Modality-Agnostic Approach
Atit Pokharel,
Ratun Rahman,
Thomas Morris,
Dinh C. Nguyen
[pdf]
[bibtex]
@InProceedings{Pokharel_2025_CVPR,
  author    = {Pokharel, Atit and Rahman, Ratun and Morris, Thomas and Nguyen, Dinh C.},
  title     = {Quantum Federated Learning for Multimodal Data: A Modality-Agnostic Approach},
  booktitle = {Proceedings of the Computer Vision and Pattern Recognition Conference (CVPR) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {545--554},
}

MerCulture: A Comprehensive Benchmark to Evaluate Vision-Language Models on Cultural Understanding in Singapore
Pranav Tushar,
Eshan Pandey,
Lyka Diane Bala Austria,
Yin Yin Loo,
Jing Hao Lim,
Indriyati Atmosukarto,
Donny Soh Cheng Lock
[pdf]
[bibtex]
@InProceedings{Tushar_2025_CVPR,
  author    = {Tushar, Pranav and Pandey, Eshan and Austria, Lyka Diane Bala and Loo, Yin Yin and Lim, Jing Hao and Atmosukarto, Indriyati and Lock, Donny Soh Cheng},
  title     = {{MerCulture}: A Comprehensive Benchmark to Evaluate Vision-Language Models on Cultural Understanding in {Singapore}},
  booktitle = {Proceedings of the Computer Vision and Pattern Recognition Conference (CVPR) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {565--574},
}

TextInVision: Text and Prompt Complexity Driven Visual Text Generation Benchmark
Forouzan Fallah,
Maitreya Patel,
Agneet Chatterjee,
Vlad Morariu,
Chitta Baral,
Yezhou Yang
[pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Fallah_2025_CVPR,
  author    = {Fallah, Forouzan and Patel, Maitreya and Chatterjee, Agneet and Morariu, Vlad and Baral, Chitta and Yang, Yezhou},
  title     = {{TextInVision}: Text and Prompt Complexity Driven Visual Text Generation Benchmark},
  booktitle = {Proceedings of the Computer Vision and Pattern Recognition Conference (CVPR) Workshops},
  month     = jun,
  year      = {2025},
  pages     = {525--534},
}