-
[pdf]
[supp]
[arXiv]
[bibtex]
@InProceedings{Zhang_2026_CVPR,
  author    = {Zhang, Baoheng and Liu, Jiahui and Zhao, Gui and Zhang, Weizhou and Ma, Yixuan and Jiang, Jun and Chen, Yingxian and Fok, Wilton W. T. and Qi, Xiaojuan and So, Hayden Kwok-Hay},
  title     = {Learning to See through Illumination Extremes with Event Streaming in Multimodal Large Language Models},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2026},
  pages     = {26198--26208},
}
Learning to See through Illumination Extremes with Event Streaming in Multimodal Large Language Models
Abstract
Multimodal Large Language Models (MLLMs) perform strong vision-language reasoning under standard conditions but fail in extreme illumination, where RGB inputs irrevocably lose structure and semantics. We propose Event-MLLM, an event-enhanced model that performs all-light visual reasoning by dynamically fusing event streams with RGB frames. Two key components drive our approach: an Illumination Indicator -- a learnable signal derived from a DINOv2 branch that represents exposure degradation and adaptively modulates event-RGB fusion -- and an Illumination Correction Loss that aligns fused features with non-degraded (normal-light) semantics in the latent space, compensating for information lost in extreme lighting. We curate the first multi-illumination event-instruction corpus for MLLMs, with 2,241 event-RGB samples (around 6 QA pairs each) across diverse scenes and 17 brightness rates (0.05x to 20x), plus an instruction-following benchmark for reasoning, counting, and fine-grained recognition under extreme lighting. Experiments show that Event-MLLM markedly outperforms general-purpose, illumination-adaptive, and event-only baselines, setting a new state of the art in robust multimodal perception and reasoning under challenging illumination.
Related Material

