-
[pdf]
[supp]
[arXiv]
[bibtex]
@comment{Conference-paper entry for "OmniGen2: Towards Instruction-Aligned Multimodal Generation" (CVPR 2026). Braces inside title/booktitle protect the model name and acronyms from style-driven recasing.}
@InProceedings{Wu_2026_CVPR,
  author    = {Wu, Chenyuan and Wang, Jiahao and Zheng, Pengfei and Yan, Ruiran and Xiao, Shitao and Luo, Xin and Wang, Yueze and Li, Wanli and Jiang, Xiyan and Liu, Yexin and Zhou, Junjie and Xia, Ziyi and Liu, Ze and Li, Chaofan and Deng, Haoge and Luo, Kun and Zhang, Bo and Zhang, Jiajun and Liu, Dong and Lian, Defu and Wang, Xinlong and Wang, Zhongyuan and Huang, Tiejun and Liu, Zheng},
  title     = {{OmniGen2}: Towards Instruction-Aligned Multimodal Generation},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2026},
  pages     = {21964--21975},
}
OmniGen2: Towards Instruction-Aligned Multimodal Generation
Abstract
Multimodal generative models can process instructions in various modalities and demonstrate outstanding performance across a wide range of image generation tasks. However, their robustness in complex real-world scenarios remains limited due to insufficient generalized instruction alignment. We introduce OmniGen2, a unified multimodal generator designed to follow complex, fine-grained instructions. Our core contribution is a two-stage design that first builds a strong, world-knowledge-grounded foundation model and then aligns it using a progressive, multi-task instruction tuning strategy. The foundation model features a streamlined architecture with decoupled decoding for versatile multimodal generation and a novel positional encoding scheme to improve learning efficiency. We ground this model in real-world knowledge using large-scale data construction pipelines. Building on this foundation, we propose a progressive, reinforcement-based alignment process. This phase carefully schedules training tasks and reward signals to foster cross-task knowledge transfer, significantly improving the model's instruction-following capabilities. Our models demonstrate competitive performance on standard benchmarks and our dedicated in-context generation benchmark, OmniContext. We have released our models, code, benchmark, and training datasets at https://github.com/VectorSpaceLab/OmniGen2.
Related Material

