-
[pdf]
[supp]
[bibtex]
% Conference-paper entry for PromptEnhancer (CVPR 2026 proceedings).
% month uses the predefined BibTeX macro, pages use a double-hyphen range, and
% case-sensitive names are brace-protected so sentence-casing styles keep them.
@InProceedings{Wang_2026_CVPR,
    author    = {Wang, Linqing and Xu, Zhiyong and Xing, Ximing and Cheng, Yiji and Zhao, Zhiyuan and Li, Donghao and Hang, Tiankai and Li, Zhenxi and Tao, Jiale and Wang, Qixun and Li, Ruihuang and Chen, Comi and Li, Xin and Wu, Mingrui and Deng, Xinchi and Gu, Shuyang and Wang, Chunyu and Lu, Qinglin},
    title     = {{PromptEnhancer}: Taming Your Rewriter for Text-to-Image Generation via Fine-Grained Reward},
    booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
    month     = jun,
    year      = {2026},
    pages     = {14895--14904},
}
PromptEnhancer: Taming Your Rewriter for Text-to-Image Generation via Fine-Grained Reward
Abstract
Recent text-to-image (T2I) diffusion models have achieved impressive progress in generating high-fidelity images, yet they often fail to faithfully follow complex user prompts, especially in attribute binding, negation, and compositional reasoning. To address this limitation, we propose PromptEnhancer, a universal prompt rewriting framework that improves prompt interpretability for any pre-trained T2I model. PromptEnhancer is trained through a multi-stage pipeline. We first perform supervised fine-tuning on chain-of-thought-style rewriting data to endow the rewriter with structured prompt analysis and rewriting capabilities. We then introduce AlignEvaluator, a task-specific reward model that provides explicit and fine-grained feedback based on a taxonomy of common T2I failure modes, and further optimize the rewriter with reinforcement learning. This design enables the rewriter to produce prompts that are more precise, semantically complete, and easier for T2I models to follow. To support evaluation, we also construct a comprehensive human-aligned benchmark covering diverse semantic and compositional challenges. Extensive experiments show that PromptEnhancer consistently improves image-text alignment across multiple T2I models and significantly enhances prompt fidelity on challenging cases.
Related Material

