-
[pdf]
[arXiv]
[bibtex]@InProceedings{Chen_2025_ICCV, author = {Chen, Kexin and Du, Yuyang and Li, Junyou and Cao, Hanqun and Guo, Menghao and Dang, Xilin and Li, Lanqing and Qiu, Jiezhong and Chen, Guangyong and Heng, Pheng Ann}, title = {ChemMiner: A Large Language Model Agent System for Chemical Literature Data Mining}, booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops}, month = {October}, year = {2025}, pages = {7595-7603} }
ChemMiner: A Large Language Model Agent System for Chemical Literature Data Mining
Abstract
The development of AI-assisted chemical synthesis tools requires comprehensive datasets covering diverse reaction types, yet current high-throughput experimental (HTE) approaches are expensive and limited in scope. Chemical literature represents a vast, underexplored data source containing thousands of reactions published annually. However, extracting reaction information from literature faces significant challenges including varied writing styles, complex coreference relationships, and multimodal information presentation. This paper proposes ChemMiner, a novel end-to-end framework leveraging multiple agents powered by large language models (LLMs) to extract high-fidelity chemical data from literature. ChemMiner incorporates three specialized agents: a text analysis agent for coreference mapping, a multimodal agent for non-textual information extraction, and a synthesis analysis agent for data generation. Furthermore, we developed a comprehensive benchmark with expert-annotated chemical literature to evaluate both extraction efficiency and precision. Experimental results demonstrate reaction identification rates comparable to human chemists while significantly reducing processing time, with high accuracy, recall, and F1 scores. Our open-sourced benchmark facilitates future research in chemical literature data mining.
Related Material
