@Article{info:doi/10.2196/67488,
  author="Dai, Zhang-Yi and Wang, Fu-Qiang and Shen, Cheng and Ji, Yan-Li and Li, Zhi-Yang and Wang, Yun and Pu, Qiang",
  title="Accuracy of Large Language Models for Literature Screening in Thoracic Surgery: Diagnostic Study",
  journal="J Med Internet Res",
  year="2025",
  month="Mar",
  day="11",
  volume="27",
  pages="e67488",
  keywords="accuracy; large language models; meta-analysis; literature screening; thoracic surgery",
  abstract="Background: Systematic reviews and meta-analyses rely on labor-intensive literature screening. While machine learning offers potential automation, its accuracy remains suboptimal. This raises the question of whether emerging large language models (LLMs) can provide a more accurate and efficient approach. Objective: This paper evaluates the sensitivity, specificity, and summary receiver operating characteristic (SROC) curve of LLM-assisted literature screening. Methods: We conducted a diagnostic study comparing the accuracy of LLM-assisted versus manual literature screening across 6 thoracic surgery meta-analyses. Manual screening by 2 investigators served as the reference standard. LLM-assisted screening was performed using ChatGPT-4o (OpenAI) and Claude-3.5 Sonnet (Anthropic), with discrepancies resolved by Gemini-1.5 Pro (Google). Two open-source, machine learning--based screening tools, ASReview (Utrecht University) and Abstrackr (Center for Evidence Synthesis in Health, Brown University School of Public Health), were also evaluated. We calculated sensitivity, specificity, and 95{\%} CIs for title and abstract screening as well as full-text screening, generating pooled estimates and SROC curves. LLM prompts were revised based on a post hoc error analysis. Results: LLM-assisted full-text screening demonstrated high pooled sensitivity (0.87, 95{\%} CI 0.77-0.99) and specificity (0.96, 95{\%} CI 0.91-0.98), with an area under the curve (AUC) of 0.96 (95{\%} CI 0.94-0.97). Title and abstract screening achieved a pooled sensitivity of 0.73 (95{\%} CI 0.57-0.85) and specificity of 0.99 (95{\%} CI 0.97-0.99), with an AUC of 0.97 (95{\%} CI 0.96-0.99). Post hoc prompt revisions improved sensitivity to 0.98 (95{\%} CI 0.74-1.00) while maintaining high specificity (0.98, 95{\%} CI 0.94-0.99). In comparison, ASReview-assisted screening achieved a pooled sensitivity of 0.58 (95{\%} CI 0.53-0.64) and specificity of 0.97 (95{\%} CI 0.91-0.99), with an AUC of 0.66 (95{\%} CI 0.62-0.70); Abstrackr-assisted screening achieved a pooled sensitivity of 0.48 (95{\%} CI 0.35-0.62) and specificity of 0.96 (95{\%} CI 0.88-0.99), with an AUC of 0.78 (95{\%} CI 0.74-0.82). A post hoc meta-analysis revealed comparable effect sizes between LLM-assisted and conventional screening. Conclusions: LLMs hold significant potential for streamlining literature screening in systematic reviews, reducing workload without sacrificing quality. Importantly, LLMs outperformed the traditional machine learning--based tools (ASReview and Abstrackr) in both sensitivity and AUC, suggesting that LLMs offer a more accurate and efficient approach to literature screening.",
  issn="1438-8871",
  doi="10.2196/67488",
  url="https://www.jmir.org/2025/1/e67488"
}