@Article{info:doi/10.2196/63631,
  author="Kim, Kwanho and Kim, Soojong",
  title="Large Language Models' Accuracy in Emulating Human Experts' Evaluation of Public Sentiments about Heated Tobacco Products on Social Media: Evaluation Study",
  journal="J Med Internet Res",
  year="2025",
  month="Mar",
  day="4",
  volume="27",
  pages="e63631",
  keywords="heated tobacco products; artificial intelligence; large language models; social media; sentiment analysis; ChatGPT; generative pre-trained transformer; GPT; LLM; NLP; natural language processing; machine learning; language model; sentiment; evaluation; tobacco; alternative; prevention; nicotine; OpenAI",
  abstract="Background: Sentiment analysis of alternative tobacco products discussed on social media is crucial in tobacco control research. Large language models (LLMs) are artificial intelligence models trained on extensive text data to emulate the linguistic patterns of humans. LLMs may hold the potential to streamline the time-consuming and labor-intensive process of human sentiment analysis. Objective: This study aimed to examine the accuracy of LLMs in replicating human sentiment evaluation of social media messages relevant to heated tobacco products (HTPs). Methods: GPT-3.5 and GPT-4 Turbo (OpenAI) were used to classify 500 Facebook (Meta Platforms) and 500 Twitter (subsequently rebranded X) messages. Each set consisted of 200 human-labeled anti-HTP messages, 200 pro-HTP messages, and 100 neutral messages. Each model evaluated each message up to 20 times, generating multiple response instances that reported its classification decisions, and the majority label across these responses was assigned as the model's decision for the message. The models' classification decisions were then compared with those of human evaluators. Results: GPT-3.5 accurately replicated human sentiment evaluation in 61.2{\%} of Facebook messages and 57{\%} of Twitter messages. GPT-4 Turbo demonstrated higher accuracy overall, with 81.7{\%} for Facebook messages and 77{\%} for Twitter messages. GPT-4 Turbo's accuracy with 3 response instances reached 99{\%} of the accuracy achieved with 20 response instances. GPT-4 Turbo's accuracy was higher for human-labeled anti- and pro-HTP messages than for neutral messages. Most GPT-3.5 misclassifications occurred when anti- or pro-HTP messages were incorrectly classified as neutral or irrelevant, whereas GPT-4 Turbo showed improvements across all sentiment categories and reduced misclassifications, especially of messages incorrectly categorized as irrelevant. Conclusions: LLMs can be used to analyze sentiment in social media messages about HTPs. Results from GPT-4 Turbo suggest that accuracy can reach approximately 80{\%} agreement with human experts, even with a small number of labeling decisions generated by the model. A potential risk of using LLMs is misrepresentation of the overall sentiment due to differences in accuracy across sentiment categories. Although this issue could be reduced with the newer language model, future efforts should explore the mechanisms underlying the discrepancies and how to address them systematically.",
  issn="1438-8871",
  doi="10.2196/63631",
  url="https://www.jmir.org/2025/1/e63631",
  pmid="40053746"
}
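The abstract describes a repeated-sampling, majority-vote labeling procedure: each message is classified by the model up to 20 times and the majority label is taken as the model's decision. The following is a minimal sketch of that procedure, placed here as inter-entry commentary (BibTeX ignores text outside @-records). It assumes the OpenAI Python client (openai>=1.0); the prompt wording, label set, temperature, and the classify_message helper are illustrative assumptions, not the authors' exact protocol.

# Minimal sketch of majority-vote sentiment classification, assuming the
# OpenAI chat completions API. Prompt, labels, and defaults are hypothetical.
from collections import Counter
from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment

LABELS = {"anti-HTP", "pro-HTP", "neutral", "irrelevant"}
SYSTEM_PROMPT = (
    "Classify the sentiment of the following social media message about "
    "heated tobacco products. Answer with exactly one of: "
    "anti-HTP, pro-HTP, neutral, irrelevant."
)

def classify_message(text, n_samples=20, model="gpt-4-turbo"):
    """Sample n_samples labels from the model; return the majority label."""
    votes = []
    for _ in range(n_samples):
        resp = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": text},
            ],
            temperature=1.0,  # nonzero so repeated calls can disagree
        )
        label = resp.choices[0].message.content.strip()
        if label in LABELS:  # discard malformed responses
            votes.append(label)
    # Majority vote over the collected response instances
    return Counter(votes).most_common(1)[0][0] if votes else "irrelevant"

The abstract's finding that 3 response instances recover 99{\%} of the accuracy of 20 suggests n_samples can be set well below 20 in practice, substantially reducing API cost.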