@Article{info:doi/10.2196/jmir.6533,
  author   = "Gibbons, Chris and Richards, Suzanne and Valderas, Jose Maria and Campbell, John",
  title    = "Supervised Machine Learning Algorithms Can Classify Open-Text Feedback of Doctor Performance With Human-Level Accuracy",
  journal  = "J Med Internet Res",
  year     = "2017",
  month    = "Mar",
  day      = "15",
  volume   = "19",
  number   = "3",
  pages    = "e65",
  keywords = "machine learning; surveys and questionnaires; feedback; data mining; work performance",
  abstract = "Background: Machine learning techniques may be an effective and efficient way to classify open-text reports on doctor's activity for the purposes of quality assurance, safety, and continuing professional development. Objective: The objective of the study was to evaluate the accuracy of machine learning algorithms trained to classify open-text reports of doctor performance and to assess the potential for classifications to identify significant differences in doctors' professional performance in the United Kingdom. Methods: We used 1636 open-text comments (34,283 words) relating to the performance of 548 doctors collected from a survey of clinicians' colleagues using the General Medical Council Colleague Questionnaire (GMC-CQ). We coded 77.75{\%} (1272/1636) of the comments into 5 global themes (innovation, interpersonal skills, popularity, professionalism, and respect) using a qualitative framework. We trained 8 machine learning algorithms to classify comments and assessed their performance using several training samples. We evaluated doctor performance using the GMC-CQ and compared scores between doctors with different classifications using t tests. Results: Individual algorithm performance was high (range F score=.68 to .83). Interrater agreement between the algorithms and the human coder was highest for codes relating to ``popular'' (recall=.97), ``innovator'' (recall=.98), and ``respected'' (recall=.87) codes and was lower for the ``interpersonal'' (recall=.80) and ``professional'' (recall=.82) codes. A 10-fold cross-validation demonstrated similar performance in each analysis. When combined together into an ensemble of multiple algorithms, mean human-computer interrater agreement was .88. Comments that were classified as ``respected,'' ``professional,'' and ``interpersonal'' related to higher doctor scores on the GMC-CQ compared with comments that were not classified (P<.05). Scores did not vary between doctors who were rated as popular or innovative and those who were not rated at all (P>.05). Conclusions: Machine learning algorithms can classify open-text feedback of doctor performance into multiple themes derived by human raters with high performance. Colleague open-text comments that signal respect, professionalism, and being interpersonal may be key indicators of doctor's performance.",
  issn     = "1438-8871",
  doi      = "10.2196/jmir.6533",
  url      = "http://www.jmir.org/2017/3/e65/",
  note     = "PMID: 28298265"
}