The use of ChatGPT and similar Large Language Model (LLM) tools in scholarly communication and academic publishing has been widely discussed since they became easily accessible to a general audience in late 2022. This study uses keywords known to be disproportionately present in LLM-generated text to provide an overall estimate for the prevalence of LLM-assisted writing in the scholarly literature. For the publishing year 2023, it is found that several of those keywords show a distinctive and disproportionate increase in their prevalence, individually and in combination. It is estimated that at least 60,000 papers (slightly over 1% of all articles) were LLM-assisted, though this number could be extended and refined by analysis of other characteristics of the papers or by identification of further indicative keywords.
%0 Generic
%1 gray2024chatgpt
%A Gray, Andrew
%D 2024
%K ChatGPT academic_publishing artificial_intelligence large_language_models scholarly_literature
%R 10.48550/arXiv.2403.16887
%T ChatGPT "contamination": estimating the prevalence of LLMs in the scholarly literature
%U https://arxiv.org/abs/2403.16887
%X The use of ChatGPT and similar Large Language Model (LLM) tools in scholarly communication and academic publishing has been widely discussed since they became easily accessible to a general audience in late 2022. This study uses keywords known to be disproportionately present in LLM-generated text to provide an overall estimate for the prevalence of LLM-assisted writing in the scholarly literature. For the publishing year 2023, it is found that several of those keywords show a distinctive and disproportionate increase in their prevalence, individually and in combination. It is estimated that at least 60,000 papers (slightly over 1% of all articles) were LLM-assisted, though this number could be extended and refined by analysis of other characteristics of the papers or by identification of further indicative keywords.
@comment{gray2024chatgpt: arXiv preprint record. In the title, the mixed-case
  product name and the acronym are brace-protected against style recasing, and
  the quoted word uses TeX quote ligatures. The percent sign in the abstract is
  escaped so that styles which print the abstract do not start a TeX comment.
  The added-at, biburl, interhash, intrahash and timestamp fields are kept
  verbatim from the export.}
@misc{gray2024chatgpt,
  abstract      = {The use of ChatGPT and similar Large Language Model (LLM) tools in scholarly communication and academic publishing has been widely discussed since they became easily accessible to a general audience in late 2022. This study uses keywords known to be disproportionately present in LLM-generated text to provide an overall estimate for the prevalence of LLM-assisted writing in the scholarly literature. For the publishing year 2023, it is found that several of those keywords show a distinctive and disproportionate increase in their prevalence, individually and in combination. It is estimated that at least 60,000 papers (slightly over 1\% of all articles) were LLM-assisted, though this number could be extended and refined by analysis of other characteristics of the papers or by identification of further indicative keywords.},
  added-at      = {2024-03-26T15:49:35.000+0100},
  archiveprefix = {arXiv},
  author        = {Gray, Andrew},
  biburl        = {https://www.bibsonomy.org/bibtex/2174296b0b70bc4239b5cf485c7d8386d/meneteqel},
  doi           = {10.48550/arXiv.2403.16887},
  eprint        = {2403.16887},
  interhash     = {2046d37457a2646b3322198bb6245d3e},
  intrahash     = {174296b0b70bc4239b5cf485c7d8386d},
  keywords      = {ChatGPT academic_publishing artificial_intelligence large_language_models scholarly_literature},
  language      = {en},
  month         = mar,
  primaryclass  = {cs.DL},
  timestamp     = {2024-03-26T15:49:35.000+0100},
  title         = {{ChatGPT} ``contamination'': estimating the prevalence of {LLMs} in the scholarly literature},
  url           = {https://arxiv.org/abs/2403.16887},
  year          = {2024},
}