% LREC 2026 conference paper. The DOI is stored bare (no resolver prefix) and the abstract is plain text (no HTML markup).
@inproceedings{bagdasarov-etal-2026-llms,
  title     = {Using {LLMs} for Automatic Discipline Annotation in a Diachronic Corpus of {English} Scientific Papers},
  author    = {Bagdasarov, Sergei and Alves, Diego and Fischer, Stefan and Teich, Elke},
  editor    = {Piperidis, Stelios and Bel, Núria and van den Heuvel, Henk and Ide, Nancy and Krek, Simon and Toral, Antonio},
  booktitle = {Proceedings of the Fifteenth Language Resources and Evaluation Conference (LREC 2026)},
  pages     = {2376--2386},
  publisher = {European Language Resources Association (ELRA)},
  address   = {Palma, Mallorca, Spain},
  year      = {2026},
  doi       = {10.63317/3j9wvu86v48t},
  url       = {https://lrec.elra.info/lrec2026-main-187},
  abstract  = {This study investigates the potential of generative large language models (LLMs) to automatically identify the disciplines of scientific papers in the Royal Society Corpus (RSC) – an extensive collection of English scientific publications spanning more than three centuries. We evaluated eight open-source, state-of-the-art LLMs from four model families on a manually annotated subset and further validated the three best-performing models on a corpus of modern scientific texts. These models were subsequently used for large-scale annotation of the RSC. The models exhibited robust and consistent performance, with at least two LLMs agreeing on the same label for 98.3\% of the documents. We then conducted an error analysis of papers assigned divergent labels and a diachronic case study of disciplinary trends within the corpus. The error analysis revealed that most discrepancies occurred in twentieth-century texts, reflecting the growing interdisciplinarity of research. The diachronic analysis showed a gradual decline in disciplinary diversity over time as well as fluctuations corresponding to major paradigm shifts such as the Chemical Revolution and key twentieth-century developments in Physics. The discipline labels generated by the three models will be made publicly available.},
  pubstate  = {published},
  type      = {inproceedings}
}