@inproceedings{bagdasarov-etal-2026-llms, title = {Using LLMs for Automatic Discipline Annotation in a Diachronic Corpus of English Scientific Papers}, author = {Sergei Bagdasarov and Diego Alves and Stefan Fischer and Elke Teich}, editor = {Stelios Piperidis and Núria Bel and Henk van den Heuvel and Nancy Ide and Simon Krek and Antonio Toral}, url = {https://lrec.elra.info/lrec2026-main-187}, doi = {10.63317/3j9wvu86v48t}, year = {2026}, date = {2026}, booktitle = {Proceedings of the Fifteenth Language Resources and Evaluation Conference (LREC 2026)}, pages = {2376--2386}, publisher = {European Language Resources Association (ELRA)}, address = {Palma, Mallorca, Spain}, abstract = {
This study investigates the potential of generative large language models (LLMs) to automatically identify the disciplines of scientific papers in the Royal Society Corpus (RSC) – an extensive collection of English scientific publications spanning more than three centuries. We evaluated eight open-source, state-of-the-art LLMs from four model families on a manually annotated subset and further validated the three best-performing models on a corpus of modern scientific texts. These models were subsequently used for large-scale annotation of the RSC. The models exhibited robust and consistent performance, with at least two LLMs agreeing on the same label for 98.3% of the documents. We then conducted an error analysis of papers assigned divergent labels and a diachronic case study of disciplinary trends within the corpus. The error analysis revealed that most discrepancies occurred in twentieth-century texts, reflecting the growing interdisciplinarity of research. The diachronic analysis showed a gradual decline in disciplinary diversity over time as well as fluctuations corresponding to major paradigm shifts such as the Chemical Revolution and key twentieth-century developments in Physics. The discipline labels generated by the three models will be made publicly available.