Publications

Alves, Diego

Information Theory and Linguistic Variation: A Study of Brazilian and European Portuguese Inproceedings

Scherrer, Yves; Jauhiainen, Tommi; Ljubešić, Nikola; Nakov, Preslav; Tiedemann, Jorg; Zampieri, Marcos (Ed.): Proceedings of the 12th Workshop on NLP for Similar Languages, Varieties and Dialects, Association for Computational Linguistics, pp. 9-19, Abu Dhabi, UAE, 2025.

We present a general analysis of the lexical and grammatical differences between Brazilian and European Portuguese by applying entropy measures, including Kullback-Leibler divergence and word order entropy, across various linguistic levels. Using a parallel corpus of BP and EP sentences translated from English, we quantified these differences and identified characteristic phenomena underlying the divergences between the two varieties. The highest divergence was observed at the lexical level due to word pairs unique to each variety but also related to grammatical distinctions. Furthermore, the analysis of parts-of-speech (POS), dependency relations, and POS tri-grams provided information concerning distinctive grammatical constructions. Finally, the word order entropy analysis revealed that while most of the syntactic features analysed showed similar patterns across BP and EP, specific word order preferences were still apparent.

@inproceedings{alves-2025-information,
  title     = {Information Theory and Linguistic Variation: A Study of {Brazilian} and {European} {Portuguese}},
  author    = {Alves, Diego},
  editor    = {Scherrer, Yves and Jauhiainen, Tommi and Ljube{\v{s}}i{\'c}, Nikola and Nakov, Preslav and Tiedemann, Jorg and Zampieri, Marcos},
  url       = {https://aclanthology.org/2025.vardial-1.2/},
  year      = {2025},
  date      = {2025},
  booktitle = {Proceedings of the 12th Workshop on NLP for Similar Languages, Varieties and Dialects},
  pages     = {9--19},
  publisher = {Association for Computational Linguistics},
  address   = {Abu Dhabi, UAE},
  abstract  = {We present a general analysis of the lexical and grammatical differences between Brazilian and European Portuguese by applying entropy measures, including Kullback-Leibler divergence and word order entropy, across various linguistic levels. Using a parallel corpus of BP and EP sentences translated from English, we quantified these differences and identified characteristic phenomena underlying the divergences between the two varieties. The highest divergence was observed at the lexical level due to word pairs unique to each variety but also related to grammatical distinctions. Furthermore, the analysis of parts-of-speech (POS), dependency relations, and POS tri-grams provided information concerning distinctive grammatical constructions. Finally, the word order entropy analysis revealed that while most of the syntactic features analysed showed similar patterns across BP and EP, specific word order preferences were still apparent.},
  pubstate  = {published},
  type      = {inproceedings}
}

Copy BibTeX to Clipboard

Project:   B1

Alves, Diego

Diachronic Analysis of Phrasal Verbs in English Scientific Writing Inproceedings

Proceedings of the Joint 25th Nordic Conference on Computational Linguistics and 11th Baltic Conference on Human Language Technologies (NoDaLiDa/Baltic-HLT 2025), University of Tartu Library, Tallinn, Estonia, 2025.
Phrasal verbs (PVs) are a specific type of multi-word expressions and a specific feature of the English language. However, their usage in scientific prose is limited. Our study focuses on the analysis of phrasal verbs in the scientific domain using information theory methods to describe diachronic phenomena such as conventionalization and diversification regarding the usage of PVs. Thus, we analysed their developmental trajectory over time from the mid-17th century to the end of the 20th century by measuring the relative entropy (Kullback-Leibler divergence), predictability in context of the phrasal verbs particles (surprisal), and the paradigmatic variability using word embedding spaces. We were able to identify interesting phenomena such as the process of conventionalization over the 20th century and the peaks of diversification throughout the centuries.

@inproceedings{Alves-2025,
  title     = {Diachronic Analysis of Phrasal Verbs in {English} Scientific Writing},
  author    = {Alves, Diego},
  url       = {https://dspace.ut.ee/items/ef26bd7f-e708-41b3-b5c8-84cf8057ab71},
  year      = {2025},
  date      = {2025},
  booktitle = {Proceedings of the Joint 25th Nordic Conference on Computational Linguistics and 11th Baltic Conference on Human Language Technologies (NoDaLiDa/Baltic-HLT 2025)},
  publisher = {University of Tartu Library},
  address   = {Tallinn, Estonia},
  abstract  = {Phrasal verbs (PVs) are a specific type of multi-word expressions and a specific feature of the English language. However, their usage in scientific prose is limited. Our study focuses on the analysis of phrasal verbs in the scientific domain using information theory methods to describe diachronic phenomena such as conventionalization and diversification regarding the usage of PVs. Thus, we analysed their developmental trajectory over time from the mid-17th century to the end of the 20th century by measuring the relative entropy (Kullback-Leibler divergence), predictability in context of the phrasal verbs particles (surprisal), and the paradigmatic variability using word embedding spaces. We were able to identify interesting phenomena such as the process of conventionalization over the 20th century and the peaks of diversification throughout the centuries.},
  pubstate  = {published},
  type      = {inproceedings}
}

Copy BibTeX to Clipboard

Project:   B1

Menzel, Katrin

Noun + noun Compounds and Verbal Complements as Non-normalised Features in Late Modern English Scientific Translations Inproceedings

Proceedings of 7th Translation in Transition Conference, Batumi: Shota Rustaveli State University, 2024.

This paper presents a study on the usage of noun+noun compounds and verbal complement structures in 18th century scientific articles in the Royal Society Corpus (RSC) comparing translated to non-translated English texts. Departing from the hypothesis that the translations will conform stronger to traditional patterns of the English language, the analysis shows that these historical translations and non-translated texts are similarly marked by the ongoing reorganisation of the noun phrase, but translations contain more innovative complementation patterns. Additionally, a surprisal analysis shows that the analysed patterns tend to occur in more predictable and conventionalised contexts in non-translated texts than in translation.

 

@inproceedings{Menzel2024Noun,
  title     = {Noun + noun Compounds and Verbal Complements as Non-normalised Features in {Late Modern English} Scientific Translations},
  author    = {Menzel, Katrin},
  url       = {https://sites.google.com/view/tt2024/schedule-and-proceedings},
  year      = {2024},
  date      = {2024-12-26},
  booktitle = {Proceedings of 7th Translation in Transition Conference},
  publisher = {Shota Rustaveli State University},
  address   = {Batumi},
  abstract  = {This paper presents a study on the usage of noun+noun compounds and verbal complement structures in 18th century scientific articles in the Royal Society Corpus (RSC) comparing translated to non-translated English texts. Departing from the hypothesis that the translations will conform stronger to traditional patterns of the English language, the analysis shows that these historical translations and non-translated texts are similarly marked by the ongoing reorganisation of the noun phrase, but translations contain more innovative complementation patterns. Additionally, a surprisal analysis shows that the analysed patterns tend to occur in more predictable and conventionalised contexts in non-translated texts than in translation.},
  pubstate  = {published},
  type      = {inproceedings}
}

Copy BibTeX to Clipboard

Project:   B1

Menzel, Katrin

Initialisms in Scientific Writing in the 19th and Early 20th Centuries Journal Article

Zeitschrift für Wortbildung / Journal of Word Formation (ZWJW) (Special issue Historical English Word-Formation), 8, pp. 7-27, 2024.
This paper focusses on the role of initialisms in scientific English articles in the Royal Society Corpus (Fischer et al. 2020; Kermes et al. 2016). The development of scientific initialisms is illustrated with frequency data, a discussion of the evolution of the text topics obtained from topic modelling and an analysis of the development of information-theoretic surprisal values of initialisms in three time spans between 1830 and 1919. The overall frequency and diversity of initialisms for scientific concepts has risen considerably between 1830 and 1919 in the context of the ongoing specialisation of the sciences. Particularly from the 1860s onwards scientific initialisms increasingly become shortcuts for multiword units with wordhood and term status. The surprisal values of scientific initialisms decrease over time as such forms more regularly occur in conventionalised textual contexts and fixed expressions. Overall, the analysis of the RSC texts shows that key developments towards the conventionalisation of scientific initialisms as term formation patterns took place in the transitional period from Late Modern to Present-day English.

@article{Menzel2024,
  title    = {Initialisms in Scientific Writing in the 19th and Early 20th Centuries},
  author   = {Menzel, Katrin},
  url      = {https://journals.linguistik.de/zwjw/article/view/108},
  doi      = {10.21248/zwjw.2024.2.108},
  year     = {2024},
  date     = {2024},
  journal  = {Zeitschrift f{\"u}r Wortbildung / Journal of Word Formation (ZWJW) (Special issue Historical English Word-Formation)},
  pages    = {7--27},
  volume   = {8},
  number   = {2},
  abstract = {This paper focusses on the role of initialisms in scientific English articles in the Royal Society Corpus (Fischer et al. 2020; Kermes et al. 2016). The development of scientific initialisms is illustrated with frequency data, a discussion of the evolution of the text topics obtained from topic modelling and an analysis of the development of information-theoretic surprisal values of initialisms in three time spans between 1830 and 1919. The overall frequency and diversity of initialisms for scientific concepts has risen considerably between 1830 and 1919 in the context of the ongoing specialisation of the sciences. Particularly from the 1860s onwards scientific initialisms increasingly become shortcuts for multiword units with wordhood and term status. The surprisal values of scientific initialisms decrease over time as such forms more regularly occur in conventionalised textual contexts and fixed expressions. Overall, the analysis of the RSC texts shows that key developments towards the conventionalisation of scientific initialisms as term formation patterns took place in the transitional period from Late Modern to Present-day English.},
  pubstate = {published},
  type     = {article}
}

Copy BibTeX to Clipboard

Project:   B1

Steuer, Julius; Krielke, Marie-Pauline; Fischer, Stefan; Degaetano-Ortlieb, Stefania; Mosbach, Marius; Klakow, Dietrich

Modeling Diachronic Change in English Scientific Writing over 300+ Years with Transformer-based Language Model Surprisal Inproceedings

Zweigenbaum, Pierre; Rapp, Reinhard; Sharoff, Serge (Ed.): Proceedings of the 17th Workshop on Building and Using Comparable Corpora (BUCC) @ LREC-COLING 2024, ELRA and ICCL, pp. 12-23, Torino, Italia, 2024.

This study presents an analysis of diachronic linguistic changes in English scientific writing, utilizing surprisal from transformer-based language models. Unlike traditional n-gram models, transformer-based models are potentially better at capturing nuanced linguistic changes such as long-range dependencies by considering variable context sizes. However, to create diachronically comparable language models there are several challenges with historical data, notably an exponential increase in no. of texts, tokens per text and vocabulary size over time. We address these by using a shared vocabulary and employing a robust training strategy that includes initial uniform sampling from the corpus and continuing pre-training on specific temporal segments. Our empirical analysis highlights the predictive power of surprisal from transformer-based models, particularly in analyzing complex linguistic structures like relative clauses. The models’ broader contextual awareness and the inclusion of dependency length annotations contribute to a more intricate understanding of communicative efficiency. While our focus is on scientific English, our approach can be applied to other low-resource scenarios.

@inproceedings{steuer-etal-2024-modeling,
  title     = {Modeling Diachronic Change in {English} Scientific Writing over 300+ Years with Transformer-based Language Model Surprisal},
  author    = {Steuer, Julius and Krielke, Marie-Pauline and Fischer, Stefan and Degaetano-Ortlieb, Stefania and Mosbach, Marius and Klakow, Dietrich},
  editor    = {Zweigenbaum, Pierre and Rapp, Reinhard and Sharoff, Serge},
  url       = {https://aclanthology.org/2024.bucc-1.2/},
  year      = {2024},
  date      = {2024},
  booktitle = {Proceedings of the 17th Workshop on Building and Using Comparable Corpora (BUCC) @ LREC-COLING 2024},
  pages     = {12--23},
  publisher = {ELRA and ICCL},
  address   = {Torino, Italia},
  abstract  = {This study presents an analysis of diachronic linguistic changes in English scientific writing, utilizing surprisal from transformer-based language models. Unlike traditional n-gram models, transformer-based models are potentially better at capturing nuanced linguistic changes such as long-range dependencies by considering variable context sizes. However, to create diachronically comparable language models there are several challenges with historical data, notably an exponential increase in no. of texts, tokens per text and vocabulary size over time. We address these by using a shared vocabulary and employing a robust training strategy that includes initial uniform sampling from the corpus and continuing pre-training on specific temporal segments. Our empirical analysis highlights the predictive power of surprisal from transformer-based models, particularly in analyzing complex linguistic structures like relative clauses. The models' broader contextual awareness and the inclusion of dependency length annotations contribute to a more intricate understanding of communicative efficiency. While our focus is on scientific English, our approach can be applied to other low-resource scenarios.},
  pubstate  = {published},
  type      = {inproceedings}
}

Copy BibTeX to Clipboard

Projects:   B1 B4

Bagdasarov, Sergei; Teich, Elke

Multi-word expressions in biomedical abstracts and their plain English adaptations Inproceedings

Hämäläinen, Mika; Öhman, Emily; Miyagawa, So; Alnajjar, Khalid; Bizzoni, Yuri (Ed.): Proceedings of the 4th International Conference on Natural Language Processing for Digital Humanities, Association for Computational Linguistics, pp. 483-488, Miami, USA, 2024.

This study analyzes the use of multi-word expressions (MWEs), prefabricated sequences of words (e.g. in this case, this means that, healthcare service, follow up) in biomedical abstracts and their plain language adaptations. While English academic writing became highly specialized and complex from the late 19th century onwards, recent decades have seen a rising demand for a lay-friendly language in scientific content, especially in the health domain, to bridge a communication gap between experts and laypersons. Based on previous research showing that MWEs are easier to process than non-formulaic word sequences of comparable length, we hypothesize that they can potentially be used to create a more reader-friendly language. Our preliminary results suggest some significant differences between complex and plain abstracts when it comes to the usage patterns and informational load of MWEs.

@inproceedings{bagdasarov-teich-2024-multi,
  title     = {Multi-word expressions in biomedical abstracts and their plain {English} adaptations},
  author    = {Bagdasarov, Sergei and Teich, Elke},
  editor    = {H{\"a}m{\"a}l{\"a}inen, Mika and {\"O}hman, Emily and Miyagawa, So and Alnajjar, Khalid and Bizzoni, Yuri},
  url       = {https://aclanthology.org/2024.nlp4dh-1.46/},
  doi       = {10.18653/v1/2024.nlp4dh-1.46},
  year      = {2024},
  date      = {2024},
  booktitle = {Proceedings of the 4th International Conference on Natural Language Processing for Digital Humanities},
  pages     = {483--488},
  publisher = {Association for Computational Linguistics},
  address   = {Miami, USA},
  abstract  = {This study analyzes the use of multi-word expressions (MWEs), prefabricated sequences of words (e.g. in this case, this means that, healthcare service, follow up) in biomedical abstracts and their plain language adaptations. While English academic writing became highly specialized and complex from the late 19th century onwards, recent decades have seen a rising demand for a lay-friendly language in scientific content, especially in the health domain, to bridge a communication gap between experts and laypersons. Based on previous research showing that MWEs are easier to process than non-formulaic word sequences of comparable length, we hypothesize that they can potentially be used to create a more reader-friendly language. Our preliminary results suggest some significant differences between complex and plain abstracts when it comes to the usage patterns and informational load of MWEs.},
  pubstate  = {published},
  type      = {inproceedings}
}

Copy BibTeX to Clipboard

Project:   B1

Alves, Diego; Degaetano-Ortlieb, Stefania; Schmidt, Elena; Teich, Elke

Diachronic Analysis of Multi-word Expression Functional Categories in Scientific English Inproceedings

Bhatia, Archna; Bouma, Gosse; Dogruoz, A. Seza; Evang, Kilian; Garcia, Marcos; Giouli, Voula; Han, Lifeng; Nivre, Joakim; Rademaker, Alexandre (Ed.): Proceedings of the Joint Workshop on Multiword Expressions and Universal Dependencies (MWE-UD) @ LREC-COLING 2024, ELRA and ICCL, pp. 81-87, Torino, Italia, 2024.

We present a diachronic analysis of multi-word expressions (MWEs) in English based on the Royal Society Corpus, a dataset containing 300+ years of the scientific publications of the Royal Society of London. Specifically, we investigate the functions of MWEs, such as stance markers (“is is interesting”) or discourse organizers (“in this section”), and their development over time. Our approach is multi-disciplinary: to detect MWEs we use Universal Dependencies, to classify them functionally we use an approach from register linguistics, and to assess their role in diachronic development we use an information-theoretic measure, relative entropy.

@inproceedings{alves-etal-2024-diachronic,
  title     = {Diachronic Analysis of Multi-word Expression Functional Categories in Scientific {English}},
  author    = {Alves, Diego and Degaetano-Ortlieb, Stefania and Schmidt, Elena and Teich, Elke},
  editor    = {Bhatia, Archna and Bouma, Gosse and Dogruoz, A. Seza and Evang, Kilian and Garcia, Marcos and Giouli, Voula and Han, Lifeng and Nivre, Joakim and Rademaker, Alexandre},
  url       = {https://aclanthology.org/2024.mwe-1.12},
  year      = {2024},
  date      = {2024},
  booktitle = {Proceedings of the Joint Workshop on Multiword Expressions and Universal Dependencies (MWE-UD) @ LREC-COLING 2024},
  pages     = {81--87},
  publisher = {ELRA and ICCL},
  address   = {Torino, Italia},
  abstract  = {We present a diachronic analysis of multi-word expressions (MWEs) in English based on the Royal Society Corpus, a dataset containing 300+ years of the scientific publications of the Royal Society of London. Specifically, we investigate the functions of MWEs, such as stance markers (``is is interesting'') or discourse organizers (``in this section''), and their development over time. Our approach is multi-disciplinary: to detect MWEs we use Universal Dependencies, to classify them functionally we use an approach from register linguistics, and to assess their role in diachronic development we use an information-theoretic measure, relative entropy.},
  pubstate  = {published},
  type      = {inproceedings}
}

Copy BibTeX to Clipboard

Project:   B1

Bagdasarov, Sergei; Degaetano-Ortlieb, Stefania

Applying Information-theoretic Notions to Measure Effects of the Plain English Movement on English Law Reports and Scientific Articles Inproceedings

Bizzoni, Yuri; Degaetano-Ortlieb, Stefania; Kazantseva, Anna; Szpakowicz, Stan (Ed.): Proceedings of the 8th Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature (LaTeCH-CLfL 2024), Association for Computational Linguistics, pp. 101-110, St. Julians, Malta, 2024.

We investigate the impact of the Plain English Movement (PEM) on the complexity of legal language in UK law reports from the 1950s-2010s, contrasting it with the evolution of scientific language. The PEM, emerging in the late 20th century, advocated for clear and understandable legal language. We define complexity through the concept of surprisal – an information-theoretic measure correlating with cognitive processing difficulty. Our research contrasts surprisal with traditional readability measures, which often overlook content. We hypothesize that, if the PEM has influenced legal language, there would be a reduction in complexity over time and a shift from a nominal to a more verbal style. We analyze text complexity and lexico-grammatical changes in line with PEM recommendations. Results indicate minimal impact of the PEM on both legal and scientific domains. This finding suggests future research should consider processing effort when advocating for linguistic norms to enhance accessibility.

@inproceedings{bagdasarov-degaetano-ortlieb-2024-applying,
  title     = {Applying Information-theoretic Notions to Measure Effects of the {Plain English Movement} on {English} Law Reports and Scientific Articles},
  author    = {Bagdasarov, Sergei and Degaetano-Ortlieb, Stefania},
  editor    = {Bizzoni, Yuri and Degaetano-Ortlieb, Stefania and Kazantseva, Anna and Szpakowicz, Stan},
  url       = {https://aclanthology.org/2024.latechclfl-1.11},
  year      = {2024},
  date      = {2024},
  booktitle = {Proceedings of the 8th Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature (LaTeCH-CLfL 2024)},
  pages     = {101--110},
  publisher = {Association for Computational Linguistics},
  address   = {St. Julians, Malta},
  abstract  = {We investigate the impact of the Plain English Movement (PEM) on the complexity of legal language in UK law reports from the 1950s--2010s, contrasting it with the evolution of scientific language. The PEM, emerging in the late 20th century, advocated for clear and understandable legal language. We define complexity through the concept of surprisal -- an information-theoretic measure correlating with cognitive processing difficulty. Our research contrasts surprisal with traditional readability measures, which often overlook content. We hypothesize that, if the PEM has influenced legal language, there would be a reduction in complexity over time and a shift from a nominal to a more verbal style. We analyze text complexity and lexico-grammatical changes in line with PEM recommendations. Results indicate minimal impact of the PEM on both legal and scientific domains. This finding suggests future research should consider processing effort when advocating for linguistic norms to enhance accessibility.},
  pubstate  = {published},
  type      = {inproceedings}
}

Copy BibTeX to Clipboard

Project:   B1

Alves, Diego; Fischer, Stefan; Degaetano-Ortlieb, Stefania; Teich, Elke

Multi-word Expressions in English Scientific Writing Inproceedings

Bizzoni, Yuri; Degaetano-Ortlieb, Stefania; Kazantseva, Anna; Szpakowicz, Stan (Ed.): Proceedings of the 8th Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature (LaTeCH-CLfL 2024), Association for Computational Linguistics, pp. 67-76, St. Julians, Malta, 2024.

Multi-Word Expressions (MWEs) play a pivotal role in language use overall and in register formation more specifically, e.g. encoding field-specific terminology. Our study focuses on the identification and categorization of MWEs used in scientific writing, considering their formal characteristics as well as their developmental trajectory over time from the mid-17th century to the present. For this, we develop an approach combining three different types of methods to identify MWEs (Universal Dependency annotation, Partitioner and the Academic Formulas List) and selected measures to characterize MWE properties (e.g., dispersion by Kullback-Leibler Divergence and several association measures). This allows us to inspect MWEs types in a novel data-driven way regarding their functions and change over time in specialized discourse.

@inproceedings{alves-etal-2024-multi,
  title     = {Multi-word Expressions in {English} Scientific Writing},
  author    = {Alves, Diego and Fischer, Stefan and Degaetano-Ortlieb, Stefania and Teich, Elke},
  editor    = {Bizzoni, Yuri and Degaetano-Ortlieb, Stefania and Kazantseva, Anna and Szpakowicz, Stan},
  url       = {https://aclanthology.org/2024.latechclfl-1.8},
  year      = {2024},
  date      = {2024},
  booktitle = {Proceedings of the 8th Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature (LaTeCH-CLfL 2024)},
  pages     = {67--76},
  publisher = {Association for Computational Linguistics},
  address   = {St. Julians, Malta},
  abstract  = {Multi-Word Expressions (MWEs) play a pivotal role in language use overall and in register formation more specifically, e.g. encoding field-specific terminology. Our study focuses on the identification and categorization of MWEs used in scientific writing, considering their formal characteristics as well as their developmental trajectory over time from the mid-17th century to the present. For this, we develop an approach combining three different types of methods to identify MWEs (Universal Dependency annotation, Partitioner and the Academic Formulas List) and selected measures to characterize MWE properties (e.g., dispersion by Kullback-Leibler Divergence and several association measures). This allows us to inspect MWEs types in a novel data-driven way regarding their functions and change over time in specialized discourse.},
  pubstate  = {published},
  type      = {inproceedings}
}

Copy BibTeX to Clipboard

Project:   B1

Krielke, Marie-Pauline

Cross-linguistic Dependency Length Minimization in scientific language: Syntactic complexity reduction in English and German in the Late Modern period Journal Article

Languages in Contrast, 24, pp. 133-163, 2024, ISSN 1387-6759.

We use Universal Dependencies (UD) for the study of cross-linguistic diachronic syntactic complexity reduction. Specifically, we look at whether and how scientific English and German minimize the length of syntactic dependency relations in the Late Modern period (ca. 1650–1900). Our linguistic analysis follows the assumption that over time, scientific discourse cross-linguistically develops towards an increasingly efficient syntactic code by minimizing Dependency Length (DL) as a factor of syntactic complexity. For each language, we analyse a large UD-annotated scientific and general language corpus for comparison. While on a macro level, our analysis suggests that there is an overall diachronic cross-linguistic and cross-register reduction in Average Dependency Length (ADL), on the micro level we find that only scientific language shows a sentence length independent reduction of ADL, while general language shows an overall decrease of ADL due to sentence length reduction. We further analyse the syntactic constructions responsible for this reduction in both languages, showing that both scientific English and German increasingly make use of short, intra-phrasal dependency relations while long dependency relations such as clausal embeddings become rather disfavoured over time.

@article{Krielke-2024,
  title    = {Cross-linguistic Dependency Length Minimization in scientific language: Syntactic complexity reduction in {English} and {German} in the {Late Modern} period},
  author   = {Krielke, Marie-Pauline},
  url      = {https://www.jbe-platform.com/content/journals/10.1075/lic.00038.kri},
  doi      = {10.1075/lic.00038.kri},
  year     = {2024},
  date     = {2024},
  journal  = {Languages in Contrast},
  pages    = {133--163},
  volume   = {24},
  number   = {1},
  issn     = {1387-6759},
  abstract = {We use Universal Dependencies (UD) for the study of cross-linguistic diachronic syntactic complexity reduction. Specifically, we look at whether and how scientific English and German minimize the length of syntactic dependency relations in the Late Modern period (ca. 1650--1900). Our linguistic analysis follows the assumption that over time, scientific discourse cross-linguistically develops towards an increasingly efficient syntactic code by minimizing Dependency Length (DL) as a factor of syntactic complexity. For each language, we analyse a large UD-annotated scientific and general language corpus for comparison. While on a macro level, our analysis suggests that there is an overall diachronic cross-linguistic and cross-register reduction in Average Dependency Length (ADL), on the micro level we find that only scientific language shows a sentence length independent reduction of ADL, while general language shows an overall decrease of ADL due to sentence length reduction. We further analyse the syntactic constructions responsible for this reduction in both languages, showing that both scientific English and German increasingly make use of short, intra-phrasal dependency relations while long dependency relations such as clausal embeddings become rather disfavoured over time.},
  pubstate = {published},
  type     = {article}
}

Copy BibTeX to Clipboard

Project:   B1

Krielke, Marie-Pauline

Optimizing scientific communication: the role of relative clauses as markers of complexity in English and German scientific writing between 1650 and 1900 PhD Thesis

Saarland University, Saarbruecken, Germany, 2023.

The aim of this thesis is to show that both scientific English and German have become increasingly optimized for scientific communication from 1650 to 1900 by adapting the usage of relative clauses as markers of grammatical complexity. While the lexico-grammatical changes in terms of features and their frequency distribution in scientific writing during this period are well documented, in the present work we are interested in the underlying factors driving these changes and how they affect efficient scientific communication. As the scientific register emerges and evolves, it continuously adapts to the changing communicative needs posed by extra-linguistic pressures arising from the scientific community and its achievements. We assume that, over time, scientific language maintains communicative efficiency by balancing lexico-semantic expansion with a reduction in (lexico-)grammatical complexity on different linguistic levels. This is based on the idea that linguistic complexity affects processing difficulty and, in turn, communicative efficiency. To achieve optimization, complexity is adjusted on the level of lexico-grammar, which is related to expectation-based processing cost, and syntax, which is linked to working memory-based processing cost. We conduct five corpus-based studies comparing English and German scientific writing to general language. The first two investigate the development of relative clauses in terms of lexico-grammar, measuring the paradigmatic richness and syntagmatic predictability of relativizers as indicators of expectation-based processing cost. The results confirm that both levels undergo a reduction in complexity over time. The other three studies focus on the syntactic complexity of relative clauses, investigating syntactic intricacy, locality, and accessibility. Results show that intricacy and locality decrease, leading to lower grammatical complexity and thus mitigating memory-based processing cost. 
However, accessibility is not a factor of complexity reduction over time. Our studies reveal a register-specific diachronic complexity reduction in scientific language both in lexico-grammar and syntax. The cross-linguistic comparison shows that English is more advanced in its register-specific development while German lags behind due to a later establishment of the vernacular as a language of scientific communication.

@phdthesis{Krielke_Diss_2023,
title = {Optimizing scientific communication: the role of relative clauses as markers of complexity in English and German scientific writing between 1650 and 1900},
author = {Marie-Pauline Krielke},
url = {https://publikationen.sulb.uni-saarland.de/handle/20.500.11880/36825},
doi = {10.22028/D291-40997},
year = {2023},
date = {2023},
school = {Saarland University},
address = {Saarbruecken, Germany},
abstract = {The aim of this thesis is to show that both scientific English and German have become increasingly optimized for scientific communication from 1650 to 1900 by adapting the usage of relative clauses as markers of grammatical complexity. While the lexico-grammatical changes in terms of features and their frequency distribution in scientific writing during this period are well documented, in the present work we are interested in the underlying factors driving these changes and how they affect efficient scientific communication. As the scientific register emerges and evolves, it continuously adapts to the changing communicative needs posed by extra-linguistic pressures arising from the scientific community and its achievements. We assume that, over time, scientific language maintains communicative efficiency by balancing lexico-semantic expansion with a reduction in (lexico-)grammatical complexity on different linguistic levels. This is based on the idea that linguistic complexity affects processing difficulty and, in turn, communicative efficiency. To achieve optimization, complexity is adjusted on the level of lexico-grammar, which is related to expectation-based processing cost, and syntax, which is linked to working memory-based processing cost. We conduct five corpus-based studies comparing English and German scientific writing to general language. The first two investigate the development of relative clauses in terms of lexico-grammar, measuring the paradigmatic richness and syntagmatic predictability of relativizers as indicators of expectation-based processing cost. The results confirm that both levels undergo a reduction in complexity over time. The other three studies focus on the syntactic complexity of relative clauses, investigating syntactic intricacy, locality, and accessibility. Results show that intricacy and locality decrease, leading to lower grammatical complexity and thus mitigating memory-based processing cost. However, accessibility is not a factor of complexity reduction over time. Our studies reveal a register-specific diachronic complexity reduction in scientific language both in lexico-grammar and syntax. The cross-linguistic comparison shows that English is more advanced in its register-specific development while German lags behind due to a later establishment of the vernacular as a language of scientific communication.},
pubstate = {published},
type = {phdthesis}
}

Copy BibTeX to Clipboard

Project:   B1

Hug, Marius; Rau, Felix; Debbeler, Anke; Saleh, Sara; Mollenhauer, Elisabeth; Leinen, Peter; Genêt, Philippe; Trippel, Thorsten; Zinn, Claus; Dogaru, George; Witt, Andreas; Werthmann, Antonina; Draxler, Christoph; Schiel, Florian; Knappen, Jörg; Fischer, Stefan; Krielke, Marie-Pauline; Teich, Elke; Barth, Florian; Calvo Tello, José; Funk, Stefan E.; Göbel, Mathias; Kurzawe, Daniel; Veentjer, Ubbo; Weimer, Lukas; Blätte, Andreas; Lehmberg, Timm

Wohin damit? Storing and reusing my language data: Minute Madness der Datenzentren Miscellaneous

Text+, Zenodo, pp. 1-12, Potsdam, 2023.

Präsentiert beim Workshop „Wohin damit? Storing and reusing my language data“ am 22. Juni 2023 in Mannheim. Die Präsentation wurde im Kontext der Arbeit des Vereins Nationale Forschungsdateninfrastruktur (NFDI) e.V. gehalten.

@misc{HugRauDebbeleretal.2023,
title = {Wohin damit? Storing and reusing my language data: Minute Madness der Datenzentren},
author = {Marius Hug and Felix Rau and Anke Debbeler and Sara Saleh and Elisabeth Mollenhauer and Peter Leinen and Philippe Gen{\^e}t and Thorsten Trippel and Claus Zinn and George Dogaru and Andreas Witt and Antonina Werthmann and Christoph Draxler and Florian Schiel and J{\"o}rg Knappen and Stefan Fischer and Marie-Pauline Krielke and Elke Teich and Florian Barth and Jos{\'e} Calvo Tello and Stefan E. Funk and Mathias G{\"o}bel and Daniel Kurzawe and Ubbo Veentjer and Lukas Weimer and Andreas Bl{\"a}tte and Timm Lehmberg},
url = {https://nbn-resolving.org/urn:nbn:de:bsz:mh39-121108},
doi = {10.5281/zenodo.8123896},
year = {2023},
date = {2023},
booktitle = {Text+},
pages = {1--12},
publisher = {Zenodo},
address = {Potsdam},
abstract = {Pr{\"a}sentiert beim Workshop "Wohin damit? Storing and reusing my language data" am 22. Juni 2023 in Mannheim. Die Pr{\"a}sentation wurde im Kontext der Arbeit des Vereins Nationale Forschungsdateninfrastruktur (NFDI) e.V. gehalten.},
pubstate = {published},
type = {miscellaneous}
}

Copy BibTeX to Clipboard

Project:   B1

Fischer, Stefan; Fankhauser, Peter; Teich, Elke

Multi-word expressions and language efficiency: an information-theoretic account Miscellaneous

DGfS Computerlinguistik Postersession, Köln, 2023.

Multi-word expressions (MWEs) are a cornerstone in conventionalized language use and vital for the perceived fluency of a message (Fillmore 1979). From a processing perspective, MWEs seem to have an advantage over arbitrary word sequences due to highly predictable transitions from one word to the next, or they may be perceived as wholes (see e.g. Siyanova-Chanturia et al. 2017). The emergence and use of specific MWEs is typically context-dependent and register-specific. In our work, we investigate MWEs in the scientific domain from a diachronic perspective, asking what is the contribution of MWEs in the development of “scientific language” (here: English)? We assume that over time scientific English develops an optimal code for scientific expert communication characterized by high information density (Halliday 2004; Teich et al. 2021). Using a large diachronic corpus of English scientific texts (Fischer et al. 2020), we work in a data-driven fashion using various established word association measures (e.g. log-likelihood, PMI) to identify and classify MWEs by time periods (e.g. 50-year periods). In a complementary step, we account for the environments of words using selected computational language models (statistical models, embeddings; cf. Fankhauser & Kupietz 2022). On this basis, we then analyse the informational characteristics of MWEs diachronically: The more conventionalized an MWE becomes, the lower its surprisal (higher predictability of the MWE) and the lower the uncertainty about an upcoming word within the MWE (entropy). We expect to see that while specific MWEs come and go over time, during their life cycles they will exhibit surprisal/entropy reduction, thus contributing to language efficiency.

@misc{Fischer_etal_2024,
title = {Multi-word expressions and language efficiency: an information-theoretic account},
author = {Stefan Fischer and Peter Fankhauser and Elke Teich},
url = {https://dgfs2023.uni-koeln.de/sites/dgfs2023/Booklet/AG_Beschreibungen-und-Abstracts/Description-Abstracts-CL.pdf},
year = {2023},
date = {2023},
booktitle = {DGfS Computerlinguistik Postersession},
address = {K{\"o}ln},
abstract = {Multi-word expressions (MWEs) are a cornerstone in conventionalized language use and vital for the perceived fluency of a message (Fillmore 1979). From a processing perspective, MWEs seem to have an advantage over arbitrary word sequences due to highly predictable transitions from one word to the next, or they may be perceived as wholes (see e.g. Siyanova-Chanturia et al. 2017). The emergence and use of specific MWEs is typically context-dependent and register-specific. In our work, we investigate MWEs in the scientific domain from a diachronic perspective, asking what is the contribution of MWEs in the development of ``scientific language'' (here: English)? We assume that over time scientific English develops an optimal code for scientific expert communication characterized by high information density (Halliday 2004; Teich et al. 2021). Using a large diachronic corpus of English scientific texts (Fischer et al. 2020), we work in a data-driven fashion using various established word association measures (e.g. log-likelihood, PMI) to identify and classify MWEs by time periods (e.g. 50-year periods). In a complementary step, we account for the environments of words using selected computational language models (statistical models, embeddings; cf. Fankhauser \& Kupietz 2022). On this basis, we then analyse the informational characteristics of MWEs diachronically: The more conventionalized an MWE becomes, the lower its surprisal (higher predictability of the MWE) and the lower the uncertainty about an upcoming word within the MWE (entropy). We expect to see that while specific MWEs come and go over time, during their life cycles they will exhibit surprisal/entropy reduction, thus contributing to language efficiency.},
pubstate = {published},
type = {miscellaneous}
}

Copy BibTeX to Clipboard

Project:   B1

Menzel, Katrin; Krielke, Marie-Pauline; Degaetano-Ortlieb, Stefania

Synthetic and analytic adjective negation in English scientific journal articles: A diachronic perspective Journal Article

LEGE ARTIS: Language yesterday, today, tomorrow, VII, Trnava: University of SS Cyril and Methodius in Trnava, pp. 157-213, 2022, ISSN 2453-8035.

This paper addresses the development of synthetic and analytic adjective negation in a corpus of English scientific articles from the mid-17th century towards the end of the 20th century. Analytic patterns of adjective negation are found to become less frequent in the language of scientific articles, but more conventionalised in their textual contexts. Conversely, prefixed negated adjectives are identified as more frequent and more diverse with regard to their contexts.

@article{menzel_2022_diachronicperspective,
title = {Synthetic and analytic adjective negation in English scientific journal articles: A diachronic perspective},
author = {Katrin Menzel and Marie-Pauline Krielke and Stefania Degaetano-Ortlieb},
url = {https://www.researchgate.net/publication/361099180_Synthetic_and_analytic_adjective_negation_in_English_scientific_journal_articles_A_diachronic_perspective},
year = {2022},
date = {2022},
journal = {LEGE ARTIS: Language yesterday, today, tomorrow},
issn = {2453-8035},
pages = {157--213},
publisher = {Trnava: University of SS Cyril and Methodius in Trnava},
volume = {VII},
number = {1},
abstract = {This paper addresses the development of synthetic and analytic adjective negation in a corpus of English scientific articles from the mid-17th century towards the end of the 20th century. Analytic patterns of adjective negation are found to become less frequent in the language of scientific articles, but more conventionalised in their textual contexts. Conversely, prefixed negated adjectives are identified as more frequent and more diverse with regard to their contexts.},
pubstate = {published},
type = {article}
}

Copy BibTeX to Clipboard

Project:   B1

Krielke, Marie-Pauline; Talamo, Luigi; Fawzi, M.; Knappen, J.

Tracing Syntactic Change in the Scientific Genre: Two Universal Dependency-parsed Diachronic Corpora of Scientific English and German Inproceedings

LREC 2022, Marseille, France, 2022.

We present two comparable diachronic corpora of scientific English and German from the Late Modern Period (17th c.–19th c.) annotated with Universal Dependencies. We describe several steps of data pre-processing and evaluate the resulting parsing accuracy showing how our pre-processing steps significantly improve output quality. As a sanity check for the representativity of our data, we conduct a case study comparing previously gained insights on grammatical change in the scientific genre with our data. Our results reflect the often reported trend of English scientific discourse towards heavy noun phrases and a simplification of the sentence structure (Halliday, 1988; Halliday and Martin, 1993; Biber and Gray, 2011; Biber and Gray, 2016). We also show that this trend applies to German scientific discourse as well. The presented corpora are valuable resources suitable for the contrastive analysis of syntactic diachronic change in the scientific genre between 1650 and 1900. The presented pre-processing procedures and their evaluations are applicable to other languages and can be useful for a variety of Natural Language Processing tasks such as syntactic parsing.

@inproceedings{krielke-etal-2022-tracing,
title = {Tracing Syntactic Change in the Scientific Genre: Two Universal Dependency-parsed Diachronic Corpora of Scientific English and German},
author = {Marie-Pauline Krielke and Luigi Talamo and M. Fawzi and J. Knappen},
url = {https://aclanthology.org/2022.lrec-1.514/},
year = {2022},
date = {2022},
publisher = {LREC 2022},
address = {Marseille, France},
abstract = {We present two comparable diachronic corpora of scientific English and German from the Late Modern Period (17th c.--19th c.) annotated with Universal Dependencies. We describe several steps of data pre-processing and evaluate the resulting parsing accuracy showing how our pre-processing steps significantly improve output quality. As a sanity check for the representativity of our data, we conduct a case study comparing previously gained insights on grammatical change in the scientific genre with our data. Our results reflect the often reported trend of English scientific discourse towards heavy noun phrases and a simplification of the sentence structure (Halliday, 1988; Halliday and Martin, 1993; Biber and Gray, 2011; Biber and Gray, 2016). We also show that this trend applies to German scientific discourse as well. The presented corpora are valuable resources suitable for the contrastive analysis of syntactic diachronic change in the scientific genre between 1650 and 1900. The presented pre-processing procedures and their evaluations are applicable to other languages and can be useful for a variety of Natural Language Processing tasks such as syntactic parsing.},
pubstate = {published},
type = {inproceedings}
}

Copy BibTeX to Clipboard

Project:   B1

Menzel, Katrin

Medical discourse in Late Modern English: Insights from the Royal Society Corpus. Book Chapter

Hiltunen, Turo; Taavitsainen, Irma (Ed.): Corpus pragmatic studies on the history of medical discourse (Pragmatics & Beyond New Series; Vol. 330), John Benjamins, pp. 79-104, Amsterdam, 2022.

This chapter demonstrates how the Royal Society Corpus, a richly annotated corpus of around 48,000 English scientific journal articles covering more than 330 years, can be used for lexico-grammatical and pragmatic studies that contribute to a broader understanding of the development of medical research articles. The Late Modern English period together with several decades before and after this time frame was a productive period in the medical output of the Royal Society. This chapter addresses typical linguistic features of scientific journal articles from medical and related sciences from this period demonstrating their special status in the context of other traditional and emerging disciplines in the corpus data. Additionally, language usage and text-type conventions of historical medical research articles will be compared to the features of corpus texts on medical topics from Present-day English.

@inbook{MedicalDiscourse22,
title = {Medical discourse in Late Modern English: Insights from the Royal Society Corpus},
author = {Katrin Menzel},
editor = {Turo Hiltunen and Irma Taavitsainen},
url = {https://benjamins.com/catalog/pbns.330},
year = {2022},
date = {2022},
booktitle = {Corpus pragmatic studies on the history of medical discourse (Pragmatics \& Beyond New Series; Vol. 330)},
pages = {79--104},
publisher = {John Benjamins},
address = {Amsterdam},
abstract = {This chapter demonstrates how the Royal Society Corpus, a richly annotated corpus of around 48,000 English scientific journal articles covering more than 330 years, can be used for lexico-grammatical and pragmatic studies that contribute to a broader understanding of the development of medical research articles. The Late Modern English period together with several decades before and after this time frame was a productive period in the medical output of the Royal Society. This chapter addresses typical linguistic features of scientific journal articles from medical and related sciences from this period demonstrating their special status in the context of other traditional and emerging disciplines in the corpus data. Additionally, language usage and text-type conventions of historical medical research articles will be compared to the features of corpus texts on medical topics from Present-day English.},
pubstate = {published},
type = {inbook}
}

Copy BibTeX to Clipboard

Project:   B1

Degaetano-Ortlieb, Stefania

Measuring informativity: The rise of compounds as informationally dense structures in 20th century Scientific English Book Chapter

Soave, Elena; Biber, Douglas (Ed.): Corpus Approaches to Register Variation, Studies in Corpus Linguistics, 103, John Benjamins Publishing Company, pp. 291-312, 2021.

By applying data-driven methods based on information theory, this study adds to previous work on the development of the scientific register by measuring the informativity of alternative phrasal structures shown to be involved in change in language use in 20th-century Scientific English. The analysis based on data-driven periodization shows compounds to be distinctive grammatical structures from the 1920s onwards in Proceedings A of the Royal Society of London. Compounds not only increase in frequency, but also show higher informativity than their less dense prepositional counterparts. Results also show that the lower the informativity of particular items, the more alternative, more informationally dense options might be favoured (e.g., of-phrases vs. compounds) – striving for communicative efficiency thus being one force shaping the scientific register.

@inbook{Degaetano-Ortlieb2021,
title = {Measuring informativity: The rise of compounds as informationally dense structures in 20th century Scientific English},
author = {Stefania Degaetano-Ortlieb},
editor = {Elena Soave and Douglas Biber},
url = {https://benjamins.com/catalog/scl.103.11deg},
doi = {10.1075/scl.103.11deg},
year = {2021},
date = {2021},
booktitle = {Corpus Approaches to Register Variation},
series = {Studies in Corpus Linguistics},
volume = {103},
pages = {291--312},
publisher = {John Benjamins Publishing Company},
abstract = {By applying data-driven methods based on information theory, this study adds to previous work on the development of the scientific register by measuring the informativity of alternative phrasal structures shown to be involved in change in language use in 20th-century Scientific English. The analysis based on data-driven periodization shows compounds to be distinctive grammatical structures from the 1920s onwards in Proceedings A of the Royal Society of London. Compounds not only increase in frequency, but also show higher informativity than their less dense prepositional counterparts. Results also show that the lower the informativity of particular items, the more alternative, more informationally dense options might be favoured (e.g., of-phrases vs. compounds) -- striving for communicative efficiency thus being one force shaping the scientific register.},
pubstate = {published},
type = {inbook}
}

Copy BibTeX to Clipboard

Project:   B1

Bizzoni, Yuri; Degaetano-Ortlieb, Stefania; Menzel, Katrin; Teich, Elke

The diffusion of scientific terms - tracing individuals' influence in the history of science for English Inproceedings

Proceedings of the 5th Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature, Association for Computational Linguistics, pp. 120-127, Punta Cana, Dominican Republic (online), 2021.

Tracing the influence of individuals or groups in social networks is an increasingly popular task in sociolinguistic studies. While methods to determine someone’s influence in short-term contexts (e.g., social media, on-line political debates) are widespread, influence in long-term contexts is less investigated and may be harder to capture. We study the diffusion of scientific terms in an English diachronic scientific corpus, applying Hawkes Processes to capture the role of individual scientists as “influencers” or “influencees” in the diffusion of new concepts. Our findings on two major scientific discoveries in chemistry and astronomy of the 18th century reveal that modelling both the introduction and diffusion of scientific terms in a historical corpus as Hawkes Processes allows detecting patterns of influence between authors on a long-term scale.

@inproceedings{bizzoni-etal-2021-diffusion,
title = {The diffusion of scientific terms - tracing individuals' influence in the history of science for English},
author = {Yuri Bizzoni and Stefania Degaetano-Ortlieb and Katrin Menzel and Elke Teich},
url = {https://aclanthology.org/2021.latechclfl-1.14},
doi = {10.18653/v1/2021.latechclfl-1.14},
year = {2021},
date = {2021-11-30},
booktitle = {Proceedings of the 5th Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature},
pages = {120--127},
publisher = {Association for Computational Linguistics},
address = {Punta Cana, Dominican Republic (online)},
abstract = {Tracing the influence of individuals or groups in social networks is an increasingly popular task in sociolinguistic studies. While methods to determine someone's influence in short-term contexts (e.g., social media, on-line political debates) are widespread, influence in long-term contexts is less investigated and may be harder to capture. We study the diffusion of scientific terms in an English diachronic scientific corpus, applying Hawkes Processes to capture the role of individual scientists as "influencers" or "influencees" in the diffusion of new concepts. Our findings on two major scientific discoveries in chemistry and astronomy of the 18th century reveal that modelling both the introduction and diffusion of scientific terms in a historical corpus as Hawkes Processes allows detecting patterns of influence between authors on a long-term scale.},
pubstate = {published},
type = {inproceedings}
}

Copy BibTeX to Clipboard

Project:   B1

Menzel, Katrin; Krielke, Marie-Pauline; Degaetano-Ortlieb, Stefania

Structural complexity in scientific journal articles across time - from negative clausal expressions towards adjectival negative prefixes Inproceedings

Workshop on Complexity and Register (CAR21), Berlin, Germany, CRC1412 Register, 2021.

@inproceedings{Menzel-etal2021,
  author    = {Katrin Menzel and Marie-Pauline Krielke and Stefania Degaetano-Ortlieb},
  title     = {Structural complexity in scientific journal articles across time - from negative clausal expressions towards adjectival negative prefixes},
  booktitle = {Workshop on Complexity and Register (CAR21)},
  address   = {Berlin, Germany, CRC1412 Register},
  year      = {2021},
  date      = {2021-11-19},
  pubstate  = {published},
  type      = {inproceedings},
}

Copy BibTeX to Clipboard

Project:   B1

Menzel, Katrin

Scientific Eponyms throughout the History of English Scholarly Journal Articles Book Chapter

Van de Velde, Hans; Dolezal, Fredric T. (Ed.): Broadening Perspectives in the History of Dictionaries and Word Studies, Cambridge Scholars Publishing, pp. 159-193, Newcastle upon Tyne, 2021, ISBN 1-5275-7432-6.

@inbook{Menzel2021_eponyms,
title = {Scientific Eponyms throughout the History of English Scholarly Journal Articles},
author = {Katrin Menzel},
editor = {{Van de Velde}, Hans and Dolezal, Fredric T.},
url = {https://www.cambridgescholars.com/product/978-1-5275-7432-8},
year = {2021},
date = {2021-11-08},
booktitle = {Broadening Perspectives in the History of Dictionaries and Word Studies},
isbn = {1-5275-7432-6},
pages = {159--193},
publisher = {Cambridge Scholars Publishing},
address = {Newcastle upon Tyne},
pubstate = {published},
type = {inbook}
}

Copy BibTeX to Clipboard

Project:   B1

Successfully