Publications

Krielke, Marie-Pauline

Optimizing scientific communication: the role of relative clauses as markers of complexity in English and German scientific writing between 1650 and 1900 PhD Thesis

Saarland University, Saarbruecken, Germany, 2023.

The aim of this thesis is to show that both scientific English and German have become increasingly optimized for scientific communication from 1650 to 1900 by adapting the usage of relative clauses as markers of grammatical complexity. While the lexico-grammatical changes in terms of features and their frequency distribution in scientific writing during this period are well documented, in the present work we are interested in the underlying factors driving these changes and how they affect efficient scientific communication. As the scientific register emerges and evolves, it continuously adapts to the changing communicative needs posed by extra-linguistic pressures arising from the scientific community and its achievements. We assume that, over time, scientific language maintains communicative efficiency by balancing lexico-semantic expansion with a reduction in (lexico-)grammatical complexity on different linguistic levels. This is based on the idea that linguistic complexity affects processing difficulty and, in turn, communicative efficiency. To achieve optimization, complexity is adjusted on the level of lexico-grammar, which is related to expectation-based processing cost, and syntax, which is linked to working memory-based processing cost. We conduct five corpus-based studies comparing English and German scientific writing to general language. The first two investigate the development of relative clauses in terms of lexico-grammar, measuring the paradigmatic richness and syntagmatic predictability of relativizers as indicators of expectation-based processing cost. The results confirm that both levels undergo a reduction in complexity over time. The other three studies focus on the syntactic complexity of relative clauses, investigating syntactic intricacy, locality, and accessibility. Results show that intricacy and locality decrease, leading to lower grammatical complexity and thus mitigating memory-based processing cost. 
However, accessibility is not a factor of complexity reduction over time. Our studies reveal a register-specific diachronic complexity reduction in scientific language both in lexico-grammar and syntax. The cross-linguistic comparison shows that English is more advanced in its register-specific development while German lags behind due to a later establishment of the vernacular as a language of scientific communication.

@phdthesis{Krielke_Diss_2023,
title = {Optimizing scientific communication: the role of relative clauses as markers of complexity in English and German scientific writing between 1650 and 1900},
author = {Marie-Pauline Krielke},
url = {https://publikationen.sulb.uni-saarland.de/handle/20.500.11880/36825},
doi = {10.22028/D291-40997},
year = {2023},
date = {2023},
school = {Saarland University},
address = {Saarbruecken, Germany},
abstract = {The aim of this thesis is to show that both scientific English and German have become increasingly optimized for scientific communication from 1650 to 1900 by adapting the usage of relative clauses as markers of grammatical complexity. While the lexico-grammatical changes in terms of features and their frequency distribution in scientific writing during this period are well documented, in the present work we are interested in the underlying factors driving these changes and how they affect efficient scientific communication. As the scientific register emerges and evolves, it continuously adapts to the changing communicative needs posed by extra-linguistic pressures arising from the scientific community and its achievements. We assume that, over time, scientific language maintains communicative efficiency by balancing lexico-semantic expansion with a reduction in (lexico-)grammatical complexity on different linguistic levels. This is based on the idea that linguistic complexity affects processing difficulty and, in turn, communicative efficiency. To achieve optimization, complexity is adjusted on the level of lexico-grammar, which is related to expectation-based processing cost, and syntax, which is linked to working memory-based processing cost. We conduct five corpus-based studies comparing English and German scientific writing to general language. The first two investigate the development of relative clauses in terms of lexico-grammar, measuring the paradigmatic richness and syntagmatic predictability of relativizers as indicators of expectation-based processing cost. The results confirm that both levels undergo a reduction in complexity over time. The other three studies focus on the syntactic complexity of relative clauses, investigating syntactic intricacy, locality, and accessibility. Results show that intricacy and locality decrease, leading to lower grammatical complexity and thus mitigating memory-based processing cost. 
However, accessibility is not a factor of complexity reduction over time. Our studies reveal a register-specific diachronic complexity reduction in scientific language both in lexico-grammar and syntax. The cross-linguistic comparison shows that English is more advanced in its register-specific development while German lags behind due to a later establishment of the vernacular as a language of scientific communication.},
pubstate = {published},
type = {phdthesis}
}

Copy BibTeX to Clipboard

Project:   B1

Menzel, Katrin

Translated texts in the Philosophical Transactions and Proceedings of the Royal Society from the 17th to the 20th century Miscellaneous

Online-Festschrift for Elke Teich on the Occasion of her 60th birthday, Saarland University, 2023.

From antiquity to the modern era, translations have contributed to the production and circulation of scientific knowledge. From the time when the first English scientific journals began to appear in the mid-17th century, English translations of scientific texts by non-English-speaking authors began to appear in these journals. This paper highlights some aspects with regard to translations of scientific texts published in journals from the Royal Society of London between the 17th and the 20th century. The dataset used for the case study is the Royal Society Corpus (RSC) 6.0 / 7.0. Translations for the Royal Society journals were often produced by Fellows or secretaries of the Royal Society. Most translated articles found in the corpus were published in the Philosophical Transactions during the 18th century as translations from French. Overall, the translations in the RSC are characterised by domesticating practices.

@misc{Festschrift_Teich_Menzel,
title = {Translated texts in the Philosophical Transactions and Proceedings of the Royal Society from the 17th to the 20th century},
author = {Katrin Menzel},
url = {https://www.uni-saarland.de/fileadmin/upload/lehrstuhl/teich/degaetano/Festschrift_Teich_Menzel.pdf},
year = {2023},
date = {2023},
howpublished = {Online-Festschrift for Elke Teich on the Occasion of her 60th birthday},
booktitle = {Online-Festschrift for Elke Teich on the Occasion of her 60th birthday},
address = {Saarland University},
abstract = {From antiquity to the modern era, translations have contributed to the production and circulation of scientific knowledge. From the time when the first English scientific journals began to appear in the mid-17th century, English translations of scientific texts by non-English-speaking authors began to appear in these journals. This paper highlights some aspects with regard to translations of scientific texts published in journals from the Royal Society of London between the 17th and the 20th century. The dataset used for the case study is the Royal Society Corpus (RSC) 6.0 / 7.0. Translations for the Royal Society journals were often produced by Fellows or secretaries of the Royal Society. Most translated articles found in the corpus were published in the Philosophical Transactions during the 18th century as translations from French. Overall, the translations in the RSC are characterised by domesticating practices.},
pubstate = {published},
type = {miscellaneous}
}

Copy BibTeX to Clipboard

Project:   B1

Menzel, Katrin; Krielke, Marie-Pauline; Degaetano-Ortlieb, Stefania

Synthetic and analytic adjective negation in English scientific journal articles: A diachronic perspective Journal Article

LEGE ARTIS: Language yesterday, today, tomorrow, VII, Trnava: University of SS Cyril and Methodius in Trnava, pp. 157-213, 2022, ISSN 2453-8035.

This paper addresses the development of synthetic and analytic adjective negation in a corpus of English scientific articles from the mid-17th century towards the end of the 20th century. Analytic patterns of adjective negation are found to become less frequent in the language of scientific articles, but more conventionalised in their textual contexts. Conversely, prefixed negated adjectives are identified as more frequent and more diverse with regard to their contexts.

@article{menzel_2022_diachronicperspective,
title = {Synthetic and analytic adjective negation in English scientific journal articles: A diachronic perspective},
author = {Katrin Menzel and Marie-Pauline Krielke and Stefania Degaetano-Ortlieb},
url = {https://www.researchgate.net/publication/361099180_Synthetic_and_analytic_adjective_negation_in_English_scientific_journal_articles_A_diachronic_perspective},
year = {2022},
date = {2022},
journal = {LEGE ARTIS: Language yesterday, today, tomorrow},
pages = {157--213},
publisher = {University of SS Cyril and Methodius in Trnava},
address = {Trnava},
volume = {VII},
number = {1},
issn = {2453-8035},
abstract = {This paper addresses the development of synthetic and analytic adjective negation in a corpus of English scientific articles from the mid-17th century towards the end of the 20th century. Analytic patterns of adjective negation are found to become less frequent in the language of scientific articles, but more conventionalised in their textual contexts. Conversely, prefixed negated adjectives are identified as more frequent and more diverse with regard to their contexts.},
pubstate = {published},
type = {article}
}

Copy BibTeX to Clipboard

Project:   B1

Krielke, Marie-Pauline; Talamo, Luigi; Fawzi, M.; Knappen, J.

Tracing Syntactic Change in the Scientific Genre: Two Universal Dependency-parsed Diachronic Corpora of Scientific English and German Inproceedings

LREC 2022, Marseille, France, 2022.

We present two comparable diachronic corpora of scientific English and German from the Late Modern Period (17th c.–19th c.) annotated with Universal Dependencies. We describe several steps of data pre-processing and evaluate the resulting parsing accuracy showing how our pre-processing steps significantly improve output quality. As a sanity check for the representativity of our data, we conduct a case study comparing previously gained insights on grammatical change in the scientific genre with our data. Our results reflect the often reported trend of English scientific discourse towards heavy noun phrases and a simplification of the sentence structure (Halliday, 1988; Halliday and Martin, 1993; Biber and Gray, 2011; Biber and Gray, 2016). We also show that this trend applies to German scientific discourse as well. The presented corpora are valuable resources suitable for the contrastive analysis of syntactic diachronic change in the scientific genre between 1650 and 1900. The presented pre-processing procedures and their evaluations are applicable to other languages and can be useful for a variety of Natural Language Processing tasks such as syntactic parsing.

@inproceedings{krielke-etal-2022-tracing,
title = {Tracing Syntactic Change in the Scientific Genre: Two Universal Dependency-parsed Diachronic Corpora of Scientific English and German},
author = {Marie-Pauline Krielke and Luigi Talamo and M. Fawzi and J. Knappen},
url = {https://aclanthology.org/2022.lrec-1.514/},
year = {2022},
date = {2022},
booktitle = {LREC 2022},
address = {Marseille, France},
abstract = {We present two comparable diachronic corpora of scientific English and German from the Late Modern Period (17th c.–19th c.) annotated with Universal Dependencies. We describe several steps of data pre-processing and evaluate the resulting parsing accuracy showing how our pre-processing steps significantly improve output quality. As a sanity check for the representativity of our data, we conduct a case study comparing previously gained insights on grammatical change in the scientific genre with our data. Our results reflect the often reported trend of English scientific discourse towards heavy noun phrases and a simplification of the sentence structure (Halliday, 1988; Halliday and Martin, 1993; Biber and Gray, 2011; Biber and Gray, 2016). We also show that this trend applies to German scientific discourse as well. The presented corpora are valuable resources suitable for the contrastive analysis of syntactic diachronic change in the scientific genre between 1650 and 1900. The presented pre-processing procedures and their evaluations are applicable to other languages and can be useful for a variety of Natural Language Processing tasks such as syntactic parsing.},
pubstate = {published},
type = {inproceedings}
}

Copy BibTeX to Clipboard

Project:   B1

Menzel, Katrin

Medical discourse in Late Modern English: Insights from the Royal Society Corpus. Book Chapter

Hiltunen, Turo; Taavitsainen, Irma (Ed.): Corpus pragmatic studies on the history of medical discourse (Pragmatics & Beyond New Series; Vol. 330), John Benjamins, pp. 79-104, Amsterdam, 2022.

This chapter demonstrates how the Royal Society Corpus, a richly annotated corpus of around 48,000 English scientific journal articles covering more than 330 years, can be used for lexico-grammatical and pragmatic studies that contribute to a broader understanding of the development of medical research articles. The Late Modern English period together with several decades before and after this time frame was a productive period in the medical output of the Royal Society. This chapter addresses typical linguistic features of scientific journal articles from medical and related sciences from this period demonstrating their special status in the context of other traditional and emerging disciplines in the corpus data. Additionally, language usage and text-type conventions of historical medical research articles will be compared to the features of corpus texts on medical topics from Present-day English.

@inbook{MedicalDiscourse22,
title = {Medical discourse in Late Modern English: Insights from the Royal Society Corpus},
author = {Katrin Menzel},
editor = {Turo Hiltunen and Irma Taavitsainen},
url = {https://benjamins.com/catalog/pbns.330},
year = {2022},
date = {2022},
booktitle = {Corpus pragmatic studies on the history of medical discourse},
series = {Pragmatics \& Beyond New Series},
volume = {330},
pages = {79--104},
publisher = {John Benjamins},
address = {Amsterdam},
abstract = {This chapter demonstrates how the Royal Society Corpus, a richly annotated corpus of around 48,000 English scientific journal articles covering more than 330 years, can be used for lexico-grammatical and pragmatic studies that contribute to a broader understanding of the development of medical research articles. The Late Modern English period together with several decades before and after this time frame was a productive period in the medical output of the Royal Society. This chapter addresses typical linguistic features of scientific journal articles from medical and related sciences from this period demonstrating their special status in the context of other traditional and emerging disciplines in the corpus data. Additionally, language usage and text-type conventions of historical medical research articles will be compared to the features of corpus texts on medical topics from Present-day English.},
pubstate = {published},
type = {inbook}
}

Copy BibTeX to Clipboard

Project:   B1

Degaetano-Ortlieb, Stefania

Measuring informativity: The rise of compounds as informationally dense structures in 20th century Scientific English Book Chapter

Soave, Elena; Biber, Douglas (Ed.): Corpus Approaches to Register Variation, Studies in Corpus Linguistics, 103, John Benjamins Publishing Company, pp. 291-312, 2021.

By applying data-driven methods based on information theory, this study adds to previous work on the development of the scientific register by measuring the informativity of alternative phrasal structures shown to be involved in change in language use in 20th-century Scientific English. The analysis based on data-driven periodization shows compounds to be distinctive grammatical structures from the 1920s onwards in Proceedings A of the Royal Society of London. Compounds not only increase in frequency, but also show higher informativity than their less dense prepositional counterparts. Results also show that the lower the informativity of particular items, the more alternative, more informationally dense options might be favoured (e.g., of-phrases vs. compounds) – striving for communicative efficiency thus being one force shaping the scientific register.

@inbook{Degaetano-Ortlieb2021,
title = {Measuring informativity: The rise of compounds as informationally dense structures in 20th century Scientific English},
author = {Stefania Degaetano-Ortlieb},
editor = {Elena Soave and Douglas Biber},
url = {https://benjamins.com/catalog/scl.103.11deg},
doi = {10.1075/scl.103.11deg},
year = {2021},
date = {2021},
booktitle = {Corpus Approaches to Register Variation},
series = {Studies in Corpus Linguistics},
volume = {103},
pages = {291--312},
publisher = {John Benjamins Publishing Company},
abstract = {By applying data-driven methods based on information theory, this study adds to previous work on the development of the scientific register by measuring the informativity of alternative phrasal structures shown to be involved in change in language use in 20th-century Scientific English. The analysis based on data-driven periodization shows compounds to be distinctive grammatical structures from the 1920s onwards in Proceedings A of the Royal Society of London. Compounds not only increase in frequency, but also show higher informativity than their less dense prepositional counterparts. Results also show that the lower the informativity of particular items, the more alternative, more informationally dense options might be favoured (e.g., of-phrases vs. compounds) – striving for communicative efficiency thus being one force shaping the scientific register.},
pubstate = {published},
type = {inbook}
}

Copy BibTeX to Clipboard

Project:   B1

Bizzoni, Yuri; Degaetano-Ortlieb, Stefania; Menzel, Katrin; Teich, Elke

The diffusion of scientific terms - tracing individuals' influence in the history of science for English Inproceedings

Proceedings of the 5th Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature, Association for Computational Linguistics, pp. 120-127, Punta Cana, Dominican Republic (online), 2021.

Tracing the influence of individuals or groups in social networks is an increasingly popular task in sociolinguistic studies. While methods to determine someone’s influence in short-term contexts (e.g., social media, on-line political debates) are widespread, influence in long-term contexts is less investigated and may be harder to capture. We study the diffusion of scientific terms in an English diachronic scientific corpus, applying Hawkes Processes to capture the role of individual scientists as “influencers” or “influencees” in the diffusion of new concepts. Our findings on two major scientific discoveries in chemistry and astronomy of the 18th century reveal that modelling both the introduction and diffusion of scientific terms in a historical corpus as Hawkes Processes allows detecting patterns of influence between authors on a long-term scale.

@inproceedings{bizzoni-etal-2021-diffusion,
title = {The diffusion of scientific terms - tracing individuals' influence in the history of science for English},
author = {Yuri Bizzoni and Stefania Degaetano-Ortlieb and Katrin Menzel and Elke Teich},
url = {https://aclanthology.org/2021.latechclfl-1.14},
doi = {10.18653/v1/2021.latechclfl-1.14},
year = {2021},
date = {2021-11-30},
booktitle = {Proceedings of the 5th Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature},
pages = {120--127},
publisher = {Association for Computational Linguistics},
address = {Punta Cana, Dominican Republic (online)},
abstract = {Tracing the influence of individuals or groups in social networks is an increasingly popular task in sociolinguistic studies. While methods to determine someone's influence in short-term contexts (e.g., social media, on-line political debates) are widespread, influence in long-term contexts is less investigated and may be harder to capture. We study the diffusion of scientific terms in an English diachronic scientific corpus, applying Hawkes Processes to capture the role of individual scientists as "influencers" or "influencees" in the diffusion of new concepts. Our findings on two major scientific discoveries in chemistry and astronomy of the 18th century reveal that modelling both the introduction and diffusion of scientific terms in a historical corpus as Hawkes Processes allows detecting patterns of influence between authors on a long-term scale.},
pubstate = {published},
type = {inproceedings}
}

Copy BibTeX to Clipboard

Project:   B1

Menzel, Katrin; Krielke, Marie-Pauline; Degaetano-Ortlieb, Stefania

Structural complexity in scientific journal articles across time - from negative clausal expressions towards adjectival negative prefixes Inproceedings

Workshop on Complexity and Register (CAR21), Berlin, Germany, CRC1412 Register, 2021.

@inproceedings{Menzel-etal2021,
  author    = {Menzel, Katrin and Krielke, Marie-Pauline and Degaetano-Ortlieb, Stefania},
  title     = {Structural complexity in scientific journal articles across time - from negative clausal expressions towards adjectival negative prefixes},
  booktitle = {Workshop on Complexity and Register (CAR21)},
  address   = {Berlin, Germany, CRC1412 Register},
  year      = {2021},
  date      = {2021-11-19},
  pubstate  = {published},
  type      = {inproceedings}
}

Copy BibTeX to Clipboard

Project:   B1

Menzel, Katrin

Scientific Eponyms throughout the History of English Scholarly Journal Articles Book Chapter

Van de Velde, Hans; Dolezal, Fredric T. (Ed.): Broadening Perspectives in the History of Dictionaries and Word Studies, Cambridge Scholars Publishing, pp. 159-193, Newcastle upon Tyne, 2021, ISBN 1-5275-7432-6.

@inbook{Menzel2021_eponyms,
title = {Scientific Eponyms throughout the History of English Scholarly Journal Articles},
author = {Katrin Menzel},
editor = {Van de Velde, Hans and Dolezal, Fredric T.},
url = {https://www.cambridgescholars.com/product/978-1-5275-7432-8},
year = {2021},
date = {2021-11-08},
booktitle = {Broadening Perspectives in the History of Dictionaries and Word Studies},
isbn = {1-5275-7432-6},
pages = {159--193},
publisher = {Cambridge Scholars Publishing},
address = {Newcastle upon Tyne},
pubstate = {published},
type = {inbook}
}

Copy BibTeX to Clipboard

Project:   B1

Degaetano-Ortlieb, Stefania; Säily, Tanja; Bizzoni, Yuri

Registerial Adaptation vs. Innovation Across Situational Contexts: 18th Century Women in Transition Journal Article

Frontiers in Artificial Intelligence, section Language and Computation, 4, 2021.

Endeavors to computationally model language variation and change are ever increasing. While analyses of recent diachronic trends are frequently conducted, long-term trends accounting for sociolinguistic variation are less well-studied. Our work sheds light on the temporal dynamics of language use of British 18th century women as a group in transition across two situational contexts. Our findings reveal that in formal contexts women adapt to register conventions, while in informal contexts they act as innovators of change in language use influencing others. While adopted from other disciplines, our methods inform (historical) sociolinguistic work in novel ways. These methods include diachronic periodization by Kullback-Leibler divergence to determine periods of change and relevant features of variation, and event cascades as influencer models.

@article{Degaetano-Ortlieb2021a,
title = {Registerial Adaptation vs. Innovation Across Situational Contexts: 18th Century Women in Transition},
author = {Stefania Degaetano-Ortlieb and Tanja S{\"a}ily and Yuri Bizzoni},
url = {https://www.frontiersin.org/article/10.3389/frai.2021.609970},
doi = {10.3389/frai.2021.609970},
year = {2021},
date = {2021},
journal = {Frontiers in Artificial Intelligence, section Language and Computation},
volume = {4},
abstract = {Endeavors to computationally model language variation and change are ever increasing. While analyses of recent diachronic trends are frequently conducted, long-term trends accounting for sociolinguistic variation are less well-studied. Our work sheds light on the temporal dynamics of language use of British 18th century women as a group in transition across two situational contexts. Our findings reveal that in formal contexts women adapt to register conventions, while in informal contexts they act as innovators of change in language use influencing others. While adopted from other disciplines, our methods inform (historical) sociolinguistic work in novel ways. These methods include diachronic periodization by Kullback-Leibler divergence to determine periods of change and relevant features of variation, and event cascades as influencer models.},
pubstate = {published},
type = {article}
}

Copy BibTeX to Clipboard

Project:   B1

Krielke, Marie-Pauline

Relativizers as markers of grammatical complexity: A diachronic, cross-register study of English and German Journal Article

Bergen Language and Linguistics Studies, 11, pp. 91-120, 2021.

In this paper, we investigate grammatical complexity as a register feature of scientific English and German. Specifically, we carry out a diachronic comparison between general and scientific discourse in the two languages from the 17th to the 19th century, using relativizers as proxies for grammatical complexity. We ground our study in register theory (Halliday and Hasan, 1985), assuming that language use reflects contextual factors, which contribute to the formation of registers (Quirk et al., 1985; Biber et al., 1999; Teich et al., 2016). Our findings show a clear tendency towards grammatical simplification in scientific discourse in both languages with English spearheading the trend early on and German following later.

@article{Krielke2021relativizers,
title = {Relativizers as markers of grammatical complexity: A diachronic, cross-register study of English and German},
author = {Marie-Pauline Krielke},
url = {https://doi.org/10.15845/bells.v11i1.3440},
doi = {10.15845/bells.v11i1.3440},
year = {2021},
date = {2021-09-15},
journal = {Bergen Language and Linguistics Studies},
pages = {91--120},
volume = {11},
number = {1},
abstract = {In this paper, we investigate grammatical complexity as a register feature of scientific English and German. Specifically, we carry out a diachronic comparison between general and scientific discourse in the two languages from the 17th to the 19th century, using relativizers as proxies for grammatical complexity. We ground our study in register theory (Halliday and Hasan, 1985), assuming that language use reflects contextual factors, which contribute to the formation of registers (Quirk et al., 1985; Biber et al., 1999; Teich et al., 2016). Our findings show a clear tendency towards grammatical simplification in scientific discourse in both languages with English spearheading the trend early on and German following later.},
pubstate = {published},
type = {article}
}

Copy BibTeX to Clipboard

Project:   B1

Menzel, Katrin; Knappen, Jörg; Teich, Elke

Generating linguistically relevant metadata for the Royal Society Corpus Journal Article

Säily, Tanja; Tyrkkö, Jukka (Ed.): Research in Corpus Linguistics, Challenges in combining structured and unstructured data in corpus development (special issue), 9, pp. 1-18, 2021, ISSN 2243-4712.

This paper provides an overview of metadata generation and management for the Royal Society Corpus (RSC), aiming to encourage discussion about the specific challenges in building substantial diachronic corpora intended to be used for linguistic and humanistic analysis. We discuss the motivations and goals of building the corpus, describe its composition and present the types of metadata it contains. Specifically, we tackle two challenges: first, integration of original metadata from the data providers (JSTOR and the Royal Society); second, derivation of additional linguistically relevant metadata regarding text structure and situational context (register).

@article{Menzel2021,
title = {Generating linguistically relevant metadata for the Royal Society Corpus},
author = {Katrin Menzel and J{\"o}rg Knappen and Elke Teich},
editor = {Tanja S{\"a}ily and Jukka Tyrkk{\"o}},
url = {https://ricl.aelinco.es/index.php/ricl/article/view/158},
doi = {10.32714/ricl.09.01.02},
year = {2021},
date = {2021},
journal = {Research in Corpus Linguistics, Challenges in combining structured and unstructured data in corpus development (special issue)},
pages = {1--18},
volume = {9},
number = {1},
issn = {2243-4712},
abstract = {This paper provides an overview of metadata generation and management for the Royal Society Corpus (RSC), aiming to encourage discussion about the specific challenges in building substantial diachronic corpora intended to be used for linguistic and humanistic analysis. We discuss the motivations and goals of building the corpus, describe its composition and present the types of metadata it contains. Specifically, we tackle two challenges: first, integration of original metadata from the data providers (JSTOR and the Royal Society); second, derivation of additional linguistically relevant metadata regarding text structure and situational context (register).},
pubstate = {published},
type = {article}
}

Copy BibTeX to Clipboard

Project:   B1

Teich, Elke; Fankhauser, Peter; Degaetano-Ortlieb, Stefania; Bizzoni, Yuri

Less is More/More Diverse: On The Communicative Utility of Linguistic Conventionalization Journal Article

Benítez-Burraco, Antonio (Ed.): Frontiers in Communication, section Language Sciences, 2021.

We present empirical evidence of the communicative utility of CONVENTIONALIZATION, i.e., convergence in linguistic usage over time, and DIVERSIFICATION, i.e., linguistic items acquiring different, more specific usages/meanings. From a diachronic perspective, conventionalization plays a crucial role in language change as a condition for innovation and grammaticalization (Bybee, 2010; Schmid, 2015) and diversification is a cornerstone in the formation of sublanguages/registers, i.e., functional linguistic varieties (Halliday, 1988; Harris, 1991). While it is widely acknowledged that change in language use is primarily socio-culturally determined pushing towards greater linguistic expressivity, we here highlight the limiting function of communicative factors on diachronic linguistic variation showing that conventionalization and diversification are associated with a reduction of linguistic variability. To be able to observe effects of linguistic variability reduction, we first need a well-defined notion of choice in context. Linguistically, this implies the paradigmatic axis of linguistic organization, i.e., the sets of linguistic options available in a given or similar syntagmatic contexts. Here, we draw on word embeddings, weakly neural distributional language models that have recently been employed to model lexical-semantic change and allow us to approximate the notion of paradigm by neighbourhood in vector space. Second, we need to capture changes in paradigmatic variability, i.e., reduction/expansion of linguistic options in a given context. As a formal index of paradigmatic variability we use entropy, which measures the contribution of linguistic units (e.g., words) in predicting linguistic choice in bits of information. 
Using entropy provides us with a link to a communicative interpretation, as it is a well-established measure of communicative efficiency with implications for cognitive processing (Linzen and Jaeger, 2016; Venhuizen et al., 2019); also, entropy is negatively correlated with distance in (word embedding) spaces which in turn shows cognitive reflexes in certain language processing tasks (Mitchel et al., 2008; Auguste et al., 2017). In terms of domain we focus on science, looking at the diachronic development of scientific English from the 17th century to modern time. This provides us with a fairly constrained yet dynamic domain of discourse that has witnessed a powerful systematization throughout the centuries and developed specific linguistic conventions geared towards efficient communication. Overall, our study confirms the assumed trends of conventionalization and diversification shown by diachronically decreasing entropy, interspersed with local, temporary entropy highs pointing to phases of linguistic expansion pertaining primarily to introduction of new technical terminology.

@article{Teich2021,
  author   = {Teich, Elke and Fankhauser, Peter and Degaetano-Ortlieb, Stefania and Bizzoni, Yuri},
  title    = {Less is More/More Diverse: On The Communicative Utility of Linguistic Conventionalization},
  editor   = {Ben{\'i}tez-Burraco, Antonio},
  journal  = {Frontiers in Communication, section Language Sciences},
  year     = {2021},
  date     = {2021-01-26},
  doi      = {10.3389/fcomm.2020.620275},
  url      = {https://www.frontiersin.org/articles/10.3389/fcomm.2020.620275/full},
  abstract = {We present empirical evidence of the communicative utility of CONVENTIONALIZATION, i.e., convergence in linguistic usage over time, and DIVERSIFICATION, i.e., linguistic items acquiring different, more specific usages/meanings. From a diachronic perspective, conventionalization plays a crucial role in language change as a condition for innovation and grammaticalization (Bybee, 2010; Schmid, 2015) and diversification is a cornerstone in the formation of sublanguages/registers, i.e., functional linguistic varieties (Halliday, 1988; Harris, 1991). While it is widely acknowledged that change in language use is primarily socio-culturally determined pushing towards greater linguistic expressivity, we here highlight the limiting function of communicative factors on diachronic linguistic variation showing that conventionalization and diversification are associated with a reduction of linguistic variability. To be able to observe effects of linguistic variability reduction, we first need a well-defined notion of choice in context. Linguistically, this implies the paradigmatic axis of linguistic organization, i.e., the sets of linguistic options available in a given or similar syntagmatic contexts. Here, we draw on word embeddings, weakly neural distributional language models that have recently been employed to model lexical-semantic change and allow us to approximate the notion of paradigm by neighbourhood in vector space. Second, we need to capture changes in paradigmatic variability, i.e. reduction/expansion of linguistic options in a given context. As a formal index of paradigmatic variability we use entropy, which measures the contribution of linguistic units (e.g., words) in predicting linguistic choice in bits of information.
Using entropy provides us with a link to a communicative interpretation, as it is a well-established measure of communicative efficiency with implications for cognitive processing (Linzen and Jaeger, 2016; Venhuizen et al., 2019); also, entropy is negatively correlated with distance in (word embedding) spaces which in turn shows cognitive reflexes in certain language processing tasks (Mitchel et al., 2008; Auguste et al., 2017). In terms of domain we focus on science, looking at the diachronic development of scientific English from the 17th century to modern time. This provides us with a fairly constrained yet dynamic domain of discourse that has witnessed a powerful systematization throughout the centuries and developed specific linguistic conventions geared towards efficient communication. Overall, our study confirms the assumed trends of conventionalization and diversification shown by diachronically decreasing entropy, interspersed with local, temporary entropy highs pointing to phases of linguistic expansion pertaining primarily to introduction of new technical terminology.},
  pubstate = {published},
  type     = {article}
}

Copy BibTeX to Clipboard

Project:   B1

Mosbach, Marius; Degaetano-Ortlieb, Stefania; Krielke, Marie-Pauline; Abdullah, Badr M.; Klakow, Dietrich

A Closer Look at Linguistic Knowledge in Masked Language Models: The Case of Relative Clauses in American English Inproceedings

Proceedings of the 28th International Conference on Computational Linguistics, pp. 771-787, 2020.

Transformer-based language models achieve high performance on various tasks, but we still lack understanding of the kind of linguistic knowledge they learn and rely on. We evaluate three models (BERT, RoBERTa, and ALBERT), testing their grammatical and semantic knowledge by sentence-level probing, diagnostic cases, and masked prediction tasks. We focus on relative clauses (in American English) as a complex phenomenon needing contextual information and antecedent identification to be resolved. Based on a naturalistic dataset, probing shows that all three models indeed capture linguistic knowledge about grammaticality, achieving high performance. Evaluation on diagnostic cases and masked prediction tasks considering fine-grained linguistic knowledge, however, shows pronounced model-specific weaknesses especially on semantic knowledge, strongly impacting models’ performance. Our results highlight the importance of (a) model comparison in evaluation task and (b) building up claims of model performance and the linguistic knowledge they capture beyond purely probing-based evaluations.

@inproceedings{Mosbach2020,
  author    = {Mosbach, Marius and Degaetano-Ortlieb, Stefania and Krielke, Marie-Pauline and Abdullah, Badr M. and Klakow, Dietrich},
  title     = {A Closer Look at Linguistic Knowledge in Masked Language Models: The Case of Relative Clauses in {American} {English}},
  booktitle = {Proceedings of the 28th International Conference on Computational Linguistics},
  pages     = {771--787},
  year      = {2020},
  date      = {2020},
  url       = {https://aclanthology.org/2020.coling-main.67/},
  abstract  = {Transformer-based language models achieve high performance on various tasks, but we still lack understanding of the kind of linguistic knowledge they learn and rely on. We evaluate three models (BERT, RoBERTa, and ALBERT), testing their grammatical and semantic knowledge by sentence-level probing, diagnostic cases, and masked prediction tasks. We focus on relative clauses (in American English) as a complex phenomenon needing contextual information and antecedent identification to be resolved. Based on a naturalistic dataset, probing shows that all three models indeed capture linguistic knowledge about grammaticality, achieving high performance. Evaluation on diagnostic cases and masked prediction tasks considering fine-grained linguistic knowledge, however, shows pronounced model-specific weaknesses especially on semantic knowledge, strongly impacting models’ performance. Our results highlight the importance of (a) model comparison in evaluation task and (b) building up claims of model performance and the linguistic knowledge they capture beyond purely probing-based evaluations.},
  pubstate  = {published},
  type      = {inproceedings}
}

Copy BibTeX to Clipboard

Projects:   B1 B4 C4

Juzek, Tom; Krielke, Marie-Pauline; Teich, Elke

Exploring diachronic syntactic shifts with dependency length: the case of scientific English Inproceedings

Proceedings of the Fourth Workshop on Universal Dependencies (UDW 2020), Association for Computational Linguistics, pp. 109-119, Barcelona, Spain (Online), 2020.

We report on an application of universal dependencies for the study of diachronic shifts in syntactic usage patterns. Our focus is on the evolution of Scientific English in the Late Modern English period (ca. 1700-1900). Our data set is the Royal Society Corpus (RSC), comprising the full set of publications of the Royal Society of London between 1665 and 1996. Our starting assumption is that over time, Scientific English develops specific syntactic choice preferences that increase efficiency in (expert-to-expert) communication. The specific hypothesis we pursue in this paper is that changing syntactic choice preferences lead to greater dependency locality/dependency length minimization, which is associated with positive effects for the efficiency of human as well as computational linguistic processing. As a basis for our measurements, we parsed the RSC using Stanford CoreNLP. Overall, we observe a decrease in dependency length, with long dependency structures becoming less frequent and short dependency structures becoming more frequent over time, notably pertaining to the nominal phrase, thus marking an overall push towards greater communicative efficiency.

@inproceedings{juzek-etal-2020-exploring,
  author    = {Juzek, Tom and Krielke, Marie-Pauline and Teich, Elke},
  title     = {Exploring diachronic syntactic shifts with dependency length: the case of scientific {English}},
  booktitle = {Proceedings of the Fourth Workshop on Universal Dependencies (UDW 2020)},
  pages     = {109--119},
  publisher = {Association for Computational Linguistics},
  address   = {Barcelona, Spain (Online)},
  year      = {2020},
  date      = {2020},
  url       = {https://aclanthology.org/2020.udw-1.13/},
  abstract  = {We report on an application of universal dependencies for the study of diachronic shifts in syntactic usage patterns. Our focus is on the evolution of Scientific English in the Late Modern English period (ca. 1700-1900). Our data set is the Royal Society Corpus (RSC), comprising the full set of publications of the Royal Society of London between 1665 and 1996. Our starting assumption is that over time, Scientific English develops specific syntactic choice preferences that increase efficiency in (expert-to-expert) communication. The specific hypothesis we pursue in this paper is that changing syntactic choice preferences lead to greater dependency locality/dependency length minimization, which is associated with positive effects for the efficiency of human as well as computational linguistic processing. As a basis for our measurements, we parsed the RSC using Stanford CoreNLP. Overall, we observe a decrease in dependency length, with long dependency structures becoming less frequent and short dependency structures becoming more frequent over time, notably pertaining to the nominal phrase, thus marking an overall push towards greater communicative efficiency.},
  pubstate  = {published},
  type      = {inproceedings}
}

Copy BibTeX to Clipboard

Project:   B1

Teich, Elke

Language variation and change: A communicative perspective Miscellaneous

Jahrestagung der Deutschen Gesellschaft für Sprachwissenschaft, DGfS 2020, Hamburg, 2020.

It is widely acknowledged that language use and language structure are closely interlinked, linguistic structure emerging from language use (Bybee & Hopper 2001). Language use, in turn, is characterized by variation; in fact, speakers’ ability to adapt to changing contexts is a prerequisite for language to be functional (Weinreich et al. 1968).

Taking the perspective of rational communication, in my talk I will revisit some core questions of diachronic linguistic change: Why does a change happen? Which features are involved in change? How does change proceed? What are the effects of change? Recent work on online human language use reveals that speakers try to optimize their linguistic productions by encoding their messages with uniform information density (see Crocker et al. 2016 for an overview). Here, a major determinant in linguistic choice is predictability in context. Predictability in context is commonly represented by information content measured in bits (Shannon information): The more predictable a linguistic unit (e.g. word) is in a given context, the fewer bits are needed for encoding and the shorter its linguistic encoding may be (and vice versa, the more “surprising” a unit is in a given context, the more bits are needed for encoding and the more explicit its encoding tends to be). In this view, one major function of linguistic variation is to modulate information content so as to optimize message transmission.

In my talk, I apply this perspective to diachronic linguistic change. I show that speakers’ continuous adaptation to changing contextual conditions pushes towards linguistic innovation and results in temporary, high levels of expressivity, but the concern for maintaining communicative function pulls towards convergence and results in conventionalization. The diachronic scenario I discuss is mid-term change (200–250 years) in English in the late Modern period, focusing on the discourse domain of science (Degaetano-Ortlieb & Teich 2019). In terms of methods, I use computational language models to estimate predictability in context; and to assess diachronic change, I apply selected measures of information content, including entropy and surprisal.

@misc{Teich2020a,
  author    = {Teich, Elke},
  title     = {Language variation and change: A communicative perspective},
  booktitle = {Jahrestagung der Deutschen Gesellschaft f{\"u}r Sprachwissenschaft, DGfS 2020},
  address   = {Hamburg},
  year      = {2020},
  date      = {2020-11-04},
  url       = {https://www.zfs.uni-hamburg.de/en/dgfs2020/programm/keynotes/elke-teich.html},
  abstract  = {It is widely acknowledged that language use and language structure are closely interlinked, linguistic structure emerging from language use (Bybee \& Hopper 2001). Language use, in turn, is characterized by variation; in fact, speakers’ ability to adapt to changing contexts is a prerequisite for language to be functional (Weinreich et al. 1968). Taking the perspective of rational communication, in my talk I will revisit some core questions of diachronic linguistic change: Why does a change happen? Which features are involved in change? How does change proceed? What are the effects of change? Recent work on online human language use reveals that speakers try to optimize their linguistic productions by encoding their messages with uniform information density (see Crocker et al. 2016 for an overview). Here, a major determinant in linguistic choice is predictability in context. Predictability in context is commonly represented by information content measured in bits (Shannon information): The more predictable a linguistic unit (e.g. word) is in a given context, the fewer bits are needed for encoding and the shorter its linguistic encoding may be (and vice versa, the more “surprising” a unit is in a given context, the more bits are needed for encoding and the more explicit its encoding tends to be). In this view, one major function of linguistic variation is to modulate information content so as to optimize message transmission. In my talk, I apply this perspective to diachronic linguistic change. I show that speakers’ continuous adaptation to changing contextual conditions pushes towards linguistic innovation and results in temporary, high levels of expressivity, but the concern for maintaining communicative function pulls towards convergence and results in conventionalization.
The diachronic scenario I discuss is mid-term change (200–250 years) in English in the late Modern period, focusing on the discourse domain of science (Degaetano-Ortlieb \& Teich 2019). In terms of methods, I use computational language models to estimate predictability in context; and to assess diachronic change, I apply selected measures of information content, including entropy and surprisal.},
  note      = {Keynote},
  pubstate  = {published},
  type      = {miscellaneous}
}

Copy BibTeX to Clipboard

Project:   B1

Fischer, Stefan; Knappen, Jörg; Menzel, Katrin; Teich, Elke

The Royal Society Corpus 6.0: Providing 300+ Years of Scientific Writing for Humanistic Study Inproceedings

Proceedings of the 12th Language Resources and Evaluation Conference, European Language Resources Association, pp. 794-802, Marseille, France, 2020.

We present a new, extended version of the Royal Society Corpus (RSC), a diachronic corpus of scientific English now covering 300+ years of scientific writing (1665–1996). The corpus comprises 47 837 texts, primarily scientific articles, and is based on publications of the Royal Society of London, mainly its Philosophical Transactions and Proceedings.

The corpus has been built on the basis of the FAIR principles and is freely available under a Creative Commons license, excluding copy-righted parts. We provide information on how the corpus can be found, the file formats available for download as well as accessibility via a web-based corpus query platform. We show a number of analytic tools that we have implemented for better usability and provide an example of use of the corpus for linguistic analysis as well as examples of subsequent, external uses of earlier releases.

We place the RSC against the background of existing English diachronic/scientific corpora, elaborating on its value for linguistic and humanistic study.

@inproceedings{fischer-EtAl:2020:LREC,
  author    = {Fischer, Stefan and Knappen, J{\"o}rg and Menzel, Katrin and Teich, Elke},
  title     = {The {Royal Society Corpus} 6.0: Providing 300+ Years of Scientific Writing for Humanistic Study},
  booktitle = {Proceedings of the 12th Language Resources and Evaluation Conference},
  pages     = {794--802},
  publisher = {European Language Resources Association},
  address   = {Marseille, France},
  year      = {2020},
  date      = {2020},
  url       = {https://aclanthology.org/2020.lrec-1.99/},
  abstract  = {We present a new, extended version of the Royal Society Corpus (RSC), a diachronic corpus of scientific English now covering 300+ years of scientific writing (1665–1996). The corpus comprises 47 837 texts, primarily scientific articles, and is based on publications of the Royal Society of London, mainly its Philosophical Transactions and Proceedings. The corpus has been built on the basis of the FAIR principles and is freely available under a Creative Commons license, excluding copy-righted parts. We provide information on how the corpus can be found, the file formats available for download as well as accessibility via a web-based corpus query platform. We show a number of analytic tools that we have implemented for better usability and provide an example of use of the corpus for linguistic analysis as well as examples of subsequent, external uses of earlier releases. We place the RSC against the background of existing English diachronic/scientific corpora, elaborating on its value for linguistic and humanistic study.},
  pubstate  = {published},
  type      = {inproceedings}
}

Copy BibTeX to Clipboard

Project:   B1

Bizzoni, Yuri; Degaetano-Ortlieb, Stefania; Fankhauser, Peter; Teich, Elke

Linguistic Variation and Change in 250 years of English Scientific Writing: A Data-driven Approach Journal Article

Jurgens, David (Ed.): Frontiers in Artificial Intelligence, section Language and Computation, 2020.

We trace the evolution of Scientific English through the Late Modern period to modern time on the basis of a comprehensive corpus composed of the Transactions and Proceedings of the Royal Society of London, the first and longest-running English scientific journal established in 1665.

Specifically, we explore the linguistic imprints of specialization and diversification in the science domain which accumulate in the formation of “scientific language” and field-specific sublanguages/registers (chemistry, biology etc.). We pursue an exploratory, data-driven approach using state-of-the-art computational language models and combine them with selected information-theoretic measures (entropy, relative entropy) for comparing models along relevant dimensions of variation (time, register).

Focusing on selected linguistic variables (lexis, grammar), we show how we deploy computational language models for capturing linguistic variation and change and discuss benefits and limitations.

@article{Bizzoni2020b,
  author   = {Bizzoni, Yuri and Degaetano-Ortlieb, Stefania and Fankhauser, Peter and Teich, Elke},
  title    = {Linguistic Variation and Change in 250 years of {English} Scientific Writing: A Data-driven Approach},
  editor   = {Jurgens, David},
  journal  = {Frontiers in Artificial Intelligence, section Language and Computation},
  year     = {2020},
  date     = {2020-10-18},
  doi      = {10.3389/frai.2020.00073},
  url      = {https://www.frontiersin.org/articles/10.3389/frai.2020.00073/full},
  abstract = {We trace the evolution of Scientific English through the Late Modern period to modern time on the basis of a comprehensive corpus composed of the Transactions and Proceedings of the Royal Society of London, the first and longest-running English scientific journal established in 1665. Specifically, we explore the linguistic imprints of specialization and diversification in the science domain which accumulate in the formation of “scientific language” and field-specific sublanguages/registers (chemistry, biology etc.). We pursue an exploratory, data-driven approach using state-of-the-art computational language models and combine them with selected information-theoretic measures (entropy, relative entropy) for comparing models along relevant dimensions of variation (time, register). Focusing on selected linguistic variables (lexis, grammar), we show how we deploy computational language models for capturing linguistic variation and change and discuss benefits and limitations.},
  pubstate = {published},
  type     = {article}
}

Copy BibTeX to Clipboard

Project:   B1

Juzek, Tom; Fischer, Stefan; Krielke, Marie-Pauline; Degaetano-Ortlieb, Stefania; Teich, Elke

Challenges of parsing a historical corpus of Scientific English Miscellaneous

Historical Corpora and Variation (Book of Abstracts), Cagliari, Italy, 2019.

In this contribution, we outline our experiences with syntactically parsing a diachronic historical corpus. We report on how errors like OCR inaccuracies, end-of-sentence inaccuracies, etc. propagate bottom-up and how we approach such errors by building on existing machine learning approaches for error correction. The Royal Society Corpus (RSC; Kermes et al. 2016) is a collection of scientific text from 1665 to 1869 and contains ca. 10 000 documents and 30 million tokens. Using the RSC, we wish to describe and model how syntactic complexity changes as Scientific English of the late modern period develops. Our focus is on how common measures of syntactic complexity, e.g. length in tokens, embedding depth, and number of dependants, relate to estimates of information content. Our hypothesis is that Scientific English develops towards the use of shorter sentences with fewer clausal embeddings and increasingly complex noun phrases over time, in order to accommodate an expansion on the lexical level.

@misc{Juzek2019a,
  author    = {Juzek, Tom and Fischer, Stefan and Krielke, Marie-Pauline and Degaetano-Ortlieb, Stefania and Teich, Elke},
  title     = {Challenges of parsing a historical corpus of {Scientific English}},
  booktitle = {Historical Corpora and Variation (Book of Abstracts)},
  address   = {Cagliari, Italy},
  year      = {2019},
  date      = {2019},
  url       = {https://convegni.unica.it/hicov/files/2019/01/Juzek-et-al.pdf},
  abstract  = {In this contribution, we outline our experiences with syntactically parsing a diachronic historical corpus. We report on how errors like OCR inaccuracies, end-of-sentence inaccuracies, etc. propagate bottom-up and how we approach such errors by building on existing machine learning approaches for error correction. The Royal Society Corpus (RSC; Kermes et al. 2016) is a collection of scientific text from 1665 to 1869 and contains ca. 10 000 documents and 30 million tokens. Using the RSC, we wish to describe and model how syntactic complexity changes as Scientific English of the late modern period develops. Our focus is on how common measures of syntactic complexity, e.g. length in tokens, embedding depth, and number of dependants, relate to estimates of information content. Our hypothesis is that Scientific English develops towards the use of shorter sentences with fewer clausal embeddings and increasingly complex noun phrases over time, in order to accommodate an expansion on the lexical level.},
  pubstate  = {published},
  type      = {miscellaneous}
}

Copy BibTeX to Clipboard

Project:   B1

Juzek, Tom; Fischer, Stefan; Krielke, Marie-Pauline; Degaetano-Ortlieb, Stefania; Teich, Elke

Annotation quality assessment and error correction in diachronic corpora: Combining pattern-based and machine learning approaches Miscellaneous

52nd Annual Meeting of the Societas Linguistica Europaea (Book of Abstracts), 2019.

@misc{Juzek2019,
  author    = {Juzek, Tom and Fischer, Stefan and Krielke, Marie-Pauline and Degaetano-Ortlieb, Stefania and Teich, Elke},
  title     = {Annotation quality assessment and error correction in diachronic corpora: Combining pattern-based and machine learning approaches},
  booktitle = {52nd Annual Meeting of the Societas Linguistica Europaea (Book of Abstracts)},
  year      = {2019},
  date      = {2019},
  pubstate  = {published},
  type      = {miscellaneous}
}

Copy BibTeX to Clipboard

Project:   B1

Successfully