Publications

Zaitova, Iuliia; Stenger, Irina; Avgustinova, Tania

Microsyntactic Unit Detection Using Word Embedding Models: Experiments on Slavic Languages Inproceedings

Proceedings of the International Conference on Recent Advances in Natural Language Processing (RANLP 2023), pp. 1251-1259, 2023.

@inproceedings{Zaitova/etal:2023a,
title = {Microsyntactic Unit Detection Using Word Embedding Models: Experiments on {Slavic} Languages},
author = {Iuliia Zaitova and Irina Stenger and Tania Avgustinova},
year = {2023},
date = {2023},
booktitle = {Proceedings of the International Conference on Recent Advances in Natural Language Processing (RANLP 2023)},
pages = {1251--1259},
pubstate = {published},
type = {inproceedings}
}

Copy BibTeX to Clipboard

Project:   C4

Kudera, Jacek; Stenger, Irina; Georgis, Philip; Möbius, Bernd; Avgustinova, Tania; Klakow, Dietrich

Cross-linguistic intelligibility of idiomatic phrases in Polish-Russian translation tasks Incollection

Phraseology, constructions and translation: Corpus-based, computational and cultural aspects, Presses universitaires de Louvain, pp. 237-249, 2023.

This paper presents the results of a translation task involving idiomatic phrases in closely related languages. The goal is to test auditory comprehension of idioms. The experiment was conducted with native speakers of either Polish or Russian, who were not professional translators. The translation equivalents were categorized according to three conditions: (1) semantic equivalent, found in a phraseological dictionary; (2) lemma-based referent, sharing a cognate component; and (3) literal translation of the source phrase. It is hypothesized that information-theoretic measures of surprisal in combination with lexical and syntactic distances between idioms can predict lay translators’ preferences. The results suggest that the proposed measures are valid predictors for the type of translation native speakers will select. The outcomes reveal an asymmetry in preference for equivalent selection across the groups of lay translators.

@incollection{Kudera/etal:2023a,
title = {Cross-linguistic intelligibility of idiomatic phrases in {Polish}-{Russian} translation tasks},
author = {Jacek Kudera and Irina Stenger and Philip Georgis and Bernd M{\"o}bius and Tania Avgustinova and Dietrich Klakow},
url = {https://pul.uclouvain.be/book/?GCOI=29303100163350&utm_source=rss&utm_medium=rss&utm_campaign=newreleases#h2tabFormats},
year = {2023},
date = {2023},
booktitle = {Phraseology, constructions and translation: Corpus-based, computational and cultural aspects},
pages = {237--249},
publisher = {Presses universitaires de Louvain},
abstract = {This paper presents the results of a translation task involving idiomatic phrases in closely related languages. The goal is to test auditory comprehension of idioms. The experiment was conducted with native speakers of either Polish or Russian, who were not professional translators. The translation equivalents were categorized according to three conditions: (1) semantic equivalent, found in a phraseological dictionary; (2) lemma-based referent, sharing a cognate component; and (3) literal translation of the source phrase. It is hypothesized that information-theoretic measures of surprisal in combination with lexical and syntactic distances between idioms can predict lay translators’ preferences. The results suggest that the proposed measures are valid predictors for the type of translation native speakers will select. The outcomes reveal an asymmetry in preference for equivalent selection across the groups of lay translators.},
pubstate = {published},
type = {incollection}
}

Copy BibTeX to Clipboard

Project:   C4

Abdullah, Badr M.; Shaik, Mohammed Maqsood; Möbius, Bernd; Klakow, Dietrich

An information-theoretic analysis of self-supervised discrete representations of speech Inproceedings

Proceedings of Interspeech 2023, pp. 2883-2887, Dublin, Ireland, 2023.

Self-supervised representation learning for speech often involves a quantization step that transforms the acoustic input into discrete units. However, it remains unclear how to characterize the relationship between these discrete units and abstract phonetic categories such as phonemes. In this paper, we develop an information-theoretic framework whereby we represent each phonetic category as a distribution over discrete units. We then apply our framework to two different self-supervised models (namely wav2vec 2.0 and XLSR) and use American English speech as a case study. Our study demonstrates that the entropy of phonetic distributions reflects the variability of the underlying speech sounds, with phonetically similar sounds exhibiting similar distributions. While our study confirms the lack of direct, one-to-one correspondence, we find an intriguing, indirect relationship between phonetic categories and discrete units.

@inproceedings{Abdullah/etal:2023a,
title = {An information-theoretic analysis of self-supervised discrete representations of speech},
author = {Badr M. Abdullah and Mohammed Maqsood Shaik and Bernd M{\"o}bius and Dietrich Klakow},
doi = {10.21437/Interspeech.2023-2131},
year = {2023},
date = {2023},
booktitle = {Proceedings of Interspeech 2023},
pages = {2883--2887},
address = {Dublin, Ireland},
abstract = {Self-supervised representation learning for speech often involves a quantization step that transforms the acoustic input into discrete units. However, it remains unclear how to characterize the relationship between these discrete units and abstract phonetic categories such as phonemes. In this paper, we develop an information-theoretic framework whereby we represent each phonetic category as a distribution over discrete units. We then apply our framework to two different self-supervised models (namely wav2vec 2.0 and XLSR) and use American English speech as a case study. Our study demonstrates that the entropy of phonetic distributions reflects the variability of the underlying speech sounds, with phonetically similar sounds exhibiting similar distributions. While our study confirms the lack of direct, one-to-one correspondence, we find an intriguing, indirect relationship between phonetic categories and discrete units.},
pubstate = {published},
type = {inproceedings}
}

Copy BibTeX to Clipboard

Project:   C4

Abdullah, Badr M.; Shaik, Mohammed Maqsood; Klakow, Dietrich

On the Nature of Discrete Speech Representations in Multilingual Self-supervised Models Inproceedings

Proceedings of the 5th Workshop on Research in Computational Linguistic Typology and Multilingual NLP, Association for Computational Linguistics, pp. 159-161, Dubrovnik, Croatia, 2023.

Self-supervision has emerged as an effective paradigm for learning representations of spoken language from raw audio without explicit labels or transcriptions. Self-supervised speech models, such as wav2vec 2.0 (Baevski et al., 2020) and HuBERT (Hsu et al., 2021), have shown significant promise in improving the performance across different speech processing tasks. One of the main advantages of self-supervised speech models is that they can be pre-trained on a large sample of languages (Conneau et al., 2020; Babu et al.,2022), which facilitates cross-lingual transfer for low-resource languages (San et al., 2021). State-of-the-art self-supervised speech models include a quantization module that transforms the continuous acoustic input into a sequence of discrete units. One of the key questions in this area is whether the discrete representations learned via self-supervision are language-specific or language-universal. In other words, we ask: do the discrete units learned by a multilingual speech model represent the same speech sounds across languages or do they differ based on the specific language being spoken? From the practical perspective, this question has important implications for the development of speech models that can generalize across languages, particularly for low-resource languages. Furthermore, examining the level of linguistic abstraction in speech models that lack symbolic supervision is also relevant to the field of human language acquisition (Dupoux, 2018).

@inproceedings{abdullah-etal-2023-nature,
title = {On the Nature of Discrete Speech Representations in Multilingual Self-supervised Models},
author = {Badr M. Abdullah and Mohammed Maqsood Shaik and Dietrich Klakow},
url = {https://aclanthology.org/2023.sigtyp-1.20},
year = {2023},
date = {2023},
booktitle = {Proceedings of the 5th Workshop on Research in Computational Linguistic Typology and Multilingual NLP},
pages = {159--161},
publisher = {Association for Computational Linguistics},
address = {Dubrovnik, Croatia},
abstract = {Self-supervision has emerged as an effective paradigm for learning representations of spoken language from raw audio without explicit labels or transcriptions. Self-supervised speech models, such as wav2vec 2.0 (Baevski et al., 2020) and HuBERT (Hsu et al., 2021), have shown significant promise in improving the performance across different speech processing tasks. One of the main advantages of self-supervised speech models is that they can be pre-trained on a large sample of languages (Conneau et al., 2020; Babu et al.,2022), which facilitates cross-lingual transfer for low-resource languages (San et al., 2021). State-of-the-art self-supervised speech models include a quantization module that transforms the continuous acoustic input into a sequence of discrete units. One of the key questions in this area is whether the discrete representations learned via self-supervision are language-specific or language-universal. In other words, we ask: do the discrete units learned by a multilingual speech model represent the same speech sounds across languages or do they differ based on the specific language being spoken? From the practical perspective, this question has important implications for the development of speech models that can generalize across languages, particularly for low-resource languages. Furthermore, examining the level of linguistic abstraction in speech models that lack symbolic supervision is also relevant to the field of human language acquisition (Dupoux, 2018).},
pubstate = {published},
type = {inproceedings}
}

Copy BibTeX to Clipboard

Project:   C4

Steuer, Julius; Abdullah, Badr M.; List, Johann-Mattis; Klakow, Dietrich

Information-Theoretic Characterization of Vowel Harmony: A Cross-Linguistic Study on Word Lists Inproceedings

Proceedings of the 5th Workshop on Research in Computational Linguistic Typology and Multilingual NLP, Association for Computational Linguistics, pp. 96-109, Dubrovnik, Croatia, 2023.

We present a cross-linguistic study of vowel harmony that aims to quantify this phenomenon using data-driven computational modeling. Concretely, we define an information-theoretic measure of harmonicity based on the predictability of vowels in a natural language lexicon, which we estimate using phoneme-level language models (PLMs). Prior quantitative studies have heavily relied on inflected word-forms in the analysis on vowel harmony. On the contrary, we train our models using cross-linguistically comparable lemma forms with little or no inflection, which enables us to cover more under-studied languages. Training data for our PLMs consists of word lists offering a maximum of 1000 entries per language. Despite the fact that the data we employ are substantially smaller than previously used corpora, our experiments demonstrate the neural PLMs capture vowel harmony patterns in a set of languages that exhibit this phenomenon. Our work also demonstrates that word lists are a valuable resource for typological research, and offers new possibilities for future studies on low-resource, under-studied languages.

@inproceedings{steuer-etal-2023-information,
title = {Information-Theoretic Characterization of Vowel Harmony: A Cross-Linguistic Study on Word Lists},
author = {Julius Steuer and Badr M. Abdullah and Johann-Mattis List and Dietrich Klakow},
url = {https://aclanthology.org/2023.sigtyp-1.10},
year = {2023},
date = {2023},
booktitle = {Proceedings of the 5th Workshop on Research in Computational Linguistic Typology and Multilingual NLP},
pages = {96--109},
publisher = {Association for Computational Linguistics},
address = {Dubrovnik, Croatia},
abstract = {We present a cross-linguistic study of vowel harmony that aims to quantifies this phenomenon using data-driven computational modeling. Concretely, we define an information-theoretic measure of harmonicity based on the predictability of vowels in a natural language lexicon, which we estimate using phoneme-level language models (PLMs). Prior quantitative studies have heavily relied on inflected word-forms in the analysis on vowel harmony. On the contrary, we train our models using cross-linguistically comparable lemma forms with little or no inflection, which enables us to cover more under-studied languages. Training data for our PLMs consists of word lists offering a maximum of 1000 entries per language. Despite the fact that the data we employ are substantially smaller than previously used corpora, our experiments demonstrate the neural PLMs capture vowel harmony patterns in a set of languages that exhibit this phenomenon. Our work also demonstrates that word lists are a valuable resource for typological research, and offers new possibilities for future studies on low-resource, under-studied languages.},
pubstate = {published},
type = {inproceedings}
}

Copy BibTeX to Clipboard

Projects:   B4 C4

Kudera, Jacek

Slavic receptive multilingualism: intercomprehension of speech PhD Thesis

Saarland University, Saarbruecken, Germany, 2022.

Intercomprehension refers to a communication practice in which speakers use closely related languages. We know that the degree of mutual intelligibility differs according to the stimulus modality. This work aims to define the linguistic features which contribute to and impede cross-lingual understanding of speech via production and perception studies involving speakers of four Slavic languages. The current study combines the methodological apparatus from acoustic phonetics and information theory to provide evidence for mutual intelligibility on various levels of language processing. It concludes that the degree of mutual understanding does not always correspond to typological divisions of tested languages. The results presented here suggest that intercomprehension is often driven by unit (un)expectedness rather than the phonetic resemblance of a perceived stimulus and its equivalence in the native lexicon of speakers.

@phdthesis{Kudera_Diss_2022,
title = {{Slavic} receptive multilingualism: intercomprehension of speech},
author = {Jacek Kudera},
url = {https://publikationen.sulb.uni-saarland.de/handle/20.500.11880/33236},
doi = {10.22028/D291-36578},
year = {2022},
date = {2022},
school = {Saarland University},
address = {Saarbruecken, Germany},
abstract = {Intercomprehension refers to a communication practice in which speakers use closely related languages. We know that the degree of mutual intelligibility differs according to the stimulus modality. This work aims to define the linguistic features which contribute to and impede cross-lingual understanding of speech via production and perception studies involving speakers of four Slavic languages. The current study combines the methodological apparatus from acoustic phonetics and information theory to provide evidence for mutual intelligibility on various levels of language processing. It concludes that the degree of mutual understanding does not always correspond to typological divisions of tested languages. The results presented here suggest that intercomprehension is often driven by unit (un)expectedness rather than the phonetic resemblance of a perceived stimulus and its equivalence in the native lexicon of speakers.},
pubstate = {published},
type = {phdthesis}
}

Copy BibTeX to Clipboard

Project:   C4

Kudera, Jacek; Stenger, Irina; Möbius, Bernd; Avgustinova, Tania; Klakow, Dietrich

Phonetic cues in auditory identification of Bulgarian, Czech, Polish, and Russian language of origin Journal Article

Language and Speech, 2022.

This work presents the results of an auditory language of origin identification experiment. Disyllabic and trisyllabic logatomes were recorded by speakers of Bulgarian, Czech, Polish, and Russian, and presented to L1 speakers of the abovementioned Slavic languages. The goals of the test were to verify the ability of lay listeners to recognize the linguistic origin of speakers, based on spoken samples with limited segmental and suprasegmental information, and to correlate the signal features with the subjects’ performance. It was found that position of word stress is not an important predictor in language recognition. However, inherent vowel characteristics such as duration and vowel space computed by the means of Pillai scores correlate with subjects’ performance. Both the linguistic profile and the familiarity with closely related languages also appear to be relevant predictors of listeners’ performance. Finally, the information-theoretic notion of surprisal applied on regular cross-linguistic sound correspondences was correlated with recognition scores; though, the correlations did not reach the threshold of statistical significance. We conclude that auditory identification of linguistic origin by lay persons, native speakers of closely related languages, is possible even when exposed to limited segmental information, which can serve as a cue in the identification of linguistic origin.

@article{kudera_etal2022_cues,
title = {Phonetic cues in auditory identification of {Bulgarian}, {Czech}, {Polish}, and {Russian} language of origin},
author = {Jacek Kudera and Irina Stenger and Bernd M{\"o}bius and Tania Avgustinova and Dietrich Klakow},
url = {https://journals.sagepub.com/eprint/JJIKHP9RPEYZM2EQKFWZ/full},
doi = {10.1177/00238309221119098},
year = {2022},
date = {2022-09-01},
journal = {Language and Speech},
abstract = {This work presents the results of an auditory language of origin identification experiment. Disyllabic and trisyllabic logatomes were recorded by speakers of Bulgarian, Czech, Polish, and Russian, and presented to L1 speakers of the abovementioned Slavic languages. The goals of the test were to verify the ability of lay listeners to recognize the linguistic origin of speakers, based on spoken samples with limited segmental and suprasegmental information, and to correlate the signal features with the subjects’ performance. It was found that position of word stress is not an important predictor in language recognition. However, inherent vowel characteristics such as duration and vowel space computed by the means of Pillai scores correlate with subjects’ performance. Both the linguistic profile and the familiarity with closely related languages also appear to be relevant predictors of listeners’ performance. Finally, the information-theoretic notion of surprisal applied on regular cross-linguistic sound correspondences was correlated with recognition scores; though, the correlations did not reach the threshold of statistical significance. We conclude that auditory identification of linguistic origin by lay persons, native speakers of closely related languages, is possible even when exposed to limited segmental information, which can serve as a cue in the identification of linguistic origin.},
pubstate = {published},
type = {article}
}

Copy BibTeX to Clipboard

Project:   C4

Abdullah, Badr M.; Klakow, Dietrich

Analyzing the Representational Geometry of Acoustic Word Embeddings Inproceedings

Proceedings of the Fifth BlackboxNLP Workshop on Analyzing and Interpreting Neural Networks for NLP, Association for Computational Linguistics, pp. 178-191, Abu Dhabi, United Arab Emirates (Hybrid), 2022.

Acoustic word embeddings (AWEs) are fixed-dimensionality vector representations in a vector space such that different acoustic exemplars of the same word are projected nearby in the embedding space. In addition to their use in speech technology applications such as spoken term discovery and keyword spotting, AWE models have been adopted as models of spoken-word processing in several cognitively motivated studies and they have shown to exhibit a human-like performance in some auditory processing tasks. Nevertheless, the representation geometry of AWEs remains an under-explored topic that has not been studied in the literature. In this paper, we take a closer analytical look at AWEs and study how the choice of the learning objective and the architecture shapes their representational profile. Our main findings highlight the prominent role of the learning objective on the representational geometry over the architecture.

@inproceedings{abdullah-klakow-2022-analyzing,
title = {Analyzing the Representational Geometry of Acoustic Word Embeddings},
author = {Badr M. Abdullah and Dietrich Klakow},
url = {https://aclanthology.org/2022.blackboxnlp-1.15},
doi = {10.18653/v1/2022.blackboxnlp-1.15},
year = {2022},
date = {2022},
booktitle = {Proceedings of the Fifth BlackboxNLP Workshop on Analyzing and Interpreting Neural Networks for NLP},
pages = {178--191},
publisher = {Association for Computational Linguistics},
address = {Abu Dhabi, United Arab Emirates (Hybrid)},
abstract = {Acoustic word embeddings (AWEs) are fixed-dimensionality vector representations in a vector space such that different acoustic exemplars of the same word are projected nearby in the embedding space. In addition to their use in speech technology applications such as spoken term discovery and keyword spotting, AWE models have been adopted as models of spoken-word processing in several cognitively motivated studies and they have shown to exhibit a human-like performance in some auditory processing tasks. Nevertheless, the representation geometry of AWEs remains an under-explored topic that has not been studied in the literature. In this paper, we take a closer analytical look at AWEs and study how the choice of the learning objective and the architecture shapes their representational profile. Our main findings highlight the prominent role of the learning objective on the representational geometry over the architecture.},
pubstate = {published},
type = {inproceedings}
}

Copy BibTeX to Clipboard

Project:   C4

Stenger, Irina; Georgis, Philip; Avgustinova, Tania; Möbius, Bernd; Klakow, Dietrich

Modeling the Impact of Syntactic Distance and Surprisal on Cross-Slavic Text Comprehension Inproceedings

Proceedings of the Language Resources and Evaluation Conference, European Language Resources Association, pp. 7368-7376, Marseille, France, 2022.

We focus on the syntactic variation and measure syntactic distances between nine Slavic languages (Belarusian, Bulgarian, Croatian, Czech, Polish, Slovak, Slovene, Russian, and Ukrainian) using symmetric measures of insertion, deletion and movement of syntactic units in the parallel sentences of the fable „The North Wind and the Sun“. Additionally, we investigate phonetic and orthographic asymmetries between selected languages by means of the information theoretical notion of surprisal. Syntactic distance and surprisal are, thus, considered as potential predictors of mutual intelligibility between related languages. In spoken and written cloze test experiments for Slavic native speakers, the presented predictors will be validated as to whether variations in syntax lead to a slower or impeded intercomprehension of Slavic texts.

@inproceedings{stenger-EtAl:2022:LREC,
title = {Modeling the Impact of Syntactic Distance and Surprisal on {Cross-Slavic} Text Comprehension},
author = {Irina Stenger and Philip Georgis and Tania Avgustinova and Bernd M{\"o}bius and Dietrich Klakow},
url = {https://aclanthology.org/2022.lrec-1.802},
year = {2022},
date = {2022-06-21},
booktitle = {Proceedings of the Language Resources and Evaluation Conference},
pages = {7368--7376},
publisher = {European Language Resources Association},
address = {Marseille, France},
abstract = {We focus on the syntactic variation and measure syntactic distances between nine Slavic languages (Belarusian, Bulgarian, Croatian, Czech, Polish, Slovak, Slovene, Russian, and Ukrainian) using symmetric measures of insertion, deletion and movement of syntactic units in the parallel sentences of the fable "The North Wind and the Sun". Additionally, we investigate phonetic and orthographic asymmetries between selected languages by means of the information theoretical notion of surprisal. Syntactic distance and surprisal are, thus, considered as potential predictors of mutual intelligibility between related languages. In spoken and written cloze test experiments for Slavic native speakers, the presented predictors will be validated as to whether variations in syntax lead to a slower or impeded intercomprehension of Slavic texts.},
pubstate = {published},
type = {inproceedings}
}

Copy BibTeX to Clipboard

Project:   C4

Kudera, Jacek; Georgis, Philip; Alam, Hasan Md Tusfiqur; Möbius, Bernd; Avgustinova, Tania; Klakow, Dietrich

Comprehension of closely related languages: A visual world eye tracking study Inproceedings

Elektronische Sprachsignalverarbeitung 2022, Tagungsband der 33. Konferenz (Sønderborg), pp. 212-219, 2022.

We present results of an eye tracking experiment which aimed at testing sentence comprehension in closely related Slavic languages. Since none of the participants were trained in translation studies or Slavic linguistics, the study illustrates effects of intercomprehension. The participants were exposed to auditory stimuli in Bulgarian, Czech, Polish, and Russian accompanied by a visual scene. The analysis of anticipatory eye movements has shown that native speakers of one Slavic language listening to sentences in another Slavic language, turn their attention to and begin fixating on the referent objects as soon as they identify a predicate. This experiment provides evidence for surprisal-based effects in intercomprehension.

@inproceedings{Kudera/etal:2022a,
title = {Comprehension of closely related languages: A visual world eye tracking study},
author = {Jacek Kudera and Philip Georgis and Hasan Md Tusfiqur Alam and Bernd M{\"o}bius and Tania Avgustinova and Dietrich Klakow},
url = {https://www.essv.de/pdf/2022_212_219.pdf?id=1161},
year = {2022},
date = {2022},
booktitle = {Elektronische Sprachsignalverarbeitung 2022, Tagungsband der 33. Konferenz (Sønderborg)},
pages = {212--219},
abstract = {We present results of an eye tracking experiment which aimed at testing sentence comprehension in closely related Slavic languages. Since none of the participants were trained in translation studies or Slavic linguistics, the study illustrates effects of intercomprehension. The participants were exposed to auditory stimuli in Bulgarian, Czech, Polish, and Russian accompanied by a visual scene. The analysis of anticipatory eye movements has shown that native speakers of one Slavic language listening to sentences in another Slavic language, turn their attention to and begin fixating on the referent objects as soon as they identify a predicate. This experiment provides evidence for surprisal-based effects in intercomprehension.},
pubstate = {published},
type = {inproceedings}
}

Copy BibTeX to Clipboard

Project:   C4

Abdullah, Badr M.; Möbius, Bernd; Klakow, Dietrich

Integrating form and meaning: A multi-task learning model for acoustic word embeddings Inproceedings

Proceedings of Interspeech 2022, pp. 1876-1880, 2022.

Models of acoustic word embeddings (AWEs) learn to map variable-length spoken word segments onto fixed-dimensionality vector representations such that different acoustic exemplars of the same word are projected nearby in the embedding space. In addition to their speech technology applications, AWE models have been shown to predict human performance on a variety of auditory lexical processing tasks. Current AWE models are based on neural networks and trained in a bottom-up approach that integrates acoustic cues to build up a word representation given an acoustic or symbolic supervision signal. Therefore, these models do not leverage or capture high-level lexical knowledge during the learning process. In this paper, we propose a multi-task learning model that incorporates top-down lexical knowledge into the training procedure of AWEs. Our model learns a mapping between the acoustic input and a lexical representation that encodes high-level information such as word semantics in addition to bottom-up form-based supervision. We experiment with three languages and demonstrate that incorporating lexical knowledge improves the embedding space discriminability and encourages the model to better separate lexical categories.

@inproceedings{Abdullah/etal:2022a,
title = {Integrating form and meaning: A multi-task learning model for acoustic word embeddings},
author = {Badr M. Abdullah and Bernd M{\"o}bius and Dietrich Klakow},
url = {https://www.isca-speech.org/archive/interspeech_2022/abdullah22_interspeech.html},
doi = {10.21437/Interspeech.2022-626},
year = {2022},
date = {2022},
booktitle = {Proceedings of Interspeech 2022},
pages = {1876--1880},
abstract = {Models of acoustic word embeddings (AWEs) learn to map variable-length spoken word segments onto fixed-dimensionality vector representations such that different acoustic exemplars of the same word are projected nearby in the embedding space. In addition to their speech technology applications, AWE models have been shown to predict human performance on a variety of auditory lexical processing tasks. Current AWE models are based on neural networks and trained in a bottom-up approach that integrates acoustic cues to build up a word representation given an acoustic or symbolic supervision signal. Therefore, these models do not leverage or capture high-level lexical knowledge during the learning process. In this paper, we propose a multi-task learning model that incorporates top-down lexical knowledge into the training procedure of AWEs. Our model learns a mapping between the acoustic input and a lexical representation that encodes high-level information such as word semantics in addition to bottom-up form-based supervision. We experiment with three languages and demonstrate that incorporating lexical knowledge improves the embedding space discriminability and encourages the model to better separate lexical categories.},
pubstate = {published},
type = {inproceedings}
}

Copy BibTeX to Clipboard

Project:   C4

Zaitova, Iuliia; Abdullah, Badr M.; Klakow, Dietrich

Mapping Phonology to Semantics: A Computational Model of Cross-Lingual Spoken-Word Recognition Inproceedings

Proceedings of the Ninth Workshop on NLP for Similar Languages, Varieties and Dialects (October 2022, Gyeongju, Republic of Korea), Association for Computational Linguistics, pp. 54-63, 2022.

Closely related languages are often mutually intelligible to various degrees. Therefore, speakers of closely related languages are usually capable of (partially) comprehending each other’s speech without explicitly learning the target, second language. The cross-linguistic intelligibility among closely related languages is mainly driven by linguistic factors such as lexical similarities. This paper presents a computational model of spoken-word recognition and investigates its ability to recognize word forms from different languages than its native, training language. Our model is based on a recurrent neural network that learns to map a word’s phonological sequence onto a semantic representation of the word. Furthermore, we present a case study on the related Slavic languages and demonstrate that the cross-lingual performance of our model not only predicts mutual intelligibility to a large extent but also reflects the genetic classification of the languages in our study.

@inproceedings{zaitova-etal-2022-mapping,
title = {Mapping Phonology to Semantics: A Computational Model of Cross-Lingual Spoken-Word Recognition},
author = {Iuliia Zaitova and Badr M. Abdullah and Dietrich Klakow},
url = {https://aclanthology.org/2022.vardial-1.6/},
year = {2022},
date = {2022},
booktitle = {Proceedings of the Ninth Workshop on NLP for Similar Languages, Varieties and Dialects (October 2022, Gyeongju, Republic of Korea)},
pages = {54--63},
publisher = {Association for Computational Linguistics},
abstract = {Closely related languages are often mutually intelligible to various degrees. Therefore, speakers of closely related languages are usually capable of (partially) comprehending each other’s speech without explicitly learning the target, second language. The cross-linguistic intelligibility among closely related languages is mainly driven by linguistic factors such as lexical similarities. This paper presents a computational model of spoken-word recognition and investigates its ability to recognize word forms from different languages than its native, training language. Our model is based on a recurrent neural network that learns to map a word’s phonological sequence onto a semantic representation of the word. Furthermore, we present a case study on the related Slavic languages and demonstrate that the cross-lingual performance of our model not only predicts mutual intelligibility to a large extent but also reflects the genetic classification of the languages in our study.},
pubstate = {published},
type = {inproceedings}
}

Copy BibTeX to Clipboard

Project:   C4

Mosbach, Marius; Stenger, Irina; Avgustinova, Tania; Möbius, Bernd; Klakow, Dietrich

incom.py 2.0 - Calculating Linguistic Distances and Asymmetries in Auditory Perception of Closely Related Languages Inproceedings

Proceedings of the International Conference on Recent Advances in Natural Language Processing (RANLP 2021), INCOMA Ltd., pp. 968-977, Held Online, 2021.

We present an extended version of a tool developed for calculating linguistic distances and asymmetries in auditory perception of closely related languages. Along with evaluating the metrics available in the initial version of the tool, we introduce word adaptation entropy as an additional metric of linguistic asymmetry. Potential predictors of speech intelligibility are validated with human performance in spoken cognate recognition experiments for Bulgarian and Russian. Special attention is paid to the possibly different contributions of vowels and consonants in oral intercomprehension. Using incom.py 2.0 it is possible to calculate, visualize, and validate three measurement methods of linguistic distances and asymmetries as well as carrying out regression analyses in speech intelligibility between related languages.

@inproceedings{mosbach-etal-2021-incom,
title = {incom.py 2.0 - Calculating Linguistic Distances and Asymmetries in Auditory Perception of Closely Related Languages},
author = {Marius Mosbach and Irina Stenger and Tania Avgustinova and Bernd M{\"o}bius and Dietrich Klakow},
url = {https://aclanthology.org/2021.ranlp-1.110/},
year = {2021},
date = {2021-09-01},
booktitle = {Proceedings of the International Conference on Recent Advances in Natural Language Processing (RANLP 2021)},
pages = {968--977},
publisher = {INCOMA Ltd.},
address = {Held Online},
abstract = {We present an extended version of a tool developed for calculating linguistic distances and asymmetries in auditory perception of closely related languages. Along with evaluating the metrics available in the initial version of the tool, we introduce word adaptation entropy as an additional metric of linguistic asymmetry. Potential predictors of speech intelligibility are validated with human performance in spoken cognate recognition experiments for Bulgarian and Russian. Special attention is paid to the possibly different contributions of vowels and consonants in oral intercomprehension. Using incom.py 2.0 it is possible to calculate, visualize, and validate three measurement methods of linguistic distances and asymmetries as well as carrying out regression analyses in speech intelligibility between related languages.},
pubstate = {published},
type = {inproceedings}
}

Copy BibTeX to Clipboard

Projects:   B4 C4

Stenger, Irina; Avgustinova, Tania

On Slavic cognate recognition in context Inproceedings

Selegej, Vladimir P. et al. (Ed.): Computational Linguistics and Intellectual Technologies: Papers from the Annual International Conference ‘Dialogue’, pp. 660-668, Moscow, Russia, 2021.

This study contributes to a better understanding of reading intercomprehension as manifested in the intelligibility of East and South Slavic languages to Russian native speakers in contextualized cognate recognition experiments using Belarusian, Ukrainian, and Bulgarian stimuli. While the results mostly confirm the expected mutual intelligibility effects, we also register apparent processing difficulties in some of the cases. In search of an explanation, we examine the correlation of the experimentally obtained intercomprehension scores with various linguistic factors, which contribute to cognate intelligibility in a context, considering common predictors of intercomprehension associated with (i) morphology and orthography, (ii) lexis, and (iii) syntax.

@inproceedings{Stenger-dialog2021,
title = {On {Slavic} cognate recognition in context},
author = {Irina Stenger and Tania Avgustinova},
editor = {Vladimir P. Selegej and others},
url = {https://www.dialog-21.ru/media/5547/stengeriplusavgustinovat027.pdf},
year = {2021},
date = {2021},
booktitle = {Computational Linguistics and Intellectual Technologies: Papers from the Annual International Conference `Dialogue'},
pages = {660--668},
address = {Moscow, Russia},
abstract = {This study contributes to a better understanding of reading intercomprehension as manifested in the intelligibility of East and South Slavic languages to Russian native speakers in contextualized cognate recognition experiments using Belarusian, Ukrainian, and Bulgarian stimuli. While the results mostly confirm the expected mutual intelligibility effects, we also register apparent processing difficulties in some of the cases. In search of an explanation, we examine the correlation of the experimentally obtained intercomprehension scores with various linguistic factors, which contribute to cognate intelligibility in a context, considering common predictors of intercomprehension associated with (i) morphology and orthography, (ii) lexis, and (iii) syntax.},
pubstate = {published},
type = {inproceedings}
}

Copy BibTeX to Clipboard

Project:   C4

Stenger, Irina; Avgustinova, Tania

Multilingual learnability and reaction time in online Slavic intercomprehension experiments Inproceedings

Koeva, Svetla; Stamenov, Maksim (Ed.): Proceedings of the International Annual Conference of the Institute for Bulgarian Language, 2, Marin Drinov Academic Publishers, pp. 191-200, Sofia, Bulgaria, 2021.

Receptive multilingualism is a multidimensional and multifactorial phenomenon that crucially depends on the mutual intelligibility of closely related languages. As a strategy, it predominantly capitalizes upon a dynamic integration of linguistic, communicative, contextual, and socio-cognitive aspects. Relevant linguistic determinants (especially linguistic distances) along with recognizable extra-linguistic influences (such as attitude and exposure) have recently enjoyed increased attention in the research community. In our online (web-based) intercomprehension experiments, we have observed learning effects that appear to be empirically associated with individual cognitive skills. For this study, we tested 185 Russian subjects in a written word recognition task which essentially involved cognate guessing in Belarusian, Bulgarian, Macedonian, Serbian, and Ukrainian. The subjects had to translate the stimuli presented online into their native language, i.e. Russian. To reveal implicit multilingual learnability, we correlate the obtained intercomprehension scores with the detected reaction times, taking into consideration the potential influence of the experiment rank on the reaction time too.

@inproceedings{Stenger-CONFIBL2021,
title = {Multilingual learnability and reaction time in online {Slavic} intercomprehension experiments},
author = {Irina Stenger and Tania Avgustinova},
editor = {Svetla Koeva and Maksim Stamenov},
url = {https://ibl.bas.bg/wp-content/uploads/2021/06/Sbornik_s_dokladi_CONFIBL2021_tom_2_FINAL.pdf},
year = {2021},
date = {2021},
booktitle = {Proceedings of the International Annual Conference of the Institute for Bulgarian Language},
pages = {191--200},
publisher = {Marin Drinov Academic Publishers},
address = {Sofia, Bulgaria},
abstract = {Receptive multilingualism is a multidimensional and multifactorial phenomenon that crucially depends on the mutual intelligibility of closely related languages. As a strategy, it predominantly capitalizes upon a dynamic integration of linguistic, communicative, contextual, and socio-cognitive aspects. Relevant linguistic determinants (especially linguistic distances) along with recognizable extra-linguistic influences (such as attitude and exposure) have recently enjoyed increased attention in the research community. In our online (web-based) intercomprehension experiments, we have observed learning effects that appear to be empirically associated with individual cognitive skills. For this study, we tested 185 Russian subjects in a written word recognition task which essentially involved cognate guessing in Belarusian, Bulgarian, Macedonian, Serbian, and Ukrainian. The subjects had to translate the stimuli presented online into their native language, i.e. Russian. To reveal implicit multilingual learnability, we correlate the obtained intercomprehension scores with the detected reaction times, taking into consideration the potential influence of the experiment rank on the reaction time too.},
pubstate = {published},
type = {inproceedings}
}

Copy BibTeX to Clipboard

Project:   C4

Abdullah, Badr M.; Mosbach, Marius; Zaitova, Iuliia; Möbius, Bernd; Klakow, Dietrich

Do Acoustic Word Embeddings Capture Phonological Similarity? An Empirical Study Inproceedings

Proceedings of Interspeech 2021, 2021.

Several variants of deep neural networks have been successfully employed for building parametric models that project variable-duration spoken word segments onto fixed-size vector representations, or acoustic word embeddings (AWEs). However, it remains unclear to what degree we can rely on the distance in the emerging AWE space as an estimate of word-form similarity. In this paper, we ask: does the distance in the acoustic embedding space correlate with phonological dissimilarity? To answer this question, we empirically investigate the performance of supervised approaches for AWEs with different neural architectures and learning objectives. We train AWE models in controlled settings for two languages (German and Czech) and evaluate the embeddings on two tasks: word discrimination and phonological similarity. Our experiments show that (1) the distance in the embedding space in the best cases only moderately correlates with phonological distance, and (2) improving the performance on the word discrimination task does not necessarily yield models that better reflect word phonological similarity. Our findings highlight the necessity to rethink the current intrinsic evaluations for AWEs.

@inproceedings{Abdullah2021DoAW,
title = {Do Acoustic Word Embeddings Capture Phonological Similarity? An Empirical Study},
author = {Badr M. Abdullah and Marius Mosbach and Iuliia Zaitova and Bernd M{\"o}bius and Dietrich Klakow},
url = {https://arxiv.org/abs/2106.08686},
year = {2021},
date = {2021},
booktitle = {Proceedings of Interspeech 2021},
abstract = {Several variants of deep neural networks have been successfully employed for building parametric models that project variable-duration spoken word segments onto fixed-size vector representations, or acoustic word embeddings (AWEs). However, it remains unclear to what degree we can rely on the distance in the emerging AWE space as an estimate of word-form similarity. In this paper, we ask: does the distance in the acoustic embedding space correlate with phonological dissimilarity? To answer this question, we empirically investigate the performance of supervised approaches for AWEs with different neural architectures and learning objectives. We train AWE models in controlled settings for two languages (German and Czech) and evaluate the embeddings on two tasks: word discrimination and phonological similarity. Our experiments show that (1) the distance in the embedding space in the best cases only moderately correlates with phonological distance, and (2) improving the performance on the word discrimination task does not necessarily yield models that better reflect word phonological similarity. Our findings highlight the necessity to rethink the current intrinsic evaluations for AWEs.},
pubstate = {published},
type = {inproceedings}
}

Copy BibTeX to Clipboard

Projects:   C4 B4

Mayn, Alexandra; Abdullah, Badr M.; Klakow, Dietrich

Familiar words but strange voices: Modelling the influence of speech variability on word recognition Inproceedings

Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Student Research Workshop, Association for Computational Linguistics, pp. 96-102, Online, 2021.

We present a deep neural model of spoken word recognition which is trained to retrieve the meaning of a word (in the form of a word embedding) given its spoken form, a task which resembles that faced by a human listener. Furthermore, we investigate the influence of variability in speech signals on the model’s performance. To this end, we conduct a set of controlled experiments using word-aligned read speech data in German. Our experiments show that (1) the model is more sensitive to dialectical variation than gender variation, and (2) recognition performance of word cognates from related languages reflect the degree of relatedness between languages in our study. Our work highlights the feasibility of modeling human speech perception using deep neural networks.

@inproceedings{mayn-etal-2021-familiar,
title = {Familiar words but strange voices: Modelling the influence of speech variability on word recognition},
author = {Alexandra Mayn and Badr M. Abdullah and Dietrich Klakow},
url = {https://aclanthology.org/2021.eacl-srw.14},
year = {2021},
date = {2021},
booktitle = {Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Student Research Workshop},
pages = {96--102},
publisher = {Association for Computational Linguistics},
address = {Online},
abstract = {We present a deep neural model of spoken word recognition which is trained to retrieve the meaning of a word (in the form of a word embedding) given its spoken form, a task which resembles that faced by a human listener. Furthermore, we investigate the influence of variability in speech signals on the model’s performance. To this end, we conduct a set of controlled experiments using word-aligned read speech data in German. Our experiments show that (1) the model is more sensitive to dialectical variation than gender variation, and (2) recognition performance of word cognates from related languages reflect the degree of relatedness between languages in our study. Our work highlights the feasibility of modeling human speech perception using deep neural networks.},
pubstate = {published},
type = {inproceedings}
}

Copy BibTeX to Clipboard

Project:   C4

Macher, Nicole; Abdullah, Badr M.; Brouwer, Harm; Klakow, Dietrich

Do we read what we hear? Modeling orthographic influences on spoken word recognition Inproceedings

Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Student Research Workshop, Association for Computational Linguistics, pp. 16-22, Online, 2021.

Theories and models of spoken word recognition aim to explain the process of accessing lexical knowledge given an acoustic realization of a word form. There is consensus that phonological and semantic information is crucial for this process. However, there is accumulating evidence that orthographic information could also have an impact on auditory word recognition. This paper presents two models of spoken word recognition that instantiate different hypotheses regarding the influence of orthography on this process. We show that these models reproduce human-like behavior in different ways and provide testable hypotheses for future research on the source of orthographic effects in spoken word recognition.

@inproceedings{macher-etal-2021-read,
title = {Do we read what we hear? Modeling orthographic influences on spoken word recognition},
author = {Nicole Macher and Badr M. Abdullah and Harm Brouwer and Dietrich Klakow},
url = {https://aclanthology.org/2021.eacl-srw.3},
year = {2021},
date = {2021},
booktitle = {Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Student Research Workshop},
pages = {16--22},
publisher = {Association for Computational Linguistics},
address = {Online},
abstract = {Theories and models of spoken word recognition aim to explain the process of accessing lexical knowledge given an acoustic realization of a word form. There is consensus that phonological and semantic information is crucial for this process. However, there is accumulating evidence that orthographic information could also have an impact on auditory word recognition. This paper presents two models of spoken word recognition that instantiate different hypotheses regarding the influence of orthography on this process. We show that these models reproduce human-like behavior in different ways and provide testable hypotheses for future research on the source of orthographic effects in spoken word recognition.},
pubstate = {published},
type = {inproceedings}
}

Copy BibTeX to Clipboard

Projects:   A1 C4

Jágrová, Klára; Hedderich, Michael; Mosbach, Marius; Avgustinova, Tania; Klakow, Dietrich

On the Correlation of Context-Aware Language Models With the Intelligibility of Polish Target Words to Czech Readers Journal Article

Frontiers in Psychology, 12, pp. 2296, 2021, ISSN 1664-1078.

This contribution seeks to provide a rational probabilistic explanation for the intelligibility of words in a genetically related language that is unknown to the reader, a phenomenon referred to as intercomprehension. In this research domain, linguistic distance, among other factors, was proved to correlate well with the mutual intelligibility of individual words. However, the role of context for the intelligibility of target words in sentences was subject to very few studies. To address this, we analyze data from web-based experiments in which Czech (CS) respondents were asked to translate highly predictable target words at the final position of Polish sentences. We compare correlations of target word intelligibility with data from 3-g language models (LMs) to their correlations with data obtained from context-aware LMs. More specifically, we evaluate two context-aware LM architectures: Long Short-Term Memory (LSTMs) that can, theoretically, take infinitely long-distance dependencies into account and Transformer-based LMs which can access the whole input sequence at the same time. We investigate how their use of context affects surprisal and its correlation with intelligibility.

@article{10.3389/fpsyg.2021.662277,
title = {On the Correlation of Context-Aware Language Models With the Intelligibility of {Polish} Target Words to {Czech} Readers},
author = {Kl{\'a}ra J{\'a}grov{\'a} and Michael Hedderich and Marius Mosbach and Tania Avgustinova and Dietrich Klakow},
url = {https://www.frontiersin.org/articles/10.3389/fpsyg.2021.662277/full},
doi = {10.3389/fpsyg.2021.662277},
issn = {1664-1078},
year = {2021},
date = {2021},
journal = {Frontiers in Psychology},
pages = {2296},
volume = {12},
abstract = {This contribution seeks to provide a rational probabilistic explanation for the intelligibility of words in a genetically related language that is unknown to the reader, a phenomenon referred to as intercomprehension. In this research domain, linguistic distance, among other factors, was proved to correlate well with the mutual intelligibility of individual words. However, the role of context for the intelligibility of target words in sentences was subject to very few studies. To address this, we analyze data from web-based experiments in which Czech (CS) respondents were asked to translate highly predictable target words at the final position of Polish sentences. We compare correlations of target word intelligibility with data from 3-g language models (LMs) to their correlations with data obtained from context-aware LMs. More specifically, we evaluate two context-aware LM architectures: Long Short-Term Memory (LSTMs) that can, theoretically, take infinitely long-distance dependencies into account and Transformer-based LMs which can access the whole input sequence at the same time. We investigate how their use of context affects surprisal and its correlation with intelligibility.},
pubstate = {published},
type = {article}
}

Copy BibTeX to Clipboard

Projects:   B4 C4

Kudera, Jacek; Tavi, Lauri; Möbius, Bernd; Avgustinova, Tania; Klakow, Dietrich

The effect of surprisal on articulatory gestures in Polish consonant-to-vowel transitions: A pilot EMA study Inproceedings

14. ITG-Konferenz, ITG-Fachbericht 298: Speech Communication, pp. 179-183, Kiel, Germany, 2021, ISBN 978-3-8007-5627-8.

This study is concerned with the relation between the information-theoretic notion of surprisal and articulatory gesture in Polish consonant-to-vowel transitions. It addresses the question of the influence of diphone predictability on spectral trajectories and articulatory gestures by relating the effect of surprisal with motor fluency. The study combines the computation of locus equations (LE) with kinematic data obtained from electromagnetic articulograph (EMA). The kinematic and acoustic data showed that a small coarticulation effect was present in the high- and low-surprisal clusters. Regardless of some small discrepancies across the measures, a high degree of overlap of adjacent segments is reported for the mid-surprisal group in both domains. Two explanations of the observed effect are proposed. The first refers to low-surprisal coarticulation resistance and suggests the need to disambiguate predictable sequences. The second, observed in high surprisal clusters, refers to the prominence given to emphasize the unexpected concatenation.

@inproceedings{Kudera/etal:2021c,
title = {The effect of surprisal on articulatory gestures in {Polish} consonant-to-vowel transitions: A pilot {EMA} study},
author = {Jacek Kudera and Lauri Tavi and Bernd M{\"o}bius and Tania Avgustinova and Dietrich Klakow},
url = {https://ieeexplore.ieee.org/document/9657527},
year = {2021},
date = {2021},
booktitle = {14. ITG-Konferenz, ITG-Fachbericht 298: Speech Communication},
isbn = {978-3-8007-5627-8},
pages = {179--183},
address = {Kiel, Germany},
abstract = {This study is concerned with the relation between the information-theoretic notion of surprisal and articulatory gesture in Polish consonant-to-vowel transitions. It addresses the question of the influence of diphone predictability on spectral trajectories and articulatory gestures by relating the effect of surprisal with motor fluency. The study combines the computation of locus equations (LE) with kinematic data obtained from electromagnetic articulograph (EMA). The kinematic and acoustic data showed that a small coarticulation effect was present in the high- and low-surprisal clusters. Regardless of some small discrepancies across the measures, a high degree of overlap of adjacent segments is reported for the mid-surprisal group in both domains. Two explanations of the observed effect are proposed. The first refers to low-surprisal coarticulation resistance and suggests the need to disambiguate predictable sequences. The second, observed in high surprisal clusters, refers to the prominence given to emphasize the unexpected concatenation.},
pubstate = {published},
type = {inproceedings}
}

Copy BibTeX to Clipboard

Project:   C4

Successfully