% Interspeech 2024 conference paper (ISCA archive record).
% Conventions applied: the doi field holds the bare DOI (styles add the
% resolver prefix themselves), page ranges use a double hyphen, and author
% names use the unambiguous "Last, First" form.
% NOTE(review): both year and date are present. biblatex prefers date and
% classic BibTeX reads only year, so both are kept. The date value looks
% like an export/record date rather than the conference date; confirm.
@inproceedings{abdullah24_interspeech,
  author    = {Abdullah, Badr M. and Shaik, Mohammed Maqsood and Klakow, Dietrich},
  title     = {Wave to Interlingua: Analyzing Representations of Multilingual Speech Transformers for Spoken Language Translation},
  booktitle = {Interspeech 2024},
  pages     = {362--366},
  year      = {2024},
  date      = {2024-12-11},
  issn      = {2958-1796},
  doi       = {10.21437/Interspeech.2024-2109},
  url       = {https://www.isca-archive.org/interspeech_2024/abdullah24_interspeech.html},
  abstract  = {In Transformer-based Speech-to-Text (S2T) translation, an encoder-decoder model is trained end-to-end to take as input an untranscribed acoustic signal in the source language and directly generate a text translation in the target language. S2T translation models can also be trained in multilingual setups where a single front-end speech encoder is shared across multiple languages. A lingering question, however, is whether the encoder represents spoken utterances in a language-neutral space. In this paper, we present an interpretability study of encoder representations in a multilingual speech translation Transformer via various probing tasks. Our main findings show that while encoder representations are not entirely language-neutral, there exists a semantic subspace that is shared across different languages. Furthermore, we discuss our findings and the implication of our study on cross-lingual learning for spoken language understanding tasks.},
  pubstate  = {published},
  type      = {inproceedings},
}