% Interspeech 2024 conference paper by Abdullah, Shaik and Klakow.
% The DOI is stored bare (no resolver prefix) and the page range uses `--`,
% so that bibliography styles can format both themselves.
% NOTE(review): both `year` and `date` are set; both are kept because it is
% unclear which one downstream tools read -- confirm and drop one.
@inproceedings{abdullah24_interspeech,
  author    = {Abdullah, Badr M. and Shaik, Mohammed Maqsood and Klakow, Dietrich},
  title     = {Wave to Interlingua: Analyzing Representations of Multilingual Speech Transformers for Spoken Language Translation},
  booktitle = {Interspeech 2024},
  pages     = {362--366},
  year      = {2024},
  date      = {2024-12-11},
  doi       = {10.21437/Interspeech.2024-2109},
  url       = {https://www.isca-archive.org/interspeech_2024/abdullah24_interspeech.html},
  issn      = {2958-1796},
  abstract  = {
In Transformer-based Speech-to-Text (S2T) translation, an encoder-decoder model is trained end-to-end to take as input an untranscribed acoustic signal in the source language and directly generate a text translation in the target language. S2T translation models can also be trained in multilingual setups where a single front-end speech encoder is shared across multiple languages. A lingering question, however, is whether the encoder represents spoken utterances in a language-neutral space. In this paper, we present an interpretability study of encoder representations in a multilingual speech translation Transformer via various probing tasks. Our main findings show that while encoder representations are not entirely language-neutral, there exists a semantic subspace that is shared across different languages. Furthermore, we discuss our findings and the implication of our study on cross-lingual learning for spoken language understanding tasks.
},
  pubstate  = {published},
  type      = {inproceedings}
}