Publications

Kunilovskaya, Maria; Dutta Chowdhury, Koel; Przybyl, Heike; España-Bonet, Cristina; van Genabith, Josef

Mitigating Translationese with GPT-4: Strategies and Performance Inproceedings

Proceedings of the 25th Annual Conference of the European Association for Machine Translation, 1, European Association for Machine Translation, pp. 411–430, 2024.

Translations differ in systematic ways from texts originally authored in the same language. These differences, collectively known as translationese, can pose challenges in cross-lingual natural language processing: models trained or tested on translated input might struggle when presented with non-translated language. Translationese mitigation can alleviate this problem. This study investigates the generative capacities of GPT-4 to reduce translationese in human-translated texts. The task is framed as a rewriting process aimed at modified translations indistinguishable from the original text in the target language. Our focus is on prompt engineering that tests the utility of linguistic knowledge as part of the instruction for GPT-4. Through a series of prompt design experiments, we show that GPT4-generated revisions are more similar to originals in the target language when the prompts incorporate specific linguistic instructions instead of relying solely on the model's internal knowledge. Furthermore, we release the segment-aligned bidirectional German–English data built from the Europarl corpus that underpins this study.

@inproceedings{kunilovskaya-etal-2024-mitigating,
  title     = {Mitigating Translationese with {GPT-4}: Strategies and Performance},
  author    = {Kunilovskaya, Maria and Dutta Chowdhury, Koel and Przybyl, Heike and Espa{\~n}a-Bonet, Cristina and van Genabith, Josef},
  url       = {https://eamt2024.github.io/proceedings/vol1.pdf},
  year      = {2024},
  booktitle = {Proceedings of the 25th Annual Conference of the European Association for Machine Translation},
  pages     = {411--430},
  publisher = {European Association for Machine Translation},
  abstract  = {Translations differ in systematic ways from texts originally authored in the same language. These differences, collectively known as translationese, can pose challenges in cross-lingual natural language processing: models trained or tested on translated input might struggle when presented with non-translated language. Translationese mitigation can alleviate this problem. This study investigates the generative capacities of GPT-4 to reduce translationese in human-translated texts. The task is framed as a rewriting process aimed at modified translations indistinguishable from the original text in the target language. Our focus is on prompt engineering that tests the utility of linguistic knowledge as part of the instruction for GPT-4. Through a series of prompt design experiments, we show that GPT4-generated revisions are more similar to originals in the target language when the prompts incorporate specific linguistic instructions instead of relying solely on the model's internal knowledge. Furthermore, we release the segment-aligned bidirectional German--English data built from the Europarl corpus that underpins this study.},
  pubstate  = {published},
}

Copy BibTeX to Clipboard

Projects:   B6 B7

Bafna, Niyati; España-Bonet, Cristina; van Genabith, Josef; Sagot, Benoît; Bawden, Rachel

When Your Cousin Has the Right Connections: Unsupervised Bilingual Lexicon Induction for Related Data-Imbalanced Languages Inproceedings

Calzolari, Nicoletta; Kan, Min-Yen; Hoste, Veronique; Lenci, Alessandro; Sakti, Sakriani; Xue, Nianwen (Ed.): Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024), ELRA and ICCL, pp. 17544-17556, Torino, Italia, 2024.

Most existing approaches for unsupervised bilingual lexicon induction (BLI) depend on good quality static or contextual embeddings requiring large monolingual corpora for both languages. However, unsupervised BLI is most likely to be useful for low-resource languages (LRLs), where large datasets are not available. Often we are interested in building bilingual resources for LRLs against related high-resource languages (HRLs), resulting in severely imbalanced data settings for BLI. We first show that state-of-the-art BLI methods in the literature exhibit near-zero performance for severely data-imbalanced language pairs, indicating that these settings require more robust techniques. We then present a new method for unsupervised BLI between a related LRL and HRL that only requires inference on a masked language model of the HRL, and demonstrate its effectiveness on truly low-resource languages Bhojpuri and Magahi (with <5M monolingual tokens each), against Hindi. We further present experiments on (mid-resource) Marathi and Nepali to compare approach performances by resource range, and release our resulting lexicons for five low-resource Indic languages: Bhojpuri, Magahi, Awadhi, Braj, and Maithili, against Hindi.

@inproceedings{bafna-etal-2024-cousin-right,
  title     = {When Your Cousin Has the Right Connections: Unsupervised Bilingual Lexicon Induction for Related Data-Imbalanced Languages},
  author    = {Bafna, Niyati and Espa{\~n}a-Bonet, Cristina and van Genabith, Josef and Sagot, Beno{\^\i}t and Bawden, Rachel},
  editor    = {Calzolari, Nicoletta and Kan, Min-Yen and Hoste, Veronique and Lenci, Alessandro and Sakti, Sakriani and Xue, Nianwen},
  url       = {https://aclanthology.org/2024.lrec-main.1526},
  year      = {2024},
  booktitle = {Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation ({LREC-COLING} 2024)},
  pages     = {17544--17556},
  publisher = {ELRA and ICCL},
  address   = {Torino, Italia},
  abstract  = {Most existing approaches for unsupervised bilingual lexicon induction (BLI) depend on good quality static or contextual embeddings requiring large monolingual corpora for both languages. However, unsupervised BLI is most likely to be useful for low-resource languages (LRLs), where large datasets are not available. Often we are interested in building bilingual resources for LRLs against related high-resource languages (HRLs), resulting in severely imbalanced data settings for BLI. We first show that state-of-the-art BLI methods in the literature exhibit near-zero performance for severely data-imbalanced language pairs, indicating that these settings require more robust techniques. We then present a new method for unsupervised BLI between a related LRL and HRL that only requires inference on a masked language model of the HRL, and demonstrate its effectiveness on truly low-resource languages Bhojpuri and Magahi (with $<$5M monolingual tokens each), against Hindi. We further present experiments on (mid-resource) Marathi and Nepali to compare approach performances by resource range, and release our resulting lexicons for five low-resource Indic languages: Bhojpuri, Magahi, Awadhi, Braj, and Maithili, against Hindi.},
  pubstate  = {published},
}

Copy BibTeX to Clipboard

Project:   B6

Jalota, Rricha; Dutta Chowdhury, Koel; España-Bonet, Cristina; van Genabith, Josef

Translating away Translationese without Parallel Data Inproceedings

Bouamor, Houda; Pino, Juan; Bali, Kalika (Ed.): Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing, Association for Computational Linguistics, pp. 7086-7100, Singapore, 2023.

Translated texts exhibit systematic linguistic differences compared to original texts in the same language, and these differences are referred to as translationese. Translationese has effects on various cross-lingual natural language processing tasks, potentially leading to biased results. In this paper, we explore a novel approach to reduce translationese in translated texts: translation-based style transfer. As there are no parallel human-translated and original data in the same language, we use a self-supervised approach that can learn from comparable (rather than parallel) mono-lingual original and translated data. However, even this self-supervised approach requires some parallel data for validation. We show how we can eliminate the need for parallel validation data by combining the self-supervised loss with an unsupervised loss. This unsupervised loss leverages the original language model loss over the style-transferred output and a semantic similarity loss between the input and style-transferred output. We evaluate our approach in terms of original vs. translationese binary classification in addition to measuring content preservation and target-style fluency. The results show that our approach is able to reduce translationese classifier accuracy to a level of a random classifier after style transfer while adequately preserving the content and fluency in the target original style.

@inproceedings{jalota2023translating,
  title     = {Translating away Translationese without Parallel Data},
  author    = {Jalota, Rricha and Dutta Chowdhury, Koel and Espa{\~n}a-Bonet, Cristina and van Genabith, Josef},
  editor    = {Bouamor, Houda and Pino, Juan and Bali, Kalika},
  url       = {https://aclanthology.org/2023.emnlp-main.438/},
  doi       = {10.18653/v1/2023.emnlp-main.438},
  year      = {2023},
  booktitle = {Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing},
  pages     = {7086--7100},
  publisher = {Association for Computational Linguistics},
  address   = {Singapore},
  abstract  = {Translated texts exhibit systematic linguistic differences compared to original texts in the same language, and these differences are referred to as translationese. Translationese has effects on various cross-lingual natural language processing tasks, potentially leading to biased results. In this paper, we explore a novel approach to reduce translationese in translated texts: translation-based style transfer. As there are no parallel human-translated and original data in the same language, we use a self-supervised approach that can learn from comparable (rather than parallel) mono-lingual original and translated data. However, even this self-supervised approach requires some parallel data for validation. We show how we can eliminate the need for parallel validation data by combining the self-supervised loss with an unsupervised loss. This unsupervised loss leverages the original language model loss over the style-transferred output and a semantic similarity loss between the input and style-transferred output. We evaluate our approach in terms of original vs. translationese binary classification in addition to measuring content preservation and target-style fluency. The results show that our approach is able to reduce translationese classifier accuracy to a level of a random classifier after style transfer while adequately preserving the content and fluency in the target original style.},
  pubstate  = {published},
}

Copy BibTeX to Clipboard

Project:   B6

Borah, Angana; Pylypenko, Daria; España-Bonet, Cristina; van Genabith, Josef

Measuring Spurious Correlation in Classification: "Clever Hans" in Translationese Inproceedings

Mitkov, Ruslan; Angelova, Galia (Ed.): Proceedings of the 14th International Conference on Recent Advances in Natural Language Processing, INCOMA Ltd., Shoumen, Bulgaria, pp. 196-206, Varna, Bulgaria, 2023.
Recent work has shown evidence of "Clever Hans" behavior in high-performance neural translationese classifiers, where BERT-based classifiers capitalize on spurious correlations, in particular topic information, between data and target classification labels, rather than genuine translationese signals. Translationese signals are subtle (especially for professional translation) and compete with many other signals in the data such as genre, style, author, and, in particular, topic. This raises the general question of how much of the performance of a classifier is really due to spurious correlations in the data versus the signals actually targeted for by the classifier, especially for subtle target signals and in challenging (low resource) data settings. We focus on topic-based spurious correlation and approach the question from two directions: (i) where we have no knowledge about spurious topic information and its distribution in the data, (ii) where we have some indication about the nature of spurious topic correlations. For (i) we develop a measure from first principles capturing alignment of unsupervised topics with target classification labels as an indication of spurious topic information in the data. We show that our measure is the same as purity in clustering and propose a "topic floor" (as in a "noise floor") for classification. For (ii) we investigate masking of known spurious topic carriers in classification. Both (i) and (ii) contribute to quantifying and (ii) to mitigating spurious correlations.

@inproceedings{borah-etal-2023-measuring,
  title     = {Measuring Spurious Correlation in Classification: ``{Clever Hans}'' in Translationese},
  author    = {Borah, Angana and Pylypenko, Daria and Espa{\~n}a-Bonet, Cristina and van Genabith, Josef},
  editor    = {Mitkov, Ruslan and Angelova, Galia},
  url       = {https://aclanthology.org/2023.ranlp-1.22},
  year      = {2023},
  booktitle = {Proceedings of the 14th International Conference on Recent Advances in Natural Language Processing},
  pages     = {196--206},
  publisher = {INCOMA Ltd., Shoumen, Bulgaria},
  address   = {Varna, Bulgaria},
  abstract  = {Recent work has shown evidence of ``Clever Hans'' behavior in high-performance neural translationese classifiers, where BERT-based classifiers capitalize on spurious correlations, in particular topic information, between data and target classification labels, rather than genuine translationese signals. Translationese signals are subtle (especially for professional translation) and compete with many other signals in the data such as genre, style, author, and, in particular, topic. This raises the general question of how much of the performance of a classifier is really due to spurious correlations in the data versus the signals actually targeted for by the classifier, especially for subtle target signals and in challenging (low resource) data settings. We focus on topic-based spurious correlation and approach the question from two directions: (i) where we have no knowledge about spurious topic information and its distribution in the data, (ii) where we have some indication about the nature of spurious topic correlations. For (i) we develop a measure from first principles capturing alignment of unsupervised topics with target classification labels as an indication of spurious topic information in the data. We show that our measure is the same as purity in clustering and propose a ``topic floor'' (as in a ``noise floor'') for classification. For (ii) we investigate masking of known spurious topic carriers in classification. Both (i) and (ii) contribute to quantifying and (ii) to mitigating spurious correlations.},
  pubstate  = {published},
}

Copy BibTeX to Clipboard

Project:   B6

Dutta Chowdhury, Koel; Jalota, Rricha; van Genabith, Josef; España-Bonet, Cristina

Towards Debiasing Translation Artifacts Inproceedings

Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Association for Computational Linguistics, pp. 3983-3991, Seattle, United States, July 2022.

Cross-lingual natural language processing relies on translation, either by humans or machines, at different levels, from translating training data to translating test sets. However, compared to original texts in the same language, translations possess distinct qualities referred to as translationese. Previous research has shown that these translation artifacts influence the performance of a variety of cross-lingual tasks. In this work, we propose a novel approach to reducing translationese by extending an established bias-removal technique. We use the Iterative Null-space Projection (INLP) algorithm, and show by measuring classification accuracy before and after debiasing, that translationese is reduced at both sentence and word level. We evaluate the utility of debiasing translationese on a natural language inference (NLI) task, and show that by reducing this bias, NLI accuracy improves. To the best of our knowledge, this is the first study to debias translationese as represented in latent embedding space.

@inproceedings{Chowdhury_2022_Debiasing,
  title     = {Towards Debiasing Translation Artifacts},
  author    = {Dutta Chowdhury, Koel and Jalota, Rricha and van Genabith, Josef and Espa{\~n}a-Bonet, Cristina},
  url       = {https://aclanthology.org/2022.naacl-main.292/},
  doi       = {10.18653/v1/2022.naacl-main.292},
  year      = {2022},
  month     = jul,
  booktitle = {Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
  pages     = {3983--3991},
  publisher = {Association for Computational Linguistics},
  address   = {Seattle, United States},
  abstract  = {Cross-lingual natural language processing relies on translation, either by humans or machines, at different levels, from translating training data to translating test sets. However, compared to original texts in the same language, translations possess distinct qualities referred to as translationese. Previous research has shown that these translation artifacts influence the performance of a variety of cross-lingual tasks. In this work, we propose a novel approach to reducing translationese by extending an established bias-removal technique. We use the Iterative Null-space Projection (INLP) algorithm, and show by measuring classification accuracy before and after debiasing, that translationese is reduced at both sentence and word level. We evaluate the utility of debiasing translationese on a natural language inference (NLI) task, and show that by reducing this bias, NLI accuracy improves. To the best of our knowledge, this is the first study to debias translationese as represented in latent embedding space.},
  pubstate  = {published},
}

Copy BibTeX to Clipboard

Project:   B6

España-Bonet, Cristina; Barrón-Cedeño, Alberto

The (Undesired) Attenuation of Human Biases by Multilinguality Inproceedings

Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing, Association for Computational Linguistics, pp. 2056–2077, Online and Abu Dhabi, UAE, December 2022.
Some human preferences are universal. The odor of vanilla is perceived as pleasant all around the world. We expect neural models trained on human texts to exhibit these kind of preferences, i.e. biases, but we show that this is not always the case. We explore 16 static and contextual embedding models in 9 languages and, when possible, compare them under similar training conditions. We introduce and release CA-WEAT, multilingual cultural aware tests to quantify biases, and compare them to previous English-centric tests. Our experiments confirm that monolingual static embeddings do exhibit human biases, but values differ across languages, being far from universal. Biases are less evident in contextual models, to the point that the original human association might be reversed. Multilinguality proves to be another variable that attenuates and even reverses the effect of the bias, specially in contextual multilingual models. In order to explain this variance among models and languages, we examine the effect of asymmetries in the training corpus, departures from isomorphism in multilingual embedding spaces and discrepancies in the testing measures between languages.

@inproceedings{espana-bonet-barron-cedeno-2022-undesired,
  title     = {The (Undesired) Attenuation of Human Biases by Multilinguality},
  author    = {Espa{\~n}a-Bonet, Cristina and Barr{\'o}n-Cede{\~n}o, Alberto},
  url       = {https://aclanthology.org/2022.emnlp-main.133},
  year      = {2022},
  month     = dec,
  booktitle = {Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing},
  pages     = {2056--2077},
  publisher = {Association for Computational Linguistics},
  address   = {Online and Abu Dhabi, UAE},
  abstract  = {Some human preferences are universal. The odor of vanilla is perceived as pleasant all around the world. We expect neural models trained on human texts to exhibit these kind of preferences, i.e. biases, but we show that this is not always the case. We explore 16 static and contextual embedding models in 9 languages and, when possible, compare them under similar training conditions. We introduce and release CA-WEAT, multilingual cultural aware tests to quantify biases, and compare them to previous English-centric tests. Our experiments confirm that monolingual static embeddings do exhibit human biases, but values differ across languages, being far from universal. Biases are less evident in contextual models, to the point that the original human association might be reversed. Multilinguality proves to be another variable that attenuates and even reverses the effect of the bias, specially in contextual multilingual models. In order to explain this variance among models and languages, we examine the effect of asymmetries in the training corpus, departures from isomorphism in multilingual embedding spaces and discrepancies in the testing measures between languages.},
  pubstate  = {published},
}

Copy BibTeX to Clipboard

Project:   B6

Bafna, Niyati; van Genabith, Josef; España-Bonet, Cristina; Žabokrtský, Zdeněk

Combining Noisy Semantic Signals with Orthographic Cues: Cognate Induction for the Indic Dialect Continuum Inproceedings

Proceedings of the 26th Conference on Computational Natural Language Learning (CoNLL), Association for Computational Linguistics, pp. 110-131, Abu Dhabi, UAE, December 2022.
We present a novel method for unsupervised cognate/borrowing identification from monolingual corpora designed for low and extremely low resource scenarios, based on combining noisy semantic signals from joint bilingual spaces with orthographic cues modelling sound change. We apply our method to the North Indian dialect continuum, containing several dozens of dialects and languages spoken by more than 100 million people. Many of these languages are zero-resource and therefore natural language processing for them is non-existent. We first collect monolingual data for 26 Indic languages, 16 of which were previously zero-resource, and perform exploratory character, lexical and subword cross-lingual alignment experiments for the first time at this scale on this dialect continuum. We create bilingual evaluation lexicons against Hindi for 20 of the languages. We then apply our cognate identification method on the data, and show that our method outperforms both traditional orthography baselines as well as EM-style learnt edit distance matrices. To the best of our knowledge, this is the first work to combine traditional orthographic cues with noisy bilingual embeddings to tackle unsupervised cognate detection in a (truly) low-resource setup, showing that even noisy bilingual embeddings can act as good guides for this task. We release our multilingual dialect corpus, called HinDialect, as well as our scripts for evaluation data collection and cognate induction.

@inproceedings{bafna-etal-2022-combining,
  title     = {Combining Noisy Semantic Signals with Orthographic Cues: Cognate Induction for the Indic Dialect Continuum},
  author    = {Bafna, Niyati and van Genabith, Josef and Espa{\~n}a-Bonet, Cristina and {\v{Z}}abokrtsk{\'y}, Zden{\v{e}}k},
  url       = {https://aclanthology.org/2022.conll-1.9},
  year      = {2022},
  month     = dec,
  booktitle = {Proceedings of the 26th Conference on Computational Natural Language Learning ({CoNLL})},
  pages     = {110--131},
  publisher = {Association for Computational Linguistics},
  address   = {Abu Dhabi, UAE},
  abstract  = {We present a novel method for unsupervised cognate/borrowing identification from monolingual corpora designed for low and extremely low resource scenarios, based on combining noisy semantic signals from joint bilingual spaces with orthographic cues modelling sound change. We apply our method to the North Indian dialect continuum, containing several dozens of dialects and languages spoken by more than 100 million people. Many of these languages are zero-resource and therefore natural language processing for them is non-existent. We first collect monolingual data for 26 Indic languages, 16 of which were previously zero-resource, and perform exploratory character, lexical and subword cross-lingual alignment experiments for the first time at this scale on this dialect continuum. We create bilingual evaluation lexicons against Hindi for 20 of the languages. We then apply our cognate identification method on the data, and show that our method outperforms both traditional orthography baselines as well as EM-style learnt edit distance matrices. To the best of our knowledge, this is the first work to combine traditional orthographic cues with noisy bilingual embeddings to tackle unsupervised cognate detection in a (truly) low-resource setup, showing that even noisy bilingual embeddings can act as good guides for this task. We release our multilingual dialect corpus, called HinDialect, as well as our scripts for evaluation data collection and cognate induction.},
  pubstate  = {published},
}

Copy BibTeX to Clipboard

Project:   B6

Amponsah-Kaakyire, Kwabena; Pylypenko, Daria; van Genabith, Josef; España-Bonet, Cristina

Explaining Translationese: why are Neural Classifiers Better and what do they Learn? Inproceedings

Proceedings of the Fifth BlackboxNLP Workshop on Analyzing and Interpreting Neural Networks for NLP, Association for Computational Linguistics, pp. 281-296, Abu Dhabi, United Arab Emirates (Hybrid), December 2022.

Recent work has shown that neural feature- and representation-learning, e.g. BERT, achieves superior performance over traditional manual feature engineering based approaches, with e.g. SVMs, in translationese classification tasks. Previous research did not show (i) whether the difference is because of the features, the classifiers or both, and (ii) what the neural classifiers actually learn. To address (i), we carefully design experiments that swap features between BERT- and SVM-based classifiers. We show that an SVM fed with BERT representations performs at the level of the best BERT classifiers, while BERT learning and using handcrafted features performs at the level of an SVM using handcrafted features. This shows that the performance differences are due to the features. To address (ii) we use integrated gradients and find that (a) there is indication that information captured by hand-crafted features is only a subset of what BERT learns, and (b) part of BERT's top performance results are due to BERT learning topic differences and spurious correlations with translationese.

@inproceedings{amponsah-kaakyire-etal-2022-explaining,
  title     = {Explaining Translationese: why are Neural Classifiers Better and what do they Learn?},
  author    = {Amponsah-Kaakyire, Kwabena and Pylypenko, Daria and van Genabith, Josef and Espa{\~n}a-Bonet, Cristina},
  url       = {https://aclanthology.org/2022.blackboxnlp-1.23},
  doi       = {10.48550/ARXIV.2210.13391},
  year      = {2022},
  month     = dec,
  booktitle = {Proceedings of the Fifth {BlackboxNLP} Workshop on Analyzing and Interpreting Neural Networks for {NLP}},
  pages     = {281--296},
  publisher = {Association for Computational Linguistics},
  address   = {Abu Dhabi, United Arab Emirates (Hybrid)},
  abstract  = {Recent work has shown that neural feature- and representation-learning, e.g. BERT, achieves superior performance over traditional manual feature engineering based approaches, with e.g. SVMs, in translationese classification tasks. Previous research did not show (i) whether the difference is because of the features, the classifiers or both, and (ii) what the neural classifiers actually learn. To address (i), we carefully design experiments that swap features between BERT- and SVM-based classifiers. We show that an SVM fed with BERT representations performs at the level of the best BERT classifiers, while BERT learning and using handcrafted features performs at the level of an SVM using handcrafted features. This shows that the performance differences are due to the features. To address (ii) we use integrated gradients and find that (a) there is indication that information captured by hand-crafted features is only a subset of what BERT learns, and (b) part of BERT's top performance results are due to BERT learning topic differences and spurious correlations with translationese.},
  pubstate  = {published},
}

Copy BibTeX to Clipboard

Project:   B6

Amponsah-Kaakyire, Kwabena; Pylypenko, Daria; España-Bonet, Cristina; van Genabith, Josef

Do not Rely on Relay Translations: Multilingual Parallel Direct Europarl Inproceedings

Proceedings of the Workshop on Modelling Translation: Translatology in the Digital Age (MoTra21), International Committee on Computational Linguistics, pp. 1-7, Iceland (Online), 2021.

Translationese data is a scarce and valuable resource. Traditionally, the proceedings of the European Parliament have been used for studying translationese phenomena since their metadata allows to distinguish between original and translated texts. However, translations are not always direct and we hypothesise that a pivot (also called "relay") language might alter the conclusions on translationese effects. In this work, we (i) isolate translations that have been done without an intermediate language in the Europarl proceedings from those that might have used a pivot language, and (ii) build comparable and parallel corpora with data aligned across multiple languages that therefore can be used for both machine translation and translation studies.

@inproceedings{AmposahEtal:MOTRA:2021,
  title     = {Do not Rely on Relay Translations: Multilingual Parallel Direct Europarl},
  author    = {Amponsah-Kaakyire, Kwabena and Pylypenko, Daria and Espa{\~n}a-Bonet, Cristina and van Genabith, Josef},
  url       = {https://aclanthology.org/2021.motra-1.1/},
  year      = {2021},
  booktitle = {Proceedings of the Workshop on Modelling Translation: Translatology in the Digital Age ({MoTra21})},
  pages     = {1--7},
  publisher = {International Committee on Computational Linguistics},
  address   = {Iceland (Online)},
  abstract  = {Translationese data is a scarce and valuable resource. Traditionally, the proceedings of the European Parliament have been used for studying translationese phenomena since their metadata allows to distinguish between original and translated texts. However, translations are not always direct and we hypothesise that a pivot (also called ``relay'') language might alter the conclusions on translationese effects. In this work, we (i) isolate translations that have been done without an intermediate language in the Europarl proceedings from those that might have used a pivot language, and (ii) build comparable and parallel corpora with data aligned across multiple languages that therefore can be used for both machine translation and translation studies.},
  pubstate  = {published},
}

Copy BibTeX to Clipboard

Project:   B6

Pylypenko, Daria; Amponsah-Kaakyire, Kwabena; Dutta Chowdhury, Koel; van Genabith, Josef; España-Bonet, Cristina

Comparing Feature-Engineering and Feature-Learning Approaches for Multilingual Translationese Classification Inproceedings

Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing, Association for Computational Linguistics, pp. 8596–8611, Online and Punta Cana, Dominican Republic, 2021.

Traditional hand-crafted linguistically-informed features have often been used for distinguishing between translated and original non-translated texts. By contrast, to date, neural architectures without manual feature engineering have been less explored for this task. In this work, we (i) compare the traditional feature-engineering-based approach to the feature-learning-based one and (ii) analyse the neural architectures in order to investigate how well the hand-crafted features explain the variance in the neural models' predictions. We use pre-trained neural word embeddings, as well as several end-to-end neural architectures in both monolingual and multilingual settings and compare them to feature-engineering-based SVM classifiers. We show that (i) neural architectures outperform other approaches by more than 20 accuracy points, with the BERT-based model performing the best in both the monolingual and multilingual settings; (ii) while many individual hand-crafted translationese features correlate with neural model predictions, feature importance analysis shows that the most important features for neural and classical architectures differ; and (iii) our multilingual experiments provide empirical evidence for translationese universals across languages.

@inproceedings{pylypenko-etal-2021-comparing,
title = {Comparing Feature-Engineering and Feature-Learning Approaches for Multilingual Translationese Classification},
author = {Daria Pylypenko and Kwabena Amponsah-Kaakyire and Koel Dutta Chowdhury and Josef van Genabith and Cristina Espa{\~n}a-Bonet},
url = {https://aclanthology.org/2021.emnlp-main.676/},
doi = {10.18653/v1/2021.emnlp-main.676},
year = {2021},
date = {2021},
booktitle = {Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing},
pages = {8596--8611},
publisher = {Association for Computational Linguistics},
address = {Online and Punta Cana, Dominican Republic},
abstract = {Traditional hand-crafted linguistically-informed features have often been used for distinguishing between translated and original non-translated texts. By contrast, to date, neural architectures without manual feature engineering have been less explored for this task. In this work, we (i) compare the traditional feature-engineering-based approach to the feature-learning-based one and (ii) analyse the neural architectures in order to investigate how well the hand-crafted features explain the variance in the neural models' predictions. We use pre-trained neural word embeddings, as well as several end-to-end neural architectures in both monolingual and multilingual settings and compare them to feature-engineering-based SVM classifiers. We show that (i) neural architectures outperform other approaches by more than 20 accuracy points, with the BERT-based model performing the best in both the monolingual and multilingual settings; (ii) while many individual hand-crafted translationese features correlate with neural model predictions, feature importance analysis shows that the most important features for neural and classical architectures differ; and (iii) our multilingual experiments provide empirical evidence for translationese universals across languages.},
pubstate = {published},
type = {inproceedings}
}

Copy BibTeX to Clipboard

Project:   B6

Dutta Chowdhury, Koel; España-Bonet, Cristina; van Genabith, Josef

Tracing Source Language Interference in Translation with Graph-Isomorphism Measures Inproceedings

Proceedings of Recent Advances in Natural Language Processing (RANLP 2021), pp. 380-390, Online, 2021, ISSN 2603-2813.

Previous research has used linguistic features to show that translations exhibit traces of source language interference and that phylogenetic trees between languages can be reconstructed from the results of translations into the same language. Recent research has shown that instances of translationese (source language interference) can even be detected in embedding spaces, comparing embeddings spaces of original language data with embedding spaces resulting from translations into the same language, using a simple Eigenvector-based divergence from isomorphism measure. To date, it remains an open question whether alternative graph-isomorphism measures can produce better results. In this paper, we (i) explore Gromov-Hausdorff distance, (ii) present a novel spectral version of the Eigenvector-based method, and (iii) evaluate all approaches against a broad linguistic typological database (URIEL). We show that language distances resulting from our spectral isomorphism approaches can reproduce genetic trees on a par with previous work without requiring any explicit linguistic information and that the results can be extended to non-Indo-European languages. Finally, we show that the methods are robust under a variety of modeling conditions.

@inproceedings{Chowdhury2021tracing,
title = {Tracing Source Language Interference in Translation with Graph-Isomorphism Measures},
author = {Koel Dutta Chowdhury and Cristina Espa{\~n}a-Bonet and Josef van Genabith},
url = {https://aclanthology.org/2021.ranlp-1.43/},
year = {2021},
date = {2021},
booktitle = {Proceedings of Recent Advances in Natural Language Processing (RANLP 2021)},
issn = {2603-2813},
pages = {380--390},
address = {Online},
abstract = {Previous research has used linguistic features to show that translations exhibit traces of source language interference and that phylogenetic trees between languages can be reconstructed from the results of translations into the same language. Recent research has shown that instances of translationese (source language interference) can even be detected in embedding spaces, comparing embeddings spaces of original language data with embedding spaces resulting from translations into the same language, using a simple Eigenvector-based divergence from isomorphism measure. To date, it remains an open question whether alternative graph-isomorphism measures can produce better results. In this paper, we (i) explore Gromov-Hausdorff distance, (ii) present a novel spectral version of the Eigenvector-based method, and (iii) evaluate all approaches against a broad linguistic typological database (URIEL). We show that language distances resulting from our spectral isomorphism approaches can reproduce genetic trees on a par with previous work without requiring any explicit linguistic information and that the results can be extended to non-Indo-European languages. Finally, we show that the methods are robust under a variety of modeling conditions.},
pubstate = {published},
type = {inproceedings}
}

Copy BibTeX to Clipboard

Project:   B6

Bizzoni, Yuri; Juzek, Tom; España-Bonet, Cristina; Dutta Chowdhury, Koel; van Genabith, Josef; Teich, Elke

How Human is Machine Translationese? Comparing Human and Machine Translations of Text and Speech Inproceedings

The 17th International Workshop on Spoken Language Translation, Seattle, WA, United States, 2020.

Translationese is a phenomenon present in human translations, simultaneous interpreting, and even machine translations. Some translationese features tend to appear in simultaneous interpreting with higher frequency than in human text translation, but the reasons for this are unclear. This study analyzes translationese patterns in translation, interpreting, and machine translation outputs in order to explore possible reasons. In our analysis we (i) detail two non-invasive ways of detecting translationese and (ii) compare translationese across human and machine translations from text and speech. We find that machine translation shows traces of translationese, but does not reproduce the patterns found in human translation, offering support to the hypothesis that such patterns are due to the model (human vs. machine) rather than to the data (written vs. spoken).

@inproceedings{Bizzoni2020,
title = {How Human is Machine Translationese? Comparing Human and Machine Translations of Text and Speech},
author = {Yuri Bizzoni and Tom Juzek and Cristina Espa{\~n}a-Bonet and Koel Dutta Chowdhury and Josef van Genabith and Elke Teich},
url = {https://aclanthology.org/2020.iwslt-1.34/},
doi = {10.18653/v1/2020.iwslt-1.34},
year = {2020},
date = {2020},
booktitle = {The 17th International Workshop on Spoken Language Translation},
address = {Seattle, WA, United States},
abstract = {Translationese is a phenomenon present in human translations, simultaneous interpreting, and even machine translations. Some translationese features tend to appear in simultaneous interpreting with higher frequency than in human text translation, but the reasons for this are unclear. This study analyzes translationese patterns in translation, interpreting, and machine translation outputs in order to explore possible reasons. In our analysis we (i) detail two non-invasive ways of detecting translationese and (ii) compare translationese across human and machine translations from text and speech. We find that machine translation shows traces of translationese, but does not reproduce the patterns found in human translation, offering support to the hypothesis that such patterns are due to the model (human vs. machine) rather than to the data (written vs. spoken).},
pubstate = {published},
type = {inproceedings}
}

Copy BibTeX to Clipboard

Projects:   B6 B7

Dutta Chowdhury, Koel; España-Bonet, Cristina; van Genabith, Josef

Understanding Translationese in Multi-view Embedding Spaces Inproceedings

Proceedings of the 28th International Conference on Computational Linguistics, International Committee on Computational Linguistics, pp. 6056-6062, Barcelona, Catalonia (Online), 2020.

Recent studies use a combination of lexical and syntactic features to show that footprints of the source language remain visible in translations, to the extent that it is possible to predict the original source language from the translation. In this paper, we focus on embedding-based semantic spaces, exploiting departures from isomorphism between spaces built from original target language and translations into this target language to predict relations between languages in an unsupervised way. We use different views of the data — words, parts of speech, semantic tags and synsets — to track translationese. Our analysis shows that (i) semantic distances between original target language and translations into this target language can be detected using the notion of isomorphism, (ii) language family ties with characteristics similar to linguistically motivated phylogenetic trees can be inferred from the distances and (iii) with delexicalised embeddings exhibiting source-language interference most significantly, other levels of abstraction display the same tendency, indicating the lexicalised results to be not "just" due to possible topic differences between original and translated texts. To the best of our knowledge, this is the first time departures from isomorphism between embedding spaces are used to track translationese.

@inproceedings{DuttaEtal:COLING:2020,
title = {Understanding Translationese in Multi-view Embedding Spaces},
author = {Koel Dutta Chowdhury and Cristina Espa{\~n}a-Bonet and Josef van Genabith},
url = {https://www.aclweb.org/anthology/2020.coling-main.532/},
doi = {10.18653/v1/2020.coling-main.532},
year = {2020},
date = {2020},
booktitle = {Proceedings of the 28th International Conference on Computational Linguistics},
pages = {6056--6062},
publisher = {International Committee on Computational Linguistics},
address = {Barcelona, Catalonia (Online)},
abstract = {Recent studies use a combination of lexical and syntactic features to show that footprints of the source language remain visible in translations, to the extent that it is possible to predict the original source language from the translation. In this paper, we focus on embedding-based semantic spaces, exploiting departures from isomorphism between spaces built from original target language and translations into this target language to predict relations between languages in an unsupervised way. We use different views of the data {---} words, parts of speech, semantic tags and synsets {---} to track translationese. Our analysis shows that (i) semantic distances between original target language and translations into this target language can be detected using the notion of isomorphism, (ii) language family ties with characteristics similar to linguistically motivated phylogenetic trees can be inferred from the distances and (iii) with delexicalised embeddings exhibiting source-language interference most significantly, other levels of abstraction display the same tendency, indicating the lexicalised results to be not ``just'' due to possible topic differences between original and translated texts. To the best of our knowledge, this is the first time departures from isomorphism between embedding spaces are used to track translationese.},
pubstate = {published},
type = {inproceedings}
}

Copy BibTeX to Clipboard

Project:   B6

van Genabith, Josef; España-Bonet, Cristina; Lapshinova-Koltunski, Ekaterina

Analysing Coreference in Transformer Outputs Inproceedings

Proceedings of the Fourth Workshop on Discourse in Machine Translation (DiscoMT 2019), Association for Computational Linguistics, pp. 1-12, Hong Kong, China, 2019.

We analyse coreference phenomena in three neural machine translation systems trained with different data settings with or without access to explicit intra- and cross-sentential anaphoric information. We compare system performance on two different genres: news and TED talks. To do this, we manually annotate (the possibly incorrect) coreference chains in the MT outputs and evaluate the coreference chain translations. We define an error typology that aims to go further than pronoun translation adequacy and includes types such as incorrect word selection or missing words. The features of coreference chains in automatic translations are also compared to those of the source texts and human translations. The analysis shows stronger potential translationese effects in machine translated outputs than in human translations.

@inproceedings{lapshinovaEtal:2019iscoMT,
title = {Analysing Coreference in Transformer Outputs},
author = {Josef van Genabith and Cristina Espa{\~n}a-Bonet and Ekaterina Lapshinova-Koltunski},
url = {https://www.aclweb.org/anthology/D19-6501},
doi = {10.18653/v1/D19-6501},
year = {2019},
date = {2019},
booktitle = {Proceedings of the Fourth Workshop on Discourse in Machine Translation (DiscoMT 2019)},
pages = {1--12},
publisher = {Association for Computational Linguistics},
address = {Hong Kong, China},
abstract = {We analyse coreference phenomena in three neural machine translation systems trained with different data settings with or without access to explicit intra- and cross-sentential anaphoric information. We compare system performance on two different genres: news and TED talks. To do this, we manually annotate (the possibly incorrect) coreference chains in the MT outputs and evaluate the coreference chain translations. We define an error typology that aims to go further than pronoun translation adequacy and includes types such as incorrect word selection or missing words. The features of coreference chains in automatic translations are also compared to those of the source texts and human translations. The analysis shows stronger potential translationese effects in machine translated outputs than in human translations.},
pubstate = {published},
type = {inproceedings}
}

Copy BibTeX to Clipboard

Project:   B6

Rubino, Raphael; Degaetano-Ortlieb, Stefania; Teich, Elke; van Genabith, Josef

Modeling Diachronic Change in Scientific Writing with Information Density Inproceedings

Proceedings of COLING 2016, the 26th International Conference on Computational Linguistics: Technical Papers, The COLING 2016 Organizing Committee, pp. 750-761, Osaka, Japan, 2016.

Previous linguistic research on scientific writing has shown that language use in the scientific domain varies considerably in register and style over time. In this paper we investigate the introduction of information theory inspired features to study long term diachronic change on three levels: lexis, part-of-speech and syntax. Our approach is based on distinguishing between sentences from 19th and 20th century scientific abstracts using supervised classification models. To the best of our knowledge, the introduction of information theoretic features to this task is novel. We show that these features outperform more traditional features, such as token or character n-grams, while leading to more compact models. We present a detailed analysis of feature informativeness in order to gain a better understanding of diachronic change on different linguistic levels.

@inproceedings{C16-1072,
title = {Modeling Diachronic Change in Scientific Writing with Information Density},
author = {Raphael Rubino and Stefania Degaetano-Ortlieb and Elke Teich and Josef van Genabith},
url = {https://aclanthology.org/C16-1072},
year = {2016},
date = {2016},
booktitle = {Proceedings of COLING 2016, the 26th International Conference on Computational Linguistics: Technical Papers},
pages = {750--761},
publisher = {The COLING 2016 Organizing Committee},
address = {Osaka, Japan},
abstract = {Previous linguistic research on scientific writing has shown that language use in the scientific domain varies considerably in register and style over time. In this paper we investigate the introduction of information theory inspired features to study long term diachronic change on three levels: lexis, part-of-speech and syntax. Our approach is based on distinguishing between sentences from 19th and 20th century scientific abstracts using supervised classification models. To the best of our knowledge, the introduction of information theoretic features to this task is novel. We show that these features outperform more traditional features, such as token or character n-grams, while leading to more compact models. We present a detailed analysis of feature informativeness in order to gain a better understanding of diachronic change on different linguistic levels.},
pubstate = {published},
type = {inproceedings}
}

Copy BibTeX to Clipboard

Project:   B6

Rubino, Raphael; Lapshinova-Koltunski, Ekaterina; van Genabith, Josef

Information Density and Quality Estimation Features as Translationese Indicators for Human Translation Classification Inproceedings

Proceedings of the 2016 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Association for Computational Linguistics, pp. 960-970, San Diego, California, 2016.

This paper introduces information density and machine translation quality estimation inspired features to automatically detect and classify human translated texts. We investigate two settings: discriminating between translations and comparable originally authored texts, and distinguishing two levels of translation professionalism. Our framework is based on delexicalised sentence-level dense feature vector representations combined with a supervised machine learning approach. The results show state-of-the-art performance for mixed-domain translationese detection with information density and quality estimation based features, while results on translation expertise classification are mixed.

@inproceedings{N16-1110,
title = {Information Density and Quality Estimation Features as Translationese Indicators for Human Translation Classification},
author = {Raphael Rubino and Ekaterina Lapshinova-Koltunski and Josef van Genabith},
url = {http://aclweb.org/anthology/N16-1110},
doi = {10.18653/v1/N16-1110},
year = {2016},
date = {2016},
booktitle = {Proceedings of the 2016 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
pages = {960--970},
publisher = {Association for Computational Linguistics},
address = {San Diego, California},
abstract = {This paper introduces information density and machine translation quality estimation inspired features to automatically detect and classify human translated texts. We investigate two settings: discriminating between translations and comparable originally authored texts, and distinguishing two levels of translation professionalism. Our framework is based on delexicalised sentence-level dense feature vector representations combined with a supervised machine learning approach. The results show state-of-the-art performance for mixed-domain translationese detection with information density and quality estimation based features, while results on translation expertise classification are mixed.},
pubstate = {published},
type = {inproceedings}
}

Copy BibTeX to Clipboard

Project:   B6

Findings of the 2016 Conference on Machine Translation Inproceedings

Proceedings of the First Conference on Machine Translation, Association for Computational Linguistics, pp. 131-198, Berlin, Germany, 2016.

This paper presents the results of the WMT16 shared tasks, which included five machine translation (MT) tasks (standard news, IT-domain, biomedical, multimodal, pronoun), three evaluation tasks (metrics, tuning, run-time estimation of MT quality), and an automatic post-editing task and bilingual document alignment task. This year, 102 MT systems from 24 institutions (plus 36 anonymized online systems) were submitted to the 12 translation directions in the news translation task. The IT-domain task received 31 submissions from 12 institutions in 7 directions and the Biomedical task received 15 submissions systems from 5 institutions. Evaluation was both automatic and manual (relative ranking and 100-point scale assessments). The quality estimation task had three subtasks, with a total of 14 teams, submitting 39 entries. The automatic post-editing task had a total of 6 teams, submitting 11 entries.

@inproceedings{bojar-EtAl:2016:WMT1,
title = {Findings of the 2016 Conference on Machine Translation},
author = {Ond{\v{r}}ej Bojar and Rajen Chatterjee and Christian Federmann and Yvette Graham and Barry Haddow and Matthias Huck and Antonio Jimeno Yepes and Philipp Koehn and Varvara Logacheva and Christof Monz and Matteo Negri and Aur{\'e}lie N{\'e}v{\'e}ol and Mariana Neves and Martin Popel and Matt Post and Raphael Rubino and Carolina Scarton and Lucia Specia and Marco Turchi and Karin Verspoor and Marcos Zampieri},
url = {http://www.aclweb.org/anthology/W/W16/W16-2301},
year = {2016},
date = {2016-08-01},
booktitle = {Proceedings of the First Conference on Machine Translation},
pages = {131--198},
publisher = {Association for Computational Linguistics},
address = {Berlin, Germany},
abstract = {This paper presents the results of the WMT16 shared tasks, which included five machine translation (MT) tasks (standard news, IT-domain, biomedical, multimodal, pronoun), three evaluation tasks (metrics, tuning, run-time estimation of MT quality), and an automatic post-editing task and bilingual document alignment task. This year, 102 MT systems from 24 institutions (plus 36 anonymized online systems) were submitted to the 12 translation directions in the news translation task. The IT-domain task received 31 submissions from 12 institutions in 7 directions and the Biomedical task received 15 submissions systems from 5 institutions. Evaluation was both automatic and manual (relative ranking and 100-point scale assessments). The quality estimation task had three subtasks, with a total of 14 teams, submitting 39 entries. The automatic post-editing task had a total of 6 teams, submitting 11 entries.},
pubstate = {published},
type = {inproceedings}
}

Copy BibTeX to Clipboard

Project:   B6

Successfully