Mitigating Translationese with GPT-4: Strategies and Performance (Inproceedings)
Proceedings of the 25th Annual Conference of the European Association for Machine Translation, 1, European Association for Machine Translation, pp. 411–430, 2024. Translations differ in systematic ways from texts originally authored in the same language. These differences, collectively known as translationese, can pose challenges in cross-lingual natural language processing: models trained or tested on translated input might struggle when presented with non-translated language. Translationese mitigation can alleviate this problem. This study investigates the generative capacities of GPT-4 to reduce translationese in human-translated texts. The task is framed as a rewriting process aimed
at modified translations indistinguishable from the original text in the target language. Our focus is on prompt engineering that tests the utility of linguistic knowledge as part of the instruction for GPT-4. Through a series of prompt design experiments, we show that GPT-4-generated revisions are more similar to originals in the target language when the prompts incorporate specific linguistic instructions instead of relying solely on the model’s internal knowledge. Furthermore, we release the segment-aligned bidirectional German–English data built from the Europarl corpus that underpins this study.
@inproceedings{kunilovskaya-etal-2024-mitigating,
  title     = {Mitigating Translationese with {GPT-4}: Strategies and Performance},
  author    = {Kunilovskaya, Maria and Dutta Chowdhury, Koel and Przybyl, Heike and Espa{\~n}a-Bonet, Cristina and van Genabith, Josef},
  booktitle = {Proceedings of the 25th Annual Conference of the {European Association for Machine Translation}},
  volume    = {1},
  pages     = {411--430},
  publisher = {European Association for Machine Translation},
  year      = {2024},
  url       = {https://eamt2024.github.io/proceedings/vol1.pdf},
  abstract  = {Translations differ in systematic ways from texts originally authored in the same language. These differences, collectively known as translationese, can pose challenges in cross-lingual natural language processing: models trained or tested on translated input might struggle when presented with non-translated language. Translationese mitigation can alleviate this problem. This study investigates the generative capacities of GPT-4 to reduce translationese in human-translated texts. The task is framed as a rewriting process aimed at modified translations indistinguishable from the original text in the target language. Our focus is on prompt engineering that tests the utility of linguistic knowledge as part of the instruction for GPT-4. Through a series of prompt design experiments, we show that GPT4-generated revisions are more similar to originals in the target language when the prompts incorporate specific linguistic instructions instead of relying solely on the model's internal knowledge. Furthermore, we release the segment-aligned bidirectional German--English data built from the Europarl corpus that underpins this study.},
  pubstate  = {published},
}