% EACL 2026 short paper on cross-direction benchmark contamination in MT evaluation.
% Names are stored in "Last, First" form so particles and compound surnames parse
% unambiguously; the DOI is stored bare (no resolver prefix).
@inproceedings{tan-etal-2026-flores,
  author    = {Tan, David and Chen, Pinzhen and van Genabith, Josef and Dutta Chowdhury, Koel},
  title     = {When {Flores} {Bloomz} Wrong: Cross-Direction Contamination in Machine Translation Evaluation},
  editor    = {Demberg, Vera and Inui, Kentaro and M{\`a}rquez, Llu{\'\i}s},
  booktitle = {Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 2: Short Papers)},
  pages     = {345--358},
  publisher = {Association for Computational Linguistics},
  address   = {Rabat, Morocco},
  year      = {2026},
  isbn      = {979-8-89176-381-4},
  doi       = {10.18653/v1/2026.eacl-short.26},
  url       = {https://aclanthology.org/2026.eacl-short.26/},
  abstract  = {Large language models (LLMs) can be benchmark-contaminated, resulting in inflated scores that mask memorization as generalization, and in multilingual settings, this memorization can even transfer to ``uncontaminated'' languages. Using the FLORES-200 translation benchmark as a diagnostic, we study two 7--8B instruction-tuned multilingual LLMs: Bloomz, which was trained on FLORES, and Llama as an uncontaminated control. We confirm Bloomz's FLORES contamination and demonstrate that machine translation contamination can be cross-directional, artificially boosting performance in unseen translation directions due to target-side memorization. Further analysis shows that recall of memorized references often persists despite various source-side perturbation efforts like paraphrasing and named entity replacement. However, replacing named entities leads to a consistent decrease in BLEU, suggesting an effective probing method for memorization in contaminated models.},
  pubstate  = {published},
}