Information Theory and Linguistic Variation: A Study of Brazilian and European Portuguese Inproceedings
Scherrer, Yves; Jauhiainen, Tommi; Ljubešić, Nikola; Nakov, Preslav; Tiedemann, Jorg; Zampieri, Marcos (Eds.): Proceedings of the 12th Workshop on NLP for Similar Languages, Varieties and Dialects, Association for Computational Linguistics, pp. 9-19, Abu Dhabi, UAE, 2025. We present a general analysis of the lexical and grammatical differences between Brazilian and European Portuguese by applying entropy measures, including Kullback-Leibler divergence and word order entropy, across various linguistic levels. Using a parallel corpus of BP and EP sentences translated from English, we quantified these differences and identified characteristic phenomena underlying the divergences between the two varieties. The highest divergence was observed at the lexical level due to word pairs unique to each variety but also related to grammatical distinctions. Furthermore, the analysis of parts-of-speech (POS), dependency relations, and POS tri-grams provided information concerning distinctive grammatical constructions. Finally, the word order entropy analysis revealed that while most of the syntactic features analysed showed similar patterns across BP and EP, specific word order preferences were still apparent.
% Workshop paper (VarDial 2025). Names are in "Last, First" form; case-sensitive words in title/booktitle are brace-protected.
@inproceedings{alves-2025-information,
  title     = {Information Theory and Linguistic Variation: A Study of {Brazilian} and {European} {Portuguese}},
  author    = {Alves, Diego},
  editor    = {Scherrer, Yves and Jauhiainen, Tommi and Ljubešić, Nikola and Nakov, Preslav and Tiedemann, Jorg and Zampieri, Marcos},
  booktitle = {Proceedings of the 12th Workshop on {NLP} for Similar Languages, Varieties and Dialects},
  pages     = {9--19},
  publisher = {Association for Computational Linguistics},
  address   = {Abu Dhabi, UAE},
  year      = {2025},
  url       = {https://aclanthology.org/2025.vardial-1.2/},
  abstract  = {We present a general analysis of the lexical and grammatical differences between Brazilian and European Portuguese by applying entropy measures, including Kullback-Leibler divergence and word order entropy, across various linguistic levels. Using a parallel corpus of BP and EP sentences translated from English, we quantified these differences and identified characteristic phenomena underlying the divergences between the two varieties. The highest divergence was observed at the lexical level due to word pairs unique to each variety but also related to grammatical distinctions. Furthermore, the analysis of parts-of-speech (POS), dependency relations, and POS tri-grams provided information concerning distinctive grammatical constructions. Finally, the word order entropy analysis revealed that while most of the syntactic features analysed showed similar patterns across BP and EP, specific word order preferences were still apparent.},
  pubstate  = {published},
  type      = {inproceedings},
}
Project: B1