@phdthesis{Hedderich_Diss_2022,
  title    = {Weak supervision and label noise handling for natural language processing in low-resource scenarios},
  author   = {Michael Hedderich},
  url      = {https://publikationen.sulb.uni-saarland.de/handle/20.500.11880/35026},
  doi      = {10.22028/D291-38691},
  year     = {2022},
  school   = {Saarland University},
  address  = {Saarbruecken, Germany},
  abstract = {The lack of large amounts of labeled data is a significant factor blocking many low-resource languages and domains from catching up with recent advancements in natural language processing. To reduce this dependency on labeled instances, weak supervision (semi-)automatically annotates unlabeled data. These labels can be obtained more quickly and cheaply than manual, gold-standard annotations. They also, however, contain more errors. Handling these noisy labels is often required to leverage the weakly supervised data successfully. In this dissertation, we study the whole weak supervision pipeline with a focus on the task of named entity recognition. We develop a tool for automatic annotation, and we propose an approach to model label noise when a small amount of clean data is available. We study the factors that influence the noise model's quality from a theoretical perspective, and we validate this approach empirically on several different tasks and languages. An important aspect is the aim for a realistic evaluation. We perform our analysis on, among others, several African low-resource languages. We show the performance benefits that can be achieved using weak supervision and label noise modeling, but we also highlight open issues that the field still has to overcome. For low-resource settings, we expand the analysis to few-shot learning. For classification errors, we present a novel approach to obtain interpretable insights into where classifiers fail.}
}