@phdthesis{Greenberg_Diss,
  title     = {Evaluating humanness in language models},
  author    = {Clayton Greenberg},
  url       = {https://jahrbib.sulb.uni-saarland.de/handle/20.500.11880/37534},
  doi       = {10.22028/D291-41943},
  year      = {2024},
  date      = {2024},
  school    = {Saarland University},
  publisher = {Saarl{\"a}ndische Universit{\"a}ts- und Landesbibliothek},
  address   = {Saarbr{\"u}cken, Germany},
  abstract  = {Advances with language models, systems that predict upcoming words in context, have enabled an era in which people sometimes cannot distinguish between human-written and artificially created text. Perplexity, the simplest and most popular way to evaluate the quality of a language model, rewards any pattern captured by the system as long as it robustly constrains the upcoming possibilities. By capturing patterns that humans do not use, optimizing a language model for minimal perplexity could trigger a divergence between the most probable text and the most human-like text. In this thesis, I argue that this divergence has happened for state-of-the-art language models. Part I characterizes the kinds of knowledge captured by language models. First, I present three novel language model architectures whose neural connections were inspired by human behavior. Then, I discuss novel morphology- and sentiment-based paradigms that capture human knowledge quantitatively. Part II establishes several methods for evaluating language models by comparison against human behavior measures. I consider the suitability and potential confounds for offline ratings and two paradigms of online reading times: eye-tracking and G-Maze. Then, I use a novel dataset of G-Maze response times to show computational and linguistic evidence of the divergence.}
}