@inproceedings{zhang-etal-2022-mcse,
  title     = {{MCSE}: Multimodal Contrastive Learning of Sentence Embeddings},
  author    = {Zhang, Miaoran and Mosbach, Marius and Adelani, David and Hedderich, Michael and Klakow, Dietrich},
  booktitle = {Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
  year      = {2022},
  pages     = {5959--5969},
  publisher = {Association for Computational Linguistics},
  address   = {Seattle, United States},
  doi       = {10.18653/v1/2022.naacl-main.436},
  url       = {https://aclanthology.org/2022.naacl-main.436},
  abstract  = {Learning semantically meaningful sentence embeddings is an open problem in natural language processing. In this work, we propose a sentence embedding learning approach that exploits both visual and textual information via a multimodal contrastive objective. Through experiments on a variety of semantic textual similarity tasks, we demonstrate that our approach consistently improves the performance across various datasets and pre-trained encoders. In particular, combining a small amount of multimodal data with a large text-only corpus, we improve the state-of-the-art average Spearman{'}s correlation by 1.7{\%}. By analyzing the properties of the textual embedding space, we show that our model excels in aligning semantically similar sentences, providing an explanation for its improved performance.},
}