Multilingual Pretrained Language Models (MPLMs) perform strongly in cross-lingual transfer. We propose Prompts Augmented by Retrieval Crosslingually (PARC) to improve zero-shot performance on low-resource languages (LRLs) by augmenting the context with prompts consisting of semantically similar sentences retrieved from a high-resource language (HRL). PARC improves zero-shot performance on three downstream tasks (sentiment classification, topic categorization, natural language inference) with multilingual parallel test sets across 10 LRLs covering 6 language families, in both the unlabeled (+5.1%) and the labeled (+16.3%) setting. PARC also outperforms fine-tuning by 3.7%. We find a significant positive correlation between cross-lingual transfer performance on the one hand, and both the similarity between high- and low-resource languages and the amount of low-resource pretraining data on the other. A robustness analysis suggests that PARC has the potential to achieve even stronger performance with more powerful MPLMs.
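To make the retrieval-and-prompting step concrete, the following is a minimal Python sketch of a PARC-style prompt builder for sentiment classification. The retriever model (a multilingual sentence encoder), the example pool, and the cloze template are illustrative assumptions, not the paper's exact configuration; the MPLM that fills the [MASK] slot is omitted.

import numpy as np
from sentence_transformers import SentenceTransformer

# Hypothetical labeled pool in the high-resource language (English).
hrl_pool = [
    ("The movie was wonderful and moving.", "great"),
    ("A dull, lifeless film with no redeeming qualities.", "terrible"),
    ("An instant classic that I will happily rewatch.", "great"),
]

# Any multilingual sentence encoder can serve as the retriever; this
# particular model choice is an assumption, not the paper's.
retriever = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
pool_emb = retriever.encode([s for s, _ in hrl_pool], normalize_embeddings=True)

def build_parc_prompt(lrl_input, k=2, labeled=True):
    # Embed the LRL input and score it against the HRL pool by cosine
    # similarity (dot product of L2-normalized vectors).
    query = retriever.encode([lrl_input], normalize_embeddings=True)
    sims = (pool_emb @ query.T).ravel()
    top_k = np.argsort(-sims)[:k]
    # Prepend the retrieved HRL sentences (with verbalized labels in the
    # labeled setting) as cross-lingual context, then append the LRL input
    # as a cloze query for the MPLM.
    parts = []
    for i in top_k:
        sentence, label = hrl_pool[i]
        parts.append(f"Review: {sentence} It was {label}." if labeled
                     else f"Review: {sentence}")
    parts.append(f"Review: {lrl_input} It was [MASK].")
    return " ".join(parts)

# Example LRL query (Azerbaijani: "The film was very beautiful.").
print(build_parc_prompt("Film çox gözəl idi."))

Passing labeled=False corresponds to the unlabeled setting from the abstract (+5.1%), where the retrieved HRL sentences are prepended without their labels.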
@inproceedings{nie-etal-2023-cross,
address = {Toronto, Canada},
author = {Nie, Ercong and Liang, Sheng and Schmid, Helmut and Sch{\"u}tze, Hinrich},
booktitle = {Findings of the Association for Computational Linguistics: ACL 2023},
doi = {10.18653/v1/2023.findings-acl.528},
editor = {Rogers, Anna and Boyd-Graber, Jordan and Okazaki, Naoaki},
month = jul,
pages = {8320--8340},
publisher = {Association for Computational Linguistics},
title = {Cross-Lingual Retrieval Augmented Prompt for Low-Resource Languages},
url = {https://aclanthology.org/2023.findings-acl.528},
year = 2023
}