Fine-tuning pre-trained language models like BERT has become an effective approach
in NLP and yields state-of-the-art results on many downstream tasks. Recent
studies on adapting BERT to new tasks mainly focus on modifying the model
structure, re-designing the pre-training tasks, and leveraging external data and
knowledge. The fine-tuning strategy itself has yet to be fully explored. In
this paper, we improve the fine-tuning of BERT with two effective mechanisms:
self-ensemble and self-distillation. Experiments on text classification and
natural language inference tasks show that the proposed methods significantly
improve the adaptation of BERT without any external data or knowledge.
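The abstract names the two mechanisms without spelling them out; below is a minimal sketch of how they are commonly combined, assuming a PyTorch-style fine-tuning loop in which the teacher is a parameter average of the K most recent student checkpoints (self-ensemble) and the student additionally matches the teacher's logits (self-distillation). The class name `SelfDistillationTrainer`, the window size `k`, the loss weight `lam`, and the assumption that `model(**inputs)` returns raw logits are illustrative choices, not taken from the paper's released code.

```python
import copy
import torch
import torch.nn.functional as F


class SelfDistillationTrainer:
    """Sketch of self-ensemble + self-distillation fine-tuning (assumptions noted above)."""

    def __init__(self, model, optimizer, k=5, lam=1.0):
        self.model = model            # student, e.g. a BERT classifier returning logits
        self.optimizer = optimizer
        self.k = k                    # number of recent checkpoints averaged into the teacher
        self.lam = lam                # weight of the self-distillation term
        self.checkpoints = []         # sliding window of recent student state_dicts

    def _teacher_state(self):
        # Parameter-wise average of stored checkpoints (the self-ensemble teacher).
        avg = copy.deepcopy(self.checkpoints[0])
        for key in avg:
            if avg[key].is_floating_point():
                for ckpt in self.checkpoints[1:]:
                    avg[key] = avg[key] + ckpt[key]
                avg[key] = avg[key] / len(self.checkpoints)
        return avg

    def step(self, inputs, labels):
        # Standard task loss on the current student.
        logits = self.model(**inputs)
        loss = F.cross_entropy(logits, labels)

        # Self-distillation: pull the student toward the ensemble teacher's logits.
        if self.checkpoints:
            teacher = copy.deepcopy(self.model)
            teacher.load_state_dict(self._teacher_state())
            teacher.eval()
            with torch.no_grad():
                teacher_logits = teacher(**inputs)
            loss = loss + self.lam * F.mse_loss(logits, teacher_logits)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Keep only the k most recent student checkpoints for the ensemble.
        self.checkpoints.append(copy.deepcopy(self.model.state_dict()))
        self.checkpoints = self.checkpoints[-self.k:]
        return loss.item()
```

At evaluation time the final student can be used on its own; the ensemble is only needed as a teacher during training in this sketch.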
@misc{xu2020improving,
abstract = {Fine-tuning pre-trained language models like BERT has become an effective approach
in NLP and yields state-of-the-art results on many downstream tasks. Recent
studies on adapting BERT to new tasks mainly focus on modifying the model
structure, re-designing the pre-training tasks, and leveraging external data and
knowledge. The fine-tuning strategy itself has yet to be fully explored. In
this paper, we improve the fine-tuning of BERT with two effective mechanisms:
self-ensemble and self-distillation. Experiments on text classification and
natural language inference tasks show that the proposed methods significantly
improve the adaptation of BERT without any external data or knowledge.},
author = {Xu, Yige and Qiu, Xipeng and Zhou, Ligao and Huang, Xuanjing},
biburl = {https://www.bibsonomy.org/bibtex/26154b58da6b9c3a62bcf7df2c98c8a1c/parismic},
keywords = {BERT fine-tuning},
note = {cite arxiv:2002.10345. Comment: 7 pages, 6 figures},
title = {Improving BERT Fine-Tuning via Self-Ensemble and Self-Distillation},
url = {http://arxiv.org/abs/2002.10345},
year = 2020
}