@inproceedings{57d85679277d4c3a904dbe344726de6e,
title = "Scheduled drophead: A regularization method for transformer models",
abstract = "We introduce DropHead, a structured dropout method specifically designed for regularizing the multi-head attention mechanism which is a key component of transformer. In contrast to the conventional dropout mechanism which randomly drops units or connections, DropHead drops entire attention heads during training to prevent the multi-head attention model from being dominated by a small portion of attention heads. It can help reduce the risk of overfitting and allow the models to better benefit from the multi-head attention. Given the interaction between multi-headedness and training dynamics, we further propose a novel dropout rate scheduler to adjust the dropout rate of DropHead throughout training, which results in a better regularization effect. Experimental results demonstrate that our proposed approach can improve transformer models by 0.9 BLEU score on WMT14 En-De translation task and around 1.0 accuracy for various text classification tasks.",
author = "Wangchunshu Zhou and Tao Ge and Ke Xu and Furu Wei and Ming Zhou",
note = "Publisher Copyright: {\textcopyright} 2020 Association for Computational Linguistics; Findings of the Association for Computational Linguistics, ACL 2020: EMNLP 2020 ; Conference date: 16-11-2020 Through 20-11-2020",
year = "2020",
language = "英语",
series = "Findings of the Association for Computational Linguistics Findings of ACL: EMNLP 2020",
publisher = "Association for Computational Linguistics (ACL)",
pages = "1971--1980",
booktitle = "Findings of the Association for Computational Linguistics Findings of ACL",
address = "澳大利亚",
}