Y. Chai, S. Jin, and X. Hou. Highway Transformer: Self-Gating Enhanced Self-Attentive Networks. In Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, pages 6887–6900, Online, July 2020. Association for Computational Linguistics.
Abstract
Self-attention mechanisms have made striking state-of-the-art (SOTA) progress in various sequence learning tasks, building on multi-headed dot-product attention that attends to all global contexts at different locations. Through a pseudo information highway, we introduce a gated component, the self-dependency unit (SDU), which incorporates LSTM-styled gating units to replenish the internal semantic importance within the multi-dimensional latent space of individual representations. The subsidiary content-based SDU gates allow modulated latent embeddings to flow through skip connections, yielding a clear margin in convergence speed under gradient descent. We unveil the role of the gating mechanism in aiding context-based Transformer modules, hypothesizing that SDU gates, especially on shallow layers, push the model faster toward suboptimal points during optimization.
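For illustration, a minimal sketch of an LSTM-styled self-dependency gate is given below, based only on the abstract's description (a content-based sigmoid gate over a per-token candidate, with a skip connection). The class and parameter names (SDUGate, gate_proj, value_proj) are hypothetical and do not come from the authors' implementation.

import torch
import torch.nn as nn

class SDUGate(nn.Module):
    """Hypothetical content-based gate that modulates each token's own representation."""
    def __init__(self, d_model: int):
        super().__init__()
        self.gate_proj = nn.Linear(d_model, d_model)   # sigmoid gate, LSTM-style
        self.value_proj = nn.Linear(d_model, d_model)  # candidate values

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # The gate depends only on the token itself (self-dependency),
        # not on other positions as in self-attention.
        g = torch.sigmoid(self.gate_proj(x))
        h = torch.tanh(self.value_proj(x))
        # Skip connection: pass the input through alongside the gated values.
        return x + g * h

Per the abstract, such a unit would sit alongside the self-attention sublayers of a Transformer block, with the claimed benefit being faster convergence, particularly in shallow layers.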
@inproceedings{chai-etal-2020-highway,
address = {Online},
author = {Chai, Yekun and Jin, Shuo and Hou, Xinwen},
booktitle = {Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics},
month = jul,
pages = {6887--6900},
publisher = {Association for Computational Linguistics},
title = {Highway Transformer: Self-Gating Enhanced Self-Attentive Networks},
url = {https://www.aclweb.org/anthology/2020.acl-main.616},
year = 2020
}