Common recurrent neural network architectures scale poorly due to the
intrinsic difficulty in parallelizing their state computations. In this work,
we propose the Simple Recurrent Unit (SRU) architecture, a recurrent unit that
simplifies the computation and exposes more parallelism. In SRU, the majority
of computation for each step is independent of the recurrence and can be easily
parallelized. SRU is as fast as a convolutional layer and 5-10x faster than an
optimized LSTM implementation. We study SRUs on a wide range of applications,
including classification, question answering, language modeling, translation,
and speech recognition. Our experiments demonstrate the effectiveness of SRU
and the trade-off it enables between speed and performance. We open-source our
implementation in PyTorch and CNTK.
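
The claim that most per-step computation is independent of the recurrence follows
from SRU's structure: the heavy matrix multiplications read only the input at each
step and can therefore be batched across the whole sequence, leaving only a light
element-wise recurrence over the internal state. Below is a minimal PyTorch sketch
of that split; the class name, tensor shapes, single-layer setup, and the assumption
that input and hidden sizes match (so the highway connection is well-defined) are
illustrative choices, not the paper's actual interface or its optimized CUDA kernel.

import torch
import torch.nn as nn

class SRUCellSketch(nn.Module):
    def __init__(self, input_size: int, hidden_size: int):
        super().__init__()
        # One fused projection produces the candidate state and the forget and
        # reset gate pre-activations. It reads only the inputs, never the
        # previous state, so it can be applied to all time steps at once.
        self.proj = nn.Linear(input_size, 3 * hidden_size)
        self.hidden_size = hidden_size

    def forward(self, x, c0=None):
        # x: (seq_len, batch, input_size); assumes input_size == hidden_size
        # so the highway connection below is well-defined.
        seq_len, batch, _ = x.shape
        if c0 is None:
            c0 = x.new_zeros(batch, self.hidden_size)

        # Parallel part: batched matrix multiplications over the full sequence.
        u = self.proj(x)                              # (seq_len, batch, 3*hidden)
        x_tilde, f_pre, r_pre = u.chunk(3, dim=-1)
        f = torch.sigmoid(f_pre)                      # forget gate
        r = torch.sigmoid(r_pre)                      # reset gate

        # Sequential part: element-wise recurrence only, no matrix products.
        c = c0
        outputs = []
        for t in range(seq_len):
            c = f[t] * c + (1.0 - f[t]) * x_tilde[t]
            h = r[t] * torch.tanh(c) + (1.0 - r[t]) * x[t]   # highway output
            outputs.append(h)
        return torch.stack(outputs), c

# Hypothetical usage: a 50-step sequence, batch of 8, 128-dimensional features.
# cell = SRUCellSketch(input_size=128, hidden_size=128)
# h, c = cell(torch.randn(50, 8, 128))

Because the loop body contains only element-wise operations, it is cheap to fuse
into a single fast kernel, which is how the paper reports CNN-like speed despite
keeping a recurrence.
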
@misc{lei2017training,
  author   = {Lei, Tao and Zhang, Yu and Artzi, Yoav},
  title    = {Training RNNs as Fast as CNNs},
  year     = {2017},
  url      = {http://arxiv.org/abs/1709.02755},
  note     = {arXiv:1709.02755; submission version},
  keywords = {cnn deep_learning optimization rnn}
}