The capacity of a neural network to absorb information is limited by its
number of parameters. Conditional computation, where parts of the network are
active on a per-example basis, has been proposed in theory as a way of
dramatically increasing model capacity without a proportional increase in
computation. In practice, however, there are significant algorithmic and
performance challenges. In this work, we address these challenges and finally
realize the promise of conditional computation, achieving greater than 1000x
improvements in model capacity with only minor losses in computational
efficiency on modern GPU clusters. We introduce a Sparsely-Gated
Mixture-of-Experts layer (MoE), consisting of up to thousands of feed-forward
sub-networks. A trainable gating network determines a sparse combination of
these experts to use for each example. We apply the MoE to the tasks of
language modeling and machine translation, where model capacity is critical for
absorbing the vast quantities of knowledge available in the training corpora.
We present model architectures in which a MoE with up to 137 billion parameters
is applied convolutionally between stacked LSTM layers. On large language
modeling and machine translation benchmarks, these models achieve significantly
better results than state-of-the-art at lower computational cost.
Description
[1701.06538] Outrageously Large Neural Networks: The Sparsely-Gated Mixture-of-Experts Layer
%0 Generic
%1 shazeer2017outrageously
%A Shazeer, Noam
%A Mirhoseini, Azalia
%A Maziarz, Krzysztof
%A Davis, Andy
%A Le, Quoc
%A Hinton, Geoffrey
%A Dean, Jeff
%D 2017
%K large_model
%T Outrageously Large Neural Networks: The Sparsely-Gated
Mixture-of-Experts Layer
%U http://arxiv.org/abs/1701.06538
%X The capacity of a neural network to absorb information is limited by its
number of parameters. Conditional computation, where parts of the network are
active on a per-example basis, has been proposed in theory as a way of
dramatically increasing model capacity without a proportional increase in
computation. In practice, however, there are significant algorithmic and
performance challenges. In this work, we address these challenges and finally
realize the promise of conditional computation, achieving greater than 1000x
improvements in model capacity with only minor losses in computational
efficiency on modern GPU clusters. We introduce a Sparsely-Gated
Mixture-of-Experts layer (MoE), consisting of up to thousands of feed-forward
sub-networks. A trainable gating network determines a sparse combination of
these experts to use for each example. We apply the MoE to the tasks of
language modeling and machine translation, where model capacity is critical for
absorbing the vast quantities of knowledge available in the training corpora.
We present model architectures in which a MoE with up to 137 billion parameters
is applied convolutionally between stacked LSTM layers. On large language
modeling and machine translation benchmarks, these models achieve significantly
better results than state-of-the-art at lower computational cost.
% arXiv preprint (no journal publication at export time), hence @misc.
% arXiv identifier moved from `note` into canonical eprint fields so
% biblatex/natbib arXiv-aware styles render "arXiv:1701.06538 [cs.LG]".
% BibSonomy provenance fields (biburl, interhash, intrahash, added-at,
% timestamp, description) are kept: unknown fields are ignored by styles.
@misc{shazeer2017outrageously,
  abstract      = {The capacity of a neural network to absorb information is limited by its
number of parameters. Conditional computation, where parts of the network are
active on a per-example basis, has been proposed in theory as a way of
dramatically increasing model capacity without a proportional increase in
computation. In practice, however, there are significant algorithmic and
performance challenges. In this work, we address these challenges and finally
realize the promise of conditional computation, achieving greater than 1000x
improvements in model capacity with only minor losses in computational
efficiency on modern GPU clusters. We introduce a Sparsely-Gated
Mixture-of-Experts layer (MoE), consisting of up to thousands of feed-forward
sub-networks. A trainable gating network determines a sparse combination of
these experts to use for each example. We apply the MoE to the tasks of
language modeling and machine translation, where model capacity is critical for
absorbing the vast quantities of knowledge available in the training corpora.
We present model architectures in which a MoE with up to 137 billion parameters
is applied convolutionally between stacked LSTM layers. On large language
modeling and machine translation benchmarks, these models achieve significantly
better results than state-of-the-art at lower computational cost.},
  added-at      = {2019-05-22T15:59:00.000+0200},
  archiveprefix = {arXiv},
  author        = {Shazeer, Noam and Mirhoseini, Azalia and Maziarz, Krzysztof and Davis, Andy and Le, Quoc and Hinton, Geoffrey and Dean, Jeff},
  biburl        = {https://www.bibsonomy.org/bibtex/2bb169ae36d579afa7dbe9bb9451271dc/straybird321},
  description   = {[1701.06538] Outrageously Large Neural Networks: The Sparsely-Gated Mixture-of-Experts Layer},
  eprint        = {1701.06538},
  interhash     = {cc05819387ba513c5f22be3772b2e4f6},
  intrahash     = {bb169ae36d579afa7dbe9bb9451271dc},
  keywords      = {large_model},
  primaryclass  = {cs.LG},
  timestamp     = {2019-05-22T15:59:00.000+0200},
  title         = {Outrageously Large Neural Networks: The {Sparsely-Gated} {Mixture-of-Experts} Layer},
  url           = {http://arxiv.org/abs/1701.06538},
  year          = {2017},
}