Very deep convolutional networks with hundreds of layers have led to
significant reductions in error on competitive benchmarks. Although the
unmatched expressiveness of the many layers can be highly desirable at test
time, training very deep networks comes with its own set of challenges. The
gradients can vanish, the forward flow often diminishes, and the training time
can be painfully slow. To address these problems, we propose stochastic depth,
a training procedure that enables the seemingly contradictory setup to train
short networks and use deep networks at test time. We start with very deep
networks but during training, for each mini-batch, randomly drop a subset of
layers and bypass them with the identity function. This simple approach
complements the recent success of residual networks. It reduces training time
substantially and improves the test error significantly on almost all data sets
that we used for evaluation. With stochastic depth we can increase the depth of
residual networks even beyond 1200 layers and still yield meaningful
improvements in test error (4.91% on CIFAR-10).
%0 Generic
%1 huang2016networks
%A Huang, Gao
%A Sun, Yu
%A Liu, Zhuang
%A Sedra, Daniel
%A Weinberger, Kilian
%D 2016
%K 2016 arxiv cornell deep-learning
%T Deep Networks with Stochastic Depth
%U http://arxiv.org/abs/1603.09382
%X Very deep convolutional networks with hundreds of layers have led to
significant reductions in error on competitive benchmarks. Although the
unmatched expressiveness of the many layers can be highly desirable at test
time, training very deep networks comes with its own set of challenges. The
gradients can vanish, the forward flow often diminishes, and the training time
can be painfully slow. To address these problems, we propose stochastic depth,
a training procedure that enables the seemingly contradictory setup to train
short networks and use deep networks at test time. We start with very deep
networks but during training, for each mini-batch, randomly drop a subset of
layers and bypass them with the identity function. This simple approach
complements the recent success of residual networks. It reduces training time
substantially and improves the test error significantly on almost all data sets
that we used for evaluation. With stochastic depth we can increase the depth of
residual networks even beyond 1200 layers and still yield meaningful
improvements in test error (4.91% on CIFAR-10).
% arXiv preprint record (BibSonomy export) for "Deep Networks with Stochastic Depth".
% The arXiv identifier lives in eprint/archiveprefix rather than in the note field,
% and the percent sign in the abstract is escaped so the field is safe to typeset.
% Fields from added-at downwards are BibSonomy bookkeeping and are ignored by styles.
@misc{huang2016networks,
  author        = {Huang, Gao and Sun, Yu and Liu, Zhuang and Sedra, Daniel and Weinberger, Kilian},
  title         = {Deep Networks with Stochastic Depth},
  year          = {2016},
  eprint        = {1603.09382},
  archiveprefix = {arXiv},
  url           = {http://arxiv.org/abs/1603.09382},
  note          = {First two authors contributed equally},
  abstract      = {Very deep convolutional networks with hundreds of layers have led to
significant reductions in error on competitive benchmarks. Although the
unmatched expressiveness of the many layers can be highly desirable at test
time, training very deep networks comes with its own set of challenges. The
gradients can vanish, the forward flow often diminishes, and the training time
can be painfully slow. To address these problems, we propose stochastic depth,
a training procedure that enables the seemingly contradictory setup to train
short networks and use deep networks at test time. We start with very deep
networks but during training, for each mini-batch, randomly drop a subset of
layers and bypass them with the identity function. This simple approach
complements the recent success of residual networks. It reduces training time
substantially and improves the test error significantly on almost all data sets
that we used for evaluation. With stochastic depth we can increase the depth of
residual networks even beyond 1200 layers and still yield meaningful
improvements in test error (4.91\% on CIFAR-10).},
  keywords      = {2016 arxiv cornell deep-learning},
  description   = {[1603.09382] Deep Networks with Stochastic Depth},
  added-at      = {2018-05-01T19:46:25.000+0200},
  timestamp     = {2018-05-01T19:46:25.000+0200},
  biburl        = {https://www.bibsonomy.org/bibtex/2e279e707bdf836ed7d4a986af8f4736a/achakraborty},
  interhash     = {74d045b492f03278bf26c7ccf59b4dea},
  intrahash     = {e279e707bdf836ed7d4a986af8f4736a},
}