Batch Normalization (BatchNorm) is a widely adopted technique that enables
faster and more stable training of deep neural networks (DNNs). Despite its
pervasiveness, the exact reasons for BatchNorm's effectiveness are still poorly
understood. The popular belief is that this effectiveness stems from
controlling the change of the layers' input distributions during training to
reduce the so-called "internal covariate shift". In this work, we demonstrate
that such distributional stability of layer inputs has little to do with the
success of BatchNorm. Instead, we uncover a more fundamental impact of
BatchNorm on the training process: it makes the optimization landscape
significantly smoother. This smoothness induces a more predictive and stable
behavior of the gradients, allowing for faster training.
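For reference, the BatchNorm transform the abstract refers to normalizes each feature over the mini-batch and then applies a learned per-feature scale and shift. A minimal NumPy sketch of the training-mode forward pass (the eps value, batch shape, and initialization below are illustrative assumptions, not taken from the paper):

import numpy as np

def batch_norm_forward(x, gamma, beta, eps=1e-5):
    # x: (batch_size, num_features) activations of one layer
    mean = x.mean(axis=0)                     # per-feature mean over the mini-batch
    var = x.var(axis=0)                       # per-feature variance over the mini-batch
    x_hat = (x - mean) / np.sqrt(var + eps)   # zero-mean, unit-variance activations
    return gamma * x_hat + beta               # learned scale and shift

# Example: normalize a random batch of 64 activations with 128 features
x = np.random.randn(64, 128)
gamma, beta = np.ones(128), np.zeros(128)
y = batch_norm_forward(x, gamma, beta)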
@misc{citeulike:14610092,
  archiveprefix = {arXiv},
  author = {Santurkar, Shibani and Tsipras, Dimitris and Ilyas, Andrew and Madry, Aleksander},
  eprint = {1805.11604},
  keywords = {arch regularization},
  title = {{How Does Batch Normalization Help Optimization?}},
  url = {http://arxiv.org/abs/1805.11604},
  year = 2018
}