Batch normalization has multiple benefits. It improves the conditioning of
the loss landscape, and is a surprisingly effective regularizer. However, the
most important benefit of batch normalization arises in residual networks,
where it dramatically increases the largest trainable depth. We identify the
origin of this benefit: At initialization, batch normalization downscales the
residual branch relative to the skip connection by a normalizing factor
proportional to the square root of the network depth. This ensures that, early
in training, the function computed by deep normalized residual networks is
dominated by shallow paths with well-behaved gradients. We use this insight to
develop a simple initialization scheme which can train very deep residual
networks without normalization. We also clarify that, although batch
normalization does enable stable training with larger learning rates, this
benefit is only useful when one wishes to parallelize training over large batch
sizes. Our results help isolate the distinct benefits of batch normalization in
different architectures.
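
To make the scaling argument concrete, here is a minimal NumPy sketch (an editor's illustration, not code from the paper): a toy linear residual network in which each block applies batch normalization followed by a random linear map. The width, depth, and the `batchnorm` helper are arbitrary illustrative assumptions. The skip path accumulates variance roughly linearly with depth while batch normalization pins the branch input at unit variance, so the branch is suppressed relative to the skip path by a factor on the order of the square root of the depth:

```python
import numpy as np

rng = np.random.default_rng(0)
depth, width, batch = 64, 256, 1024

def batchnorm(h, eps=1e-5):
    # Normalize each feature to zero mean and unit variance over the batch,
    # as batch normalization does at initialization (scale = 1, shift = 0).
    return (h - h.mean(axis=0)) / (h.std(axis=0) + eps)

x = rng.standard_normal((batch, width))  # unit-variance input
for l in range(1, depth + 1):
    # Fan-in scaling keeps the linear layer's output near unit variance.
    w = rng.standard_normal((width, width)) / np.sqrt(width)
    branch = batchnorm(x) @ w  # residual branch: normalize, then transform
    x = x + branch             # skip connection
    if l in (1, 4, 16, 64):
        ratio = branch.var() / x.var()
        print(f"block {l:2d}: Var(branch)/Var(output) = {ratio:.3f} "
              f"(expected ~ 1/{l + 1})")
```

The variance ratio decays like 1/depth, i.e. the branch's standard deviation is downscaled relative to the skip path by roughly the square root of the depth. The initialization scheme the abstract refers to (SkipInit in the paper) reproduces this bias toward shallow paths without normalization, by scaling each residual branch with a learnable scalar initialized to zero.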
@article{de2020batch,
archiveprefix = {arXiv},
author = {De, Soham and Smith, Samuel L.},
eprint = {2002.10444},
keywords = {deep-learning optimization readings},
title = {Batch Normalization Biases Deep Residual Networks Towards Shallow Paths},
url = {http://arxiv.org/abs/2002.10444},
year = 2020
}