The stochastic gradient descent (SGD) method and its variants are algorithms
of choice for many Deep Learning tasks. These methods operate in a small-batch
regime wherein a fraction of the training data, say $32$-$512$ data points, is
sampled to compute an approximation to the gradient. It has been observed in
practice that when using a larger batch there is a degradation in the quality
of the model, as measured by its ability to generalize. We investigate the
cause for this generalization drop in the large-batch regime and present
numerical evidence that supports the view that large-batch methods tend to
converge to sharp minimizers of the training and testing functions - and as is
well known, sharp minima lead to poorer generalization. In contrast,
small-batch methods consistently converge to flat minimizers, and our
experiments support a commonly held view that this is due to the inherent noise
in the gradient estimation. We discuss several strategies to attempt to help
large-batch methods eliminate this generalization gap.
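As a rough illustration of the mini-batch gradient estimation described in the abstract (a generic sketch, not the authors' code; the synthetic least-squares problem and all names below are assumptions made for the example), the following Python snippet estimates the gradient from random mini-batches and shows how the estimate's noise shrinks as the batch size grows:

# A minimal sketch (not the paper's code) of mini-batch gradient estimation:
# the gradient of a simple least-squares loss is estimated from a random
# subset of the data, and the estimate's noise shrinks as the batch grows.
import numpy as np

rng = np.random.default_rng(0)

# Synthetic regression problem: loss(w) = mean_i 0.5 * (x_i @ w - y_i)^2
n, d = 10_000, 20
X = rng.normal(size=(n, d))
w_true = rng.normal(size=d)
y = X @ w_true + 0.1 * rng.normal(size=n)

def batch_gradient(w, batch_size):
    """Gradient of the loss estimated from a random mini-batch of the data."""
    idx = rng.choice(n, size=batch_size, replace=False)
    Xb, yb = X[idx], y[idx]
    return Xb.T @ (Xb @ w - yb) / batch_size

w = np.zeros(d)
full_grad = X.T @ (X @ w - y) / n  # exact full-data gradient for comparison

for b in (32, 512, 8192):
    errs = [np.linalg.norm(batch_gradient(w, b) - full_grad) for _ in range(100)]
    print(f"batch {b:5d}: mean gradient-estimate error {np.mean(errs):.4f}")

On this toy problem the estimation error decays roughly like one over the square root of the batch size; this is the "inherent noise in the gradient estimation" that the abstract conjectures helps small-batch methods settle in flat, better-generalizing minimizers.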
@article{keskar2016largebatch,
abstract = {The stochastic gradient descent (SGD) method and its variants are algorithms
of choice for many Deep Learning tasks. These methods operate in a small-batch
regime wherein a fraction of the training data, say $32$-$512$ data points, is
sampled to compute an approximation to the gradient. It has been observed in
practice that when using a larger batch there is a degradation in the quality
of the model, as measured by its ability to generalize. We investigate the
cause for this generalization drop in the large-batch regime and present
numerical evidence that supports the view that large-batch methods tend to
converge to sharp minimizers of the training and testing functions - and as is
well known, sharp minima lead to poorer generalization. In contrast,
small-batch methods consistently converge to flat minimizers, and our
experiments support a commonly held view that this is due to the inherent noise
in the gradient estimation. We discuss several strategies to attempt to help
large-batch methods eliminate this generalization gap.},
author = {Keskar, Nitish Shirish and Mudigere, Dheevatsa and Nocedal, Jorge and Smelyanskiy, Mikhail and Tang, Ping Tak Peter},
biburl = {https://www.bibsonomy.org/bibtex/2d27090a1cbf1b2fc5d737f7f543f0aac/kirk86},
keywords = {deep-learning generalization},
note = {cite arxiv:1609.04836. Comment: Accepted as a conference paper at ICLR 2017},
title = {On Large-Batch Training for Deep Learning: Generalization Gap and Sharp Minima},
url = {http://arxiv.org/abs/1609.04836},
year = 2016
}