Deep learning thrives with large neural networks and large datasets. However,
larger networks and larger datasets result in longer training times that impede
research and development progress. Distributed synchronous SGD offers a
potential solution to this problem by dividing SGD minibatches over a pool of
parallel workers. Yet to make this scheme efficient, the per-worker workload
must be large, which implies nontrivial growth in the SGD minibatch size. In
this paper, we empirically show that on the ImageNet dataset large minibatches
cause optimization difficulties, but when these are addressed the trained
networks exhibit good generalization. Specifically, we show no loss of accuracy
when training with large minibatch sizes up to 8192 images. To achieve this
result, we adopt a hyper-parameter-free linear scaling rule for adjusting
learning rates as a function of minibatch size and develop a new warmup scheme
that overcomes optimization challenges early in training. With these simple
techniques, our Caffe2-based system trains ResNet-50 with a minibatch size of
8192 on 256 GPUs in one hour, while matching small minibatch accuracy. Using
commodity hardware, our implementation achieves ~90% scaling efficiency when
moving from 8 to 256 GPUs. Our findings enable training visual recognition
models on internet-scale data with high efficiency.
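The recipe summarized in the abstract (scale the learning rate linearly with the minibatch size, and ramp it up gradually at the start of training) is simple enough to sketch in a few lines. The snippet below is a minimal illustration, not the authors' Caffe2 implementation: it assumes the paper's reference setting of a 0.1 learning rate for a 256-image minibatch, a 5-epoch linear warmup, and a standard 90-epoch ImageNet schedule that divides the rate by 10 at epochs 30, 60, and 80; the function name and arguments are illustrative.

def scaled_lr(epoch, iteration, iters_per_epoch, batch_size,
              base_lr=0.1, base_batch=256, warmup_epochs=5,
              milestones=(30, 60, 80)):
    """Learning rate for the current step under linear scaling + gradual warmup."""
    # Linear scaling rule: multiply the reference learning rate by the ratio
    # of the global minibatch size to the reference minibatch size.
    target_lr = base_lr * batch_size / base_batch

    step = epoch * iters_per_epoch + iteration
    warmup_steps = warmup_epochs * iters_per_epoch
    if step < warmup_steps:
        # Gradual warmup: ramp linearly from base_lr to target_lr over the
        # first warmup_epochs epochs, one small increment per iteration.
        return base_lr + (target_lr - base_lr) * step / warmup_steps

    # After warmup, follow a conventional stepwise ImageNet schedule:
    # divide the learning rate by 10 at each milestone epoch.
    decay = sum(1 for m in milestones if epoch >= m)
    return target_lr * (0.1 ** decay)

# Example: with a global minibatch of 8192 (e.g. 32 images on each of 256 GPUs),
# the post-warmup rate is 0.1 * 8192 / 256 = 3.2.
print(scaled_lr(epoch=10, iteration=0, iters_per_epoch=156, batch_size=8192))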
Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour
@misc{goyal2017accurate,
author = {Goyal, Priya and Dollár, Piotr and Girshick, Ross and Noordhuis, Pieter and Wesolowski, Lukasz and Kyrola, Aapo and Tulloch, Andrew and Jia, Yangqing and He, Kaiming},
keywords = {deep dl large-scale networks neural},
note = {arXiv:1706.02677. Tech report (v2: correct typos)},
title = {Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour},
url = {http://arxiv.org/abs/1706.02677},
year = 2017
}