Training large neural networks requires distributing learning across multiple
workers, where the cost of communicating gradients can be a significant
bottleneck. signSGD alleviates this problem by transmitting just the sign of
each minibatch stochastic gradient. We prove that it can get the best of both
worlds: compressed gradients and SGD-level convergence rate. signSGD can
exploit mismatches between L1 and L2 geometry: when noise and curvature are
much sparser than the gradients, signSGD is expected to converge at the same
rate or faster than full-precision SGD. Measurements of the L1 versus L2
geometry of real networks support our theoretical claims, and we find that the
momentum counterpart of signSGD is able to match the accuracy and convergence
speed of Adam on deep ImageNet models. We extend our theory to the distributed
setting, where the parameter server uses majority vote to aggregate gradient
signs from each worker, enabling 1-bit compression of worker-server
communication in both directions. Using a theorem of Gauss, we prove that the
non-convex convergence rate of majority vote matches that of distributed SGD.
Thus, there is great promise for sign-based optimisation schemes to achieve
both communication efficiency and high accuracy.
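
As a concrete illustration of the updates the abstract describes, below is a
minimal NumPy sketch of the single-worker sign update, its momentum
counterpart (called Signum in the paper), and the majority-vote aggregation.
Function names, the beta default, and variable shapes are illustrative
placeholders, not taken from the authors' released code.

import numpy as np

def signsgd_step(params, grad, lr):
    # signSGD: update each coordinate by the sign of its stochastic
    # gradient, so only 1 bit per coordinate needs to be communicated.
    return params - lr * np.sign(grad)

def signum_step(params, grad, buf, lr, beta=0.9):
    # Momentum counterpart: step in the sign of an exponential moving
    # average of gradients (beta=0.9 is a placeholder default).
    buf = beta * buf + (1.0 - beta) * grad
    return params - lr * np.sign(buf), buf

def majority_vote_step(params, worker_grads, lr):
    # Distributed variant: each of M workers sends sign(g_m) to the
    # parameter server (1 bit per coordinate); the server sums the signs
    # and broadcasts the elementwise majority sign back, again 1 bit
    # per coordinate.
    signs = np.sign(np.asarray(worker_grads))  # shape (M, d)
    vote = np.sign(signs.sum(axis=0))          # elementwise majority
    return params - lr * vote

Under this sketch, worker-to-server and server-to-worker traffic both shrink
to one bit per parameter, which is the two-way compression the abstract
claims for majority vote.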
Description
signSGD: compressed optimisation for non-convex problems
@misc{bernstein2018signsgd,
author = {Bernstein, Jeremy and Wang, Yu-Xiang and Azizzadenesheli, Kamyar and Anandkumar, Anima},
description = {signSGD: compressed optimisation for non-convex problems},
keywords = {SGD compression optimization},
note = {cite arxiv:1802.04434},
title = {signSGD: compressed optimisation for non-convex problems},
url = {http://arxiv.org/abs/1802.04434},
year = 2018
}