We analyze the variance of stochastic gradients along negative curvature
directions in certain non-convex machine learning models and show that
stochastic gradients exhibit a strong component along these directions.
Furthermore, we show that, contrary to the case of isotropic noise, this
variance is proportional to the magnitude of the corresponding eigenvalues
and does not decrease with the problem dimension. Based on this observation,
we propose a new assumption under which we show that the injection of
explicit, isotropic noise, usually applied to make gradient descent escape
saddle points, can successfully be replaced by a simple SGD step.
Additionally, under the same condition, we derive the first convergence rate
for plain SGD to a second-order stationary point in a number of iterations
that is independent of the problem dimension.
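As a rough illustration of the mechanism the abstract describes, the sketch below contrasts the classic escape strategy (injecting explicit isotropic noise whenever the full gradient is small) with a plain SGD step on a toy quadratic saddle whose stochastic-gradient noise is aligned with the negative-curvature direction. The objective, step size, thresholds, and all names are illustrative assumptions; this is not the paper's actual algorithm or analysis.

import numpy as np

rng = np.random.default_rng(0)

# Toy objective with a strict saddle at w = 0: f(w) = 0.5 * w^T H w,
# with one positive and one negative Hessian eigenvalue.
H = np.diag([1.0, -1.0])
e2 = np.array([0.0, 1.0])   # eigenvector of the negative eigenvalue

def stochastic_gradient(w):
    # Per-sample gradient H w + b with zero-mean noise b = z * e2 that is
    # aligned with the negative-curvature direction (the kind of property
    # the paper argues stochastic gradients of real ML models have).
    return H @ w + rng.standard_normal() * e2

def perturbed_gd_step(w, lr=0.1, grad_tol=1e-3, radius=1e-2):
    # Classic recipe: take the exact gradient and inject explicit isotropic
    # noise whenever the gradient is too small to make progress.
    g = H @ w
    if np.linalg.norm(g) < grad_tol:
        g = g + radius * rng.standard_normal(2)
    return w - lr * g

def sgd_step(w, lr=0.1):
    # The paper's observation: a plain SGD step already carries anisotropic
    # noise along negative curvature, so no extra perturbation is needed.
    return w - lr * stochastic_gradient(w)

w_sgd = np.zeros(2)   # start both iterates exactly at the saddle point
w_pgd = np.zeros(2)
for _ in range(50):
    w_sgd = sgd_step(w_sgd)
    w_pgd = perturbed_gd_step(w_pgd)
print(np.linalg.norm(w_sgd), np.linalg.norm(w_pgd))   # both move away from 0

On this quadratic toy both iterates leave the saddle; the abstract's point is that, under the proposed assumption, the noise already present in SGD suffices, and the resulting iteration count does not depend on the problem dimension.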
@misc{daneshmand2018escaping,
author = {Daneshmand, Hadi and Kohler, Jonas and Lucchi, Aurelien and Hofmann, Thomas},
keywords = {SGD optimization theory},
note = {cite arxiv:1803.05999},
title = {Escaping Saddles with Stochastic Gradients},
url = {http://arxiv.org/abs/1803.05999},
year = 2018
}