The Momentum Stochastic Gradient Descent (MSGD) algorithm has been widely
applied to many nonconvex optimization problems in machine learning; popular
examples include training deep neural networks and dimensionality reduction.
Due to the lack of convexity and the extra momentum term, the optimization
theory of MSGD is still largely unknown. In this paper, we study this
fundamental optimization algorithm through the so-called "strict saddle
problem." By a diffusion-approximation-type analysis, our study shows that the
momentum helps escape from saddle points but hurts convergence within the
neighborhood of optima (in the absence of step size annealing). Our theoretical
discovery partially corroborates the empirical success of MSGD in training deep
neural networks. Moreover, our analysis applies the martingale method and the
"Fixed-State-Chain" method from the stochastic approximation literature, which
are of independent interest.
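For context, below is a minimal sketch of a heavy-ball style MSGD update on a
toy strict-saddle objective. The objective f(x, y) = (x^2 - 1)^2 + y^2, the
momentum parameter, the step size, and the Gaussian gradient noise are all
illustrative assumptions for this sketch, not details taken from the paper.

```python
import numpy as np

def grad(x):
    # Gradient of f(x, y) = (x^2 - 1)^2 + y^2, a toy strict-saddle
    # objective with a saddle point at (0, 0) and minima at (+/-1, 0).
    return np.array([4.0 * x[0] * (x[0] ** 2 - 1.0), 2.0 * x[1]])

def msgd(x0, lr=0.01, momentum=0.9, noise=0.1, steps=5000, seed=0):
    # Heavy-ball style momentum SGD; the gradient noise is a stand-in
    # for the stochasticity of mini-batch gradients.
    rng = np.random.default_rng(seed)
    x, v = np.array(x0, dtype=float), np.zeros(2)
    for _ in range(steps):
        g = grad(x) + noise * rng.standard_normal(2)  # stochastic gradient
        v = momentum * v - lr * g                      # momentum update
        x = x + v
    return x

# Starting near the saddle point (0, 0): the momentum term tends to speed
# escape from the saddle neighborhood, while without step size annealing it
# also enlarges the fluctuations around the minima.
print(msgd([1e-3, 1e-3], momentum=0.9))
print(msgd([1e-3, 1e-3], momentum=0.0))
```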
Description
Toward Deeper Understanding of Nonconvex Stochastic Optimization with
Momentum using Diffusion Approximations
@misc{liu2018toward,
abstract = {The Momentum Stochastic Gradient Descent (MSGD) algorithm has been widely
applied to many nonconvex optimization problems in machine learning; popular
examples include training deep neural networks and dimensionality reduction.
Due to the lack of convexity and the extra momentum term, the optimization
theory of MSGD is still largely unknown. In this paper, we study this
fundamental optimization algorithm through the so-called "strict saddle
problem." By a diffusion-approximation-type analysis, our study shows that the
momentum helps escape from saddle points but hurts convergence within the
neighborhood of optima (in the absence of step size annealing). Our theoretical
discovery partially corroborates the empirical success of MSGD in training deep
neural networks. Moreover, our analysis applies the martingale method and the
"Fixed-State-Chain" method from the stochastic approximation literature, which
are of independent interest.},
added-at = {2018-02-16T09:04:11.000+0100},
author = {Liu, Tianyi and Chen, Zhehui and Zhou, Enlu and Zhao, Tuo},
biburl = {https://www.bibsonomy.org/bibtex/28f7e25c7ba7c1a7d9001d6a0da3e99be/jk_itwm},
description = {Toward Deeper Understanding of Nonconvex Stochastic Optimization with
Momentum using Diffusion Approximations},
interhash = {356d5f9d19553b098624e5db359abbb8},
intrahash = {8f7e25c7ba7c1a7d9001d6a0da3e99be},
keywords = {SGD optimization theory},
note = {cite arxiv:1802.05155},
timestamp = {2018-02-16T09:04:11.000+0100},
title = {Toward Deeper Understanding of Nonconvex Stochastic Optimization with
Momentum using Diffusion Approximations},
url = {http://arxiv.org/abs/1802.05155},
year = 2018
}