We provide a simple proof of the convergence of the optimization algorithms
Adam and Adagrad with the assumptions of smooth gradients and almost sure
uniform bound on the $\ell_\infty$ norm of the gradients. This work builds on
the techniques introduced by Ward et al. (2019) and extends them to the Adam
optimizer. We show that in expectation, the squared norm of the objective
gradient averaged over the trajectory has an upper-bound which is explicit in
the constants of the problem, parameters of the optimizer and the total number
of iterations N. This bound can be made arbitrarily small. In particular, Adam
with a learning rate $\alpha=1/\sqrt{N}$ and a momentum parameter on squared
gradients $\beta_2=1 - 1/N$ achieves the same rate of convergence
$O(\ln(N)/\sqrt{N})$ as Adagrad. Thus, it is possible to use Adam as a finite
horizon version of Adagrad, much like constant step size SGD can be used
instead of its asymptotically converging decaying step size version.
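As a concrete reading of the parameter choice above, the following is a minimal sketch (not the authors' code) that wires $\alpha=1/\sqrt{N}$ and $\beta_2=1 - 1/N$ into PyTorch's torch.optim.Adam for a known horizon N. The horizon N = 10000, the toy quadratic objective, and $\beta_1=0.9$ are illustrative assumptions, not values fixed by the paper.

import math
import torch

N = 10_000                  # total number of iterations (the known, finite horizon)
alpha = 1.0 / math.sqrt(N)  # learning rate alpha = 1/sqrt(N)
beta2 = 1.0 - 1.0 / N       # momentum on squared gradients beta_2 = 1 - 1/N

x = torch.randn(10, requires_grad=True)
opt = torch.optim.Adam([x], lr=alpha, betas=(0.9, beta2))  # beta1 = 0.9 is an assumption

for _ in range(N):
    opt.zero_grad()
    loss = 0.5 * (x ** 2).sum()  # smooth toy objective; grad F(x) = x
    loss.backward()
    opt.step()

# For this objective the squared gradient norm equals ||x||^2.
print(f"final squared gradient norm: {float((x.detach() ** 2).sum()):.3e}")

With this schedule, the abstract's claim is that the trajectory-averaged expected squared gradient norm decays as $O(\ln(N)/\sqrt{N})$, matching Adagrad; the single run on a quadratic above merely illustrates the finite-horizon setup.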
Description: [2003.02395] On the Convergence of Adam and Adagrad
%0 Journal Article
%1 defossez2020convergence
%A Défossez, Alexandre
%A Bottou, Léon
%A Bach, Francis
%A Usunier, Nicolas
%D 2020
%K convergence optimization
%T On the Convergence of Adam and Adagrad
%U http://arxiv.org/abs/2003.02395
%X We provide a simple proof of the convergence of the optimization algorithms
Adam and Adagrad with the assumptions of smooth gradients and almost sure
uniform bound on the $\ell_\infty$ norm of the gradients. This work builds on
the techniques introduced by Ward et al. (2019) and extends them to the Adam
optimizer. We show that in expectation, the squared norm of the objective
gradient averaged over the trajectory has an upper-bound which is explicit in
the constants of the problem, parameters of the optimizer and the total number
of iterations N. This bound can be made arbitrarily small. In particular, Adam
with a learning rate $\alpha=1/\sqrt{N}$ and a momentum parameter on squared
gradients $\beta_2=1 - 1/N$ achieves the same rate of convergence
$O(\ln(N)/\sqrt{N})$ as Adagrad. Thus, it is possible to use Adam as a finite
horizon version of Adagrad, much like constant step size SGD can be used
instead of its asymptotically converging decaying step size version.
@article{defossez2020convergence,
abstract = {We provide a simple proof of the convergence of the optimization algorithms
Adam and Adagrad with the assumptions of smooth gradients and almost sure
uniform bound on the $\ell_\infty$ norm of the gradients. This work builds on
the techniques introduced by Ward et al. (2019) and extends them to the Adam
optimizer. We show that in expectation, the squared norm of the objective
gradient averaged over the trajectory has an upper-bound which is explicit in
the constants of the problem, parameters of the optimizer and the total number
of iterations N. This bound can be made arbitrarily small. In particular, Adam
with a learning rate $\alpha=1/\sqrt{N}$ and a momentum parameter on squared
gradients $\beta_2=1 - 1/N$ achieves the same rate of convergence
$O(\ln(N)/\sqrt{N})$ as Adagrad. Thus, it is possible to use Adam as a finite
horizon version of Adagrad, much like constant step size SGD can be used
instead of its asymptotically converging decaying step size version.},
added-at = {2020-03-15T03:58:44.000+0100},
author = {Défossez, Alexandre and Bottou, Léon and Bach, Francis and Usunier, Nicolas},
biburl = {https://www.bibsonomy.org/bibtex/296a5069bf1d0f4f81088ddc30c54b159/kirk86},
description = {[2003.02395] On the Convergence of Adam and Adagrad},
interhash = {7bc07118493087a31d4bfd88659e15f5},
intrahash = {96a5069bf1d0f4f81088ddc30c54b159},
keywords = {convergence optimization},
note = {cite arxiv:2003.02395. Comment: 19 pages, 0 figures, preprint version},
timestamp = {2020-03-15T03:58:44.000+0100},
title = {On the Convergence of Adam and Adagrad},
url = {http://arxiv.org/abs/2003.02395},
year = 2020
}