Preconditioned gradient methods are among the most general and powerful tools
in optimization. However, preconditioning requires storing and manipulating
prohibitively large matrices. We describe and analyze a new structure-aware
preconditioning algorithm, called Shampoo, for stochastic optimization over
tensor spaces. Shampoo maintains a set of preconditioning matrices, each of
which operates on a single dimension, contracting over the remaining
dimensions. We establish convergence guarantees in the stochastic convex
setting, the proof of which builds upon matrix trace inequalities. Our
experiments with state-of-the-art deep learning models show that Shampoo is
capable of converging considerably faster than commonly used optimizers.
Although it involves a more complex update rule, Shampoo's runtime per step is
comparable to that of simple gradient methods such as SGD, AdaGrad, and Adam.
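As a rough illustration of the update described in the abstract, below is a minimal NumPy sketch of Shampoo's matrix (order-2) case: the optimizer accumulates the statistics G Gᵀ and Gᵀ G and preconditions the gradient by their inverse fourth roots. The function names, learning rate, and the `eps` ridge term are illustrative choices made here, not details taken from the paper.

```python
import numpy as np

def inv_fourth_root(mat, eps=1e-6):
    # Raise a symmetric PSD matrix to the power -1/4 via eigendecomposition,
    # adding a small ridge term (an assumption here) for numerical stability.
    w, v = np.linalg.eigh(mat + eps * np.eye(mat.shape[0]))
    return (v * np.maximum(w, eps) ** -0.25) @ v.T

def shampoo_matrix_step(W, G, L, R, lr=1e-2):
    # One sketch of a Shampoo step for a matrix parameter W with gradient G:
    # accumulate left/right second-moment statistics, then precondition G
    # on both sides by the inverse fourth roots of the accumulators.
    L = L + G @ G.T            # (m x m) left statistics
    R = R + G.T @ G            # (n x n) right statistics
    W = W - lr * inv_fourth_root(L) @ G @ inv_fourth_root(R)
    return W, L, R
```

For higher-order tensors, the same construction keeps one such accumulator per dimension and contracts the gradient over all remaining dimensions, which is what lets the method avoid forming a single full-size preconditioner.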
@misc{gupta2018shampoo,
author = {Gupta, Vineet and Koren, Tomer and Singer, Yoram},
note = {arXiv:1802.09568},
title = {Shampoo: Preconditioned Stochastic Tensor Optimization},
url = {http://arxiv.org/abs/1802.09568},
year = 2018
}