The predominant methodology for training deep learning models advocates the use of stochastic gradient descent methods (SGDs). Despite their ease of implementation, SGDs are difficult to tune and parallelize. These problems make it challenging to develop, debug, and scale up deep learning algorithms with SGDs. In this paper, we show that more sophisticated off-the-shelf optimization methods such as limited-memory BFGS (L-BFGS) and conjugate gradient (CG) with line search can significantly simplify and speed up the process of pretraining deep algorithms. In our experiments, the differences between L-BFGS/CG and SGDs are more pronounced when we consider algorithmic extensions (e.g., sparsity regularization) and hardware extensions (e.g., GPUs or computer clusters). Our experiments with distributed optimization support the use of L-BFGS with locally connected networks and convolutional neural networks. Using L-BFGS, our convolutional network model achieves 0.69% test error on the standard MNIST dataset, a state-of-the-art result on MNIST among algorithms that do not use distortions or pretraining.
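The contrast between SGD and a batch method such as L-BFGS is easiest to see in code. The sketch below is only an illustration under assumptions not taken from the paper: it uses PyTorch's torch.optim.LBFGS (the authors' 2011 implementation predates this library), a toy convolutional network, random stand-in data, and placeholder hyperparameters. It shows the closure-based training loop that a line-search method requires, since the objective must be re-evaluated several times within each parameter update.

# Hedged sketch: limited-memory BFGS with a strong-Wolfe line search on a small
# convolutional network. PyTorch is used purely for illustration; the architecture,
# batch, and hyperparameters below are assumptions, not the paper's configuration.
import torch
import torch.nn as nn

model = nn.Sequential(                     # toy convnet for 28x28 grayscale input
    nn.Conv2d(1, 8, kernel_size=5), nn.ReLU(), nn.MaxPool2d(2),
    nn.Flatten(), nn.Linear(8 * 12 * 12, 10),
)
loss_fn = nn.CrossEntropyLoss()

# Dummy large batch standing in for MNIST digits.
x = torch.randn(256, 1, 28, 28)
y = torch.randint(0, 10, (256,))

optimizer = torch.optim.LBFGS(
    model.parameters(),
    history_size=10,                       # limited memory: keep only 10 curvature pairs
    max_iter=20,                           # inner L-BFGS iterations per step() call
    line_search_fn="strong_wolfe",         # line search, as in the methods discussed above
)

def closure():
    # L-BFGS re-evaluates the objective during its line search, so the loss and
    # gradient computation are wrapped in a closure that step() can call repeatedly.
    optimizer.zero_grad()
    loss = loss_fn(model(x), y)
    loss.backward()
    return loss

for step in range(10):                     # a handful of large-batch outer steps
    loss = optimizer.step(closure)
    print(f"step {step}: loss {loss.item():.4f}")

Swapping in torch.optim.SGD removes the need for the closure but reintroduces a learning rate (and schedule) to tune, which is the tuning burden the abstract refers to.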
@inproceedings{conf/icml/LeNCLPN11,
author = {Le, Quoc V. and Ngiam, Jiquan and Coates, Adam and Lahiri, Abhik and Prochnow, Bobby and Ng, Andrew Y.},
booktitle = {ICML},
editor = {Getoor, Lise and Scheffer, Tobias},
pages = {265--272},
publisher = {Omnipress},
title = {On optimization methods for deep learning},
url = {http://dblp.uni-trier.de/db/conf/icml/icml2011.html#LeNCLPN11},
year = 2011
}