Exact solutions to the nonlinear dynamics of learning in deep linear neural networks
Andrew M. Saxe, James L. McClelland, Surya Ganguli (2013). arXiv:1312.6120.

Despite the widespread practical success of deep learning methods, our
theoretical understanding of the dynamics of learning in deep neural networks
remains quite sparse. We attempt to bridge the gap between the theory and
practice of deep learning by systematically analyzing learning dynamics for the
restricted case of deep linear neural networks. Despite the linearity of their
input-output map, such networks have nonlinear gradient descent dynamics on
weights that change with the addition of each new hidden layer. We show that
deep linear networks exhibit nonlinear learning phenomena similar to those seen
in simulations of nonlinear networks, including long plateaus followed by rapid
transitions to lower error solutions, and faster convergence from greedy
unsupervised pretraining initial conditions than from random initial
conditions. We provide an analytical description of these phenomena by finding
new exact solutions to the nonlinear dynamics of deep learning. Our theoretical
analysis also reveals the surprising finding that as the depth of a network
approaches infinity, learning speed can remain finite: for a special class of
initial conditions on the weights, very deep networks incur only a finite delay
in learning time relative to shallow networks. We further show that, under
certain conditions on the training data, unsupervised pretraining can find this
special class of initial conditions, thereby providing analytical insight into
the success of unsupervised pretraining in deep supervised learning tasks.
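
The simplest setting behind these results is the one-hidden-layer linear network y = W^{32} W^{21} x trained by gradient descent on squared error. In the paper's notation, with input correlations \Sigma^{11} and input-output correlations \Sigma^{31}, the continuous-time weight dynamics form the coupled, nonlinear flow

\[
\tau \frac{d}{dt} W^{21} = (W^{32})^\top \left( \Sigma^{31} - W^{32} W^{21} \Sigma^{11} \right),
\qquad
\tau \frac{d}{dt} W^{32} = \left( \Sigma^{31} - W^{32} W^{21} \Sigma^{11} \right) (W^{21})^\top,
\]

where \tau is the inverse learning rate. For whitened inputs (\Sigma^{11} = I) and initial conditions decoupled across the singular modes of \Sigma^{31}, each mode's effective strength u(t) evolves independently as

\[
\tau \frac{du}{dt} = 2u(s - u),
\qquad
u(t) = \frac{s\, e^{2st/\tau}}{e^{2st/\tau} - 1 + s/u_0},
\]

a sigmoid that lingers near zero while u_0 is small and then rises rapidly to the mode's target strength s, which is the source of the plateau-and-transition shape described above.

The staged dynamics are easy to reproduce numerically. Below is a minimal sketch (my own illustration, not the authors' code; the layer count, singular values, learning rate, and step count are arbitrary choices made for the demonstration):

import numpy as np

# A minimal sketch: train a deep linear network y_hat = W[depth-1]...W[0] x
# by full-batch gradient descent on squared error. With small random initial
# weights, the loss falls in stages: each singular value ("mode") of the
# input-output correlation is learned on its own timescale (roughly ~1/s, up
# to log factors in the initialization scale), so weaker modes sit on long
# plateaus before a rapid transition.

rng = np.random.default_rng(0)
n, depth, steps, lr = 8, 3, 20000, 0.05

# Whitened inputs (Sigma11 = I) and a teacher map with three well-separated
# singular values, so Sigma31 has modes of strength 3, 1, and 0.3.
X = np.eye(n)
U, _ = np.linalg.qr(rng.standard_normal((n, n)))
V, _ = np.linalg.qr(rng.standard_normal((n, n)))
S = np.diag([3.0, 1.0, 0.3] + [0.0] * (n - 3))
Y = U @ S @ V.T @ X

# Small random initial weights: the regime where plateaus are pronounced.
Ws = [0.01 * rng.standard_normal((n, n)) for _ in range(depth)]

for t in range(steps):
    # Forward pass, keeping each layer's activations for backpropagation.
    acts = [X]
    for W in Ws:
        acts.append(W @ acts[-1])
    err = acts[-1] - Y
    if t % 1000 == 0:
        print(t, 0.5 * np.sum(err ** 2))
    # Backward pass: gradient of 0.5 * ||W[depth-1]...W[0] X - Y||^2 with
    # respect to each layer, propagating through pre-update weights.
    grad = err
    for i in reversed(range(depth)):
        gW = grad @ acts[i].T        # dL/dW[i]
        grad = Ws[i].T @ grad        # dL/d(acts[i])
        Ws[i] -= lr * gW

For small initializations, the printed loss typically stays nearly flat for long stretches and then drops sharply as each successive mode (here of strengths 3, 1, and 0.3) is learned on its own timescale.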
@misc{saxe2013exact,
author = {Saxe, Andrew M. and McClelland, James L. and Ganguli, Surya},
keywords = {deep learning},
eprint = {1312.6120},
archiveprefix = {arXiv},
title = {Exact solutions to the nonlinear dynamics of learning in deep linear neural networks},
url = {http://arxiv.org/abs/1312.6120},
year = 2013
}