We analyze the dynamics of training deep ReLU networks and their implications
for generalization. Using a teacher-student setting, we discover a novel
relationship between the gradient received by hidden student nodes and the
activations of teacher nodes in deep ReLU networks. With this relationship and
the assumption that teacher node activations overlap only slightly, we prove
that (1) student nodes whose weights are initialized close to teacher nodes
converge to them at a faster rate, and (2) in the over-parameterized, 2-layer
case, while a small set of lucky nodes does converge to the teacher nodes, the
fan-out weights of the remaining nodes converge to zero. This framework
provides insight into several puzzling phenomena in deep learning, such as
over-parameterization, implicit regularization, and lottery tickets. We verify
our assumption by showing that the majority of BatchNorm biases in pre-trained
VGG11/16 models are negative. Experiments on (1) random deep teacher networks
with Gaussian inputs, (2) a teacher network pre-trained on CIFAR-10, and (3)
extensive ablation studies validate our theoretical predictions.
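
To make the setting concrete, here is a minimal PyTorch sketch of the
teacher-student setup described above (not the authors' code; the widths,
learning rate, and step count are illustrative assumptions). A fixed, randomly
initialized 2-layer ReLU teacher labels Gaussian inputs, an over-parameterized
2-layer ReLU student is trained to match it, and the student's fan-out weight
magnitudes are then inspected; prediction (2) says a few "lucky" hidden nodes
should keep large fan-out weights while the rest shrink toward zero.

    import torch
    import torch.nn as nn

    torch.manual_seed(0)
    D_IN, H_TEACHER, H_STUDENT = 20, 10, 50  # student is 5x over-parameterized

    def two_layer_relu(d_in, hidden, d_out=1):
        # The 2-layer ReLU architecture of the paper's simplest analyzed case.
        return nn.Sequential(nn.Linear(d_in, hidden), nn.ReLU(),
                             nn.Linear(hidden, d_out))

    teacher = two_layer_relu(D_IN, H_TEACHER)
    student = two_layer_relu(D_IN, H_STUDENT)
    for p in teacher.parameters():
        p.requires_grad_(False)  # the teacher stays fixed; only the student learns

    opt = torch.optim.SGD(student.parameters(), lr=1e-2)
    for step in range(10000):
        x = torch.randn(128, D_IN)  # Gaussian inputs, as in experiment (1)
        loss = ((student(x) - teacher(x)) ** 2).mean()
        opt.zero_grad()
        loss.backward()
        opt.step()

    # Per-hidden-node fan-out magnitude: expect a few large values ("lucky"
    # nodes) and many near-zero values under prediction (2).
    fan_out = student[2].weight.detach().abs().mean(dim=0)
    print(sorted(fan_out.tolist(), reverse=True))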
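The abstract's empirical check of the small-overlap assumption, that most
BatchNorm biases in pre-trained VGG11/16 models are negative, can be
reproduced in spirit with torchvision's ImageNet-pretrained BN variants (an
assumption: the exact checkpoints the authors inspected may differ). A sketch:

    import torch
    from torchvision import models

    for name, ctor in [("vgg11_bn", models.vgg11_bn), ("vgg16_bn", models.vgg16_bn)]:
        # torchvision >= 0.13; older versions use pretrained=True instead
        model = ctor(weights="IMAGENET1K_V1")
        biases = torch.cat([m.bias.detach().flatten()
                            for m in model.modules()
                            if isinstance(m, torch.nn.BatchNorm2d)])
        frac = (biases < 0).float().mean().item()
        print(f"{name}: {frac:.1%} of {biases.numel()} BatchNorm biases are negative")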
@article{tian2019matters,
author = {Tian, Yuandong and Jiang, Tina and Gong, Qucheng and Morcos, Ari},
journal = {arXiv preprint arXiv:1905.13405},
title = {Luck Matters: Understanding Training Dynamics of Deep ReLU Networks},
url = {http://arxiv.org/abs/1905.13405},
year = 2019
}