The noise in stochastic gradient descent (SGD) provides a crucial implicit
regularization effect for training overparameterized models. Prior theoretical
work largely focuses on spherical Gaussian noise, whereas empirical studies
demonstrate that parameter-dependent noise -- induced by mini-batches or label
perturbation -- is far more effective than Gaussian noise. This paper
theoretically characterizes this phenomenon on a quadratically-parameterized
model introduced by Vaskevicius et al. and Woodworth et al. We show that in an
overparameterized setting, SGD with label noise recovers the sparse ground
truth from an arbitrary initialization, whereas SGD with Gaussian noise or
gradient descent overfits to dense solutions with large norms. Our analysis
reveals that parameter-dependent noise introduces a bias towards local minima
with smaller noise variance, whereas spherical Gaussian noise does not. Code
for our project is publicly available.
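
To make the two noise models concrete: on the quadratically-parameterized
model f_theta(x) = <theta * theta, x> (with * elementwise), the per-example
loss is (f_theta(x_i) - y_i)^2, so perturbing the label y_i by epsilon turns
the gradient at an interpolating solution into -2 epsilon grad f_theta(x_i),
whose covariance depends on theta; spherical Gaussian noise, by contrast, is
identical at every point in parameter space. The sketch below is not the
authors' code; the dimensions, step size, noise scale, and initialization are
illustrative assumptions, chosen only to make the two update rules explicit.

import numpy as np

# A minimal sketch, assuming the setup above: quadratic parameterization
# f_theta(x) = <theta * theta, x>, a k-sparse ground truth, and fewer
# samples than dimensions (over-parameterized). Hyperparameters untuned.
rng = np.random.default_rng(0)
d, n, k = 50, 25, 3
theta_star = np.zeros(d)
theta_star[:k] = 1.0                     # sparse ground truth
X = rng.standard_normal((n, d))
y = X @ theta_star**2                    # noiseless labels from the model

def run_sgd(noise, lr=1e-3, sigma=0.5, steps=50_000):
    theta = np.full(d, 0.5)              # deliberately not a tiny initialization
    for _ in range(steps):
        i = rng.integers(n)
        # Label noise perturbs the target, so the induced gradient noise
        # scales with grad f_theta(x_i) and hence with theta itself.
        y_i = y[i] + (sigma * rng.standard_normal() if noise == "label" else 0.0)
        resid = X[i] @ theta**2 - y_i
        theta = theta - lr * 2.0 * resid * X[i] * 2.0 * theta  # chain rule
        if noise == "gaussian":
            # Spherical Gaussian noise: same covariance at every theta.
            theta = theta + lr * sigma * rng.standard_normal(d)
    return theta

for noise in ("label", "gaussian"):
    theta = run_sgd(noise)
    print(noise, "recovery error:", np.linalg.norm(theta**2 - theta_star**2))

Note that under label noise the injected gradient perturbation on coordinate j
is proportional to theta_j, so it vanishes on coordinates that are already
zero; roughly speaking, this is the mechanism behind the bias towards sparse
minima with smaller noise variance described above.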
@article{haochen2020shape,
abstract = {The noise in stochastic gradient descent (SGD) provides a crucial implicit
regularization effect for training overparameterized models. Prior theoretical
work largely focuses on spherical Gaussian noise, whereas empirical studies
demonstrate the phenomenon that parameter-dependent noise -- induced by
mini-batches or label perturbation -- is far more effective than Gaussian
noise. This paper theoretically characterizes this phenomenon on a
quadratically-parameterized model introduced by Vaskevicius et al. and
Woodworth et al. We show that in an over-parameterized setting, SGD with label
noise recovers the sparse ground-truth with an arbitrary initialization,
whereas SGD with Gaussian noise or gradient descent overfits to dense solutions
with large norms. Our analysis reveals that parameter-dependent noise
introduces a bias towards local minima with smaller noise variance, whereas
spherical Gaussian noise does not. Code for our project is publicly available.},
author = {HaoChen, Jeff Z. and Wei, Colin and Lee, Jason D. and Ma, Tengyu},
biburl = {https://www.bibsonomy.org/bibtex/2f3610e84742d24a4ceb8595679233d7a/kirk86},
description = {[2006.08680] Shape Matters: Understanding the Implicit Bias of the Noise Covariance},
keywords = {analysis bias noise optimization readings},
note = {cite arxiv:2006.08680},
title = {Shape Matters: Understanding the Implicit Bias of the Noise Covariance},
url = {http://arxiv.org/abs/2006.08680},
year = 2020
}