The classical bias-variance trade-off predicts that bias decreases and
variance increase with model complexity, leading to a U-shaped risk curve.
Recent work calls this into question for neural networks and other
over-parameterized models, for which it is often observed that larger models
generalize better. We provide a simple explanation for this by measuring the
bias and variance of neural networks: while the bias is monotonically
decreasing as in the classical theory, the variance is unimodal or bell-shaped:
it increases then decreases with the width of the network. We vary the network
architecture, loss function, and choice of dataset and confirm that variance
unimodality occurs robustly for all models we considered. The risk curve is the
sum of the bias and variance curves and displays different qualitative shapes
depending on the relative scale of bias and variance, with the double descent
curve observed in recent literature as a special case. We corroborate these
empirical results with a theoretical analysis of two-layer linear networks with
random first layer. Finally, evaluation on out-of-distribution data shows that
most of the drop in accuracy comes from increased bias while variance increases
by a relatively small amount. Moreover, we find that deeper models decrease
bias and increase variance for both in-distribution and out-of-distribution
data.
Description
[2002.11328] Rethinking Bias-Variance Trade-off for Generalization of Neural Networks
%0 Journal Article
%1 yang2020rethinking
%A Yang, Zitong
%A Yu, Yaodong
%A You, Chong
%A Steinhardt, Jacob
%A Ma, Yi
%D 2020
%K analysis bias generalization readings variance
%T Rethinking Bias-Variance Trade-off for Generalization of Neural Networks
%U http://arxiv.org/abs/2002.11328
%X The classical bias-variance trade-off predicts that bias decreases and
variance increase with model complexity, leading to a U-shaped risk curve.
Recent work calls this into question for neural networks and other
over-parameterized models, for which it is often observed that larger models
generalize better. We provide a simple explanation for this by measuring the
bias and variance of neural networks: while the bias is monotonically
decreasing as in the classical theory, the variance is unimodal or bell-shaped:
it increases then decreases with the width of the network. We vary the network
architecture, loss function, and choice of dataset and confirm that variance
unimodality occurs robustly for all models we considered. The risk curve is the
sum of the bias and variance curves and displays different qualitative shapes
depending on the relative scale of bias and variance, with the double descent
curve observed in recent literature as a special case. We corroborate these
empirical results with a theoretical analysis of two-layer linear networks with
random first layer. Finally, evaluation on out-of-distribution data shows that
most of the drop in accuracy comes from increased bias while variance increases
by a relatively small amount. Moreover, we find that deeper models decrease
bias and increase variance for both in-distribution and out-of-distribution
data.
@article{yang2020rethinking,
  abstract      = {The classical bias-variance trade-off predicts that bias decreases and
variance increase with model complexity, leading to a U-shaped risk curve.
Recent work calls this into question for neural networks and other
over-parameterized models, for which it is often observed that larger models
generalize better. We provide a simple explanation for this by measuring the
bias and variance of neural networks: while the bias is monotonically
decreasing as in the classical theory, the variance is unimodal or bell-shaped:
it increases then decreases with the width of the network. We vary the network
architecture, loss function, and choice of dataset and confirm that variance
unimodality occurs robustly for all models we considered. The risk curve is the
sum of the bias and variance curves and displays different qualitative shapes
depending on the relative scale of bias and variance, with the double descent
curve observed in recent literature as a special case. We corroborate these
empirical results with a theoretical analysis of two-layer linear networks with
random first layer. Finally, evaluation on out-of-distribution data shows that
most of the drop in accuracy comes from increased bias while variance increases
by a relatively small amount. Moreover, we find that deeper models decrease
bias and increase variance for both in-distribution and out-of-distribution
data.},
  added-at      = {2020-07-16T12:44:32.000+0200},
  archiveprefix = {arXiv},
  author        = {Yang, Zitong and Yu, Yaodong and You, Chong and Steinhardt, Jacob and Ma, Yi},
  biburl        = {https://www.bibsonomy.org/bibtex/2d8e88c1a0ca9614b9d03a6445f7268fa/kirk86},
  description   = {[2002.11328] Rethinking Bias-Variance Trade-off for Generalization of Neural Networks},
  doi           = {10.48550/arXiv.2002.11328},
  eprint        = {2002.11328},
  interhash     = {84280a2e4c558612d0bbd98146d75651},
  intrahash     = {d8e88c1a0ca9614b9d03a6445f7268fa},
  keywords      = {analysis bias generalization readings variance},
  note          = {cite arxiv:2002.11328},
  primaryclass  = {cs.LG},
  timestamp     = {2020-07-16T12:44:32.000+0200},
  title         = {Rethinking Bias-Variance Trade-off for Generalization of Neural Networks},
  url           = {https://arxiv.org/abs/2002.11328},
  year          = {2020}
}