We analyze the growth of dataset sizes used in machine learning for natural
language processing and computer vision, and extrapolate these using two
methods: using the historical growth rate and estimating the compute-optimal
dataset size for future predicted compute budgets. We investigate the growth in
data usage by estimating the total stock of unlabeled data available on the
internet over the coming decades. Our analysis indicates that the stock of
high-quality language data will be exhausted soon, likely before 2026. By
contrast, the stock of low-quality language data and image data will be
exhausted only much later: between 2030 and 2050 (for low-quality language) and
between 2030 and 2060 (for images). Our work suggests that the current trend of
ever-growing ML models that rely on enormous datasets might slow down if data
efficiency is not drastically improved or new sources of data become available.
Description
[2211.04325] Will we run out of data? An analysis of the limits of scaling datasets in Machine Learning
%0 Generic
%1 villalobos2022analysis
%A Villalobos, Pablo
%A Sevilla, Jaime
%A Heim, Lennart
%A Besiroglu, Tamay
%A Hobbhahn, Marius
%A Ho, Anson
%D 2022
%K data dataset learning machine ml training
%T Will we run out of data? An analysis of the limits of scaling datasets in Machine Learning
%U http://arxiv.org/abs/2211.04325
%X We analyze the growth of dataset sizes used in machine learning for natural
language processing and computer vision, and extrapolate these using two
methods; using the historical growth rate and estimating the compute-optimal
dataset size for future predicted compute budgets. We investigate the growth in
data usage by estimating the total stock of unlabeled data available on the
internet over the coming decades. Our analysis indicates that the stock of
high-quality language data will be exhausted soon; likely before 2026. By
contrast, the stock of low-quality language data and image data will be
exhausted only much later; between 2030 and 2050 (for low-quality language) and
between 2030 and 2060 (for images). Our work suggests that the current trend of
ever-growing ML models that rely on enormous datasets might slow down if data
efficiency is not drastically improved or new sources of data become available.
@misc{villalobos2022analysis,
  abstract      = {We analyze the growth of dataset sizes used in machine learning for natural
language processing and computer vision, and extrapolate these using two
methods; using the historical growth rate and estimating the compute-optimal
dataset size for future predicted compute budgets. We investigate the growth in
data usage by estimating the total stock of unlabeled data available on the
internet over the coming decades. Our analysis indicates that the stock of
high-quality language data will be exhausted soon; likely before 2026. By
contrast, the stock of low-quality language data and image data will be
exhausted only much later; between 2030 and 2050 (for low-quality language) and
between 2030 and 2060 (for images). Our work suggests that the current trend of
ever-growing ML models that rely on enormous datasets might slow down if data
efficiency is not drastically improved or new sources of data become available.},
  added-at      = {2023-03-10T10:29:32.000+0100},
  author        = {Villalobos, Pablo and Sevilla, Jaime and Heim, Lennart and Besiroglu, Tamay and Hobbhahn, Marius and Ho, Anson},
  biburl        = {https://www.bibsonomy.org/bibtex/2aceaaab707e3056b3f634639b480d85d/jaeschke},
  description   = {[2211.04325] Will we run out of data? An analysis of the limits of scaling datasets in Machine Learning},
  archiveprefix = {arXiv},
  eprint        = {2211.04325},
  primaryclass  = {cs.LG},
  doi           = {10.48550/arXiv.2211.04325},
  interhash     = {2711e43eee9aebb3a1935653886fe453},
  intrahash     = {aceaaab707e3056b3f634639b480d85d},
  keywords      = {data dataset learning machine ml training},
  timestamp     = {2023-03-10T10:29:32.000+0100},
  title         = {Will we run out of data? An analysis of the limits of scaling datasets in {Machine Learning}},
  url           = {https://arxiv.org/abs/2211.04325},
  year          = {2022}
}