Large labeled training sets are the critical building blocks of supervised
learning methods and are key enablers of deep learning techniques. For some
applications, creating labeled training sets is the most time-consuming and
expensive part of applying machine learning. We therefore propose a paradigm
for the programmatic creation of training sets called data programming in which
users express weak supervision strategies or domain heuristics as labeling
functions, which are programs that label subsets of the data, but that are
noisy and may conflict. We show that by explicitly representing this training
set labeling process as a generative model, we can "denoise" the generated
training set, and establish theoretically that we can recover the parameters of
these generative models in a handful of settings. We then show how to modify a
discriminative loss function to make it noise-aware, and demonstrate our method
over a range of discriminative models including logistic regression and LSTMs.
Experimentally, on the 2014 TAC-KBP Slot Filling challenge, we show that data
programming would have led to a new winning score, and also show that applying
data programming to an LSTM model leads to a TAC-KBP score almost 6 F1 points
over a state-of-the-art LSTM baseline (and into second place in the
competition). Additionally, in initial user studies we observed that data
programming may be an easier way for non-experts to create machine learning
models when training data is limited or unavailable.
Description
[1605.07723] Data Programming: Creating Large Training Sets, Quickly
%0 Generic
%1 ratner2016programming
%A Ratner, Alexander
%A De Sa, Christopher
%A Wu, Sen
%A Selsam, Daniel
%A Ré, Christopher
%D 2016
%K 2016 arxiv data dataset paper
%T Data Programming: Creating Large Training Sets, Quickly
%U http://arxiv.org/abs/1605.07723
%X Large labeled training sets are the critical building blocks of supervised
learning methods and are key enablers of deep learning techniques. For some
applications, creating labeled training sets is the most time-consuming and
expensive part of applying machine learning. We therefore propose a paradigm
for the programmatic creation of training sets called data programming in which
users express weak supervision strategies or domain heuristics as labeling
functions, which are programs that label subsets of the data, but that are
noisy and may conflict. We show that by explicitly representing this training
set labeling process as a generative model, we can "denoise" the generated
training set, and establish theoretically that we can recover the parameters of
these generative models in a handful of settings. We then show how to modify a
discriminative loss function to make it noise-aware, and demonstrate our method
over a range of discriminative models including logistic regression and LSTMs.
Experimentally, on the 2014 TAC-KBP Slot Filling challenge, we show that data
programming would have led to a new winning score, and also show that applying
data programming to an LSTM model leads to a TAC-KBP score almost 6 F1 points
over a state-of-the-art LSTM baseline (and into second place in the
competition). Additionally, in initial user studies we observed that data
programming may be an easier way for non-experts to create machine learning
models when training data is limited or unavailable.
% arXiv preprint record (BibSonomy export). The arXiv identifier is carried in
% the eprint/archiveprefix fields instead of a free-text note, and the accented
% surname uses a LaTeX escape so that classic 8-bit BibTeX handles it correctly.
@misc{ratner2016programming,
  author        = {Ratner, Alexander and De Sa, Christopher and Wu, Sen and Selsam, Daniel and R{\'e}, Christopher},
  title         = {Data Programming: Creating Large Training Sets, Quickly},
  year          = {2016},
  eprint        = {1605.07723},
  archiveprefix = {arXiv},
  url           = {http://arxiv.org/abs/1605.07723},
  abstract      = {Large labeled training sets are the critical building blocks of supervised
learning methods and are key enablers of deep learning techniques. For some
applications, creating labeled training sets is the most time-consuming and
expensive part of applying machine learning. We therefore propose a paradigm
for the programmatic creation of training sets called data programming in which
users express weak supervision strategies or domain heuristics as labeling
functions, which are programs that label subsets of the data, but that are
noisy and may conflict. We show that by explicitly representing this training
set labeling process as a generative model, we can "denoise" the generated
training set, and establish theoretically that we can recover the parameters of
these generative models in a handful of settings. We then show how to modify a
discriminative loss function to make it noise-aware, and demonstrate our method
over a range of discriminative models including logistic regression and LSTMs.
Experimentally, on the 2014 TAC-KBP Slot Filling challenge, we show that data
programming would have led to a new winning score, and also show that applying
data programming to an LSTM model leads to a TAC-KBP score almost 6 F1 points
over a state-of-the-art LSTM baseline (and into second place in the
competition). Additionally, in initial user studies we observed that data
programming may be an easier way for non-experts to create machine learning
models when training data is limited or unavailable.},
  keywords      = {2016 arxiv data dataset paper},
  description   = {[1605.07723] Data Programming: Creating Large Training Sets, Quickly},
  added-at      = {2018-09-19T17:31:54.000+0200},
  timestamp     = {2018-09-19T17:31:54.000+0200},
  biburl        = {https://www.bibsonomy.org/bibtex/2fc3913524049786a7414f77134b5dff5/analyst},
  interhash     = {57950bb971c095361d1baafb06bd543a},
  intrahash     = {fc3913524049786a7414f77134b5dff5},
}