Recent empirical work has successfully used unlabeled data to learn feature
representations that are broadly useful in downstream classification tasks.
Several of these methods are reminiscent of the well-known word2vec embedding
algorithm: leveraging the availability of pairs of semantically "similar" data
points and "negative samples," the learner forces the inner product of the
representations of a similar pair to be higher on average than the inner
product with negative samples. The current paper uses the term contrastive
learning for such algorithms and presents a theoretical framework for
analyzing them by introducing latent classes and hypothesizing that
semantically similar points are sampled from the same latent class. This
framework allows us to show provable guarantees on the performance of the
learned representations on the average classification task composed of a
subset of the same latent classes. Our generalization bound also shows that
learned representations can reduce (labeled) sample complexity on downstream
tasks. We conduct controlled experiments in both the text and image domains to
support the theory.
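
For concreteness, here is a minimal NumPy sketch of the kind of objective the
abstract describes, assuming the logistic variant of the loss with a single
negative sample per anchor; the function and variable names are illustrative,
not taken from the paper.

import numpy as np

def contrastive_logistic_loss(f_x, f_pos, f_neg):
    """Logistic contrastive loss over a batch of (anchor, similar, negative)
    triplets.

    f_x, f_pos, f_neg: arrays of shape (batch, dim) holding the learned
    representations of an anchor x, a semantically similar point x+, and a
    negative sample x-. The loss shrinks exactly when the inner product
    <f(x), f(x+)> exceeds <f(x), f(x-)> on average.
    """
    pos_sim = np.sum(f_x * f_pos, axis=1)  # <f(x), f(x+)> per example
    neg_sim = np.sum(f_x * f_neg, axis=1)  # <f(x), f(x-)> per example
    # log(1 + exp(neg - pos)) penalizes negatives scoring above positives
    return np.mean(np.logaddexp(0.0, neg_sim - pos_sim))

Under the paper's latent-class hypothesis, x and x+ would be two independent
draws from the same latent class, while x- is drawn without reference to the
anchor's class.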
@article{arora2019theoretical,
  author = {Arora, Sanjeev and Khandeparkar, Hrishikesh and Khodak, Mikhail and Plevrakis, Orestis and Saunshi, Nikunj},
  keywords = {theory},
  note = {arXiv:1902.09229; 19 pages, 5 figures},
  title = {A Theoretical Analysis of Contrastive Unsupervised Representation Learning},
  url = {http://arxiv.org/abs/1902.09229},
  year = 2019
}