We propose an approach to self-supervised representation learning based on
maximizing mutual information between features extracted from multiple views of
a shared context. For example, one could produce multiple views of a local
spatio-temporal context by observing it from different locations (e.g., camera
positions within a scene), and via different modalities (e.g., tactile,
auditory, or visual). Or, an ImageNet image could provide a context from which
one produces multiple views by repeatedly applying data augmentation.
Maximizing mutual information between features extracted from these views
requires capturing information about high-level factors whose influence spans
multiple views -- e.g., presence of certain objects or occurrence of certain
events.
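
To make the idea concrete, below is a minimal PyTorch sketch (not the authors' released AMDIM code) of the usual way this objective is realized in practice: features from two views of the same image are scored against each other, and an InfoNCE-style contrastive loss serves as a lower bound on their mutual information. The encoder, augmentation pipeline, and temperature value are illustrative assumptions, not details taken from the paper.

import torch
import torch.nn.functional as F

def info_nce_loss(feats_a, feats_b, temperature=0.1):
    """InfoNCE loss between paired features from two views.

    feats_a, feats_b: (batch, dim) features from view 1 and view 2,
    where row i of each tensor comes from the same underlying image.
    """
    feats_a = F.normalize(feats_a, dim=1)
    feats_b = F.normalize(feats_b, dim=1)
    # Similarity of every view-1 feature with every view-2 feature.
    logits = feats_a @ feats_b.t() / temperature        # (batch, batch)
    # Matching pairs (the diagonal) are positives; the rest are negatives.
    targets = torch.arange(feats_a.size(0), device=feats_a.device)
    return F.cross_entropy(logits, targets)

# Hypothetical usage: encode two independently augmented views of a batch.
# view_1, view_2 = augment(batch), augment(batch)   # augmentation pipeline assumed
# loss = info_nce_loss(encoder(view_1), encoder(view_2))
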
Following our proposed approach, we develop a model which learns image
representations that significantly outperform prior methods on the tasks we
consider. Most notably, using self-supervised learning, our model learns
representations which achieve 68.1% accuracy on ImageNet using standard linear
evaluation. This beats prior results by over 12% and concurrent results by 7%.
When we extend our model to use mixture-based representations, segmentation
behaviour emerges as a natural side-effect. Our code is available online:
https://github.com/Philip-Bachman/amdim-public.
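
For reference, here is a minimal sketch of the standard linear-evaluation protocol mentioned above, under the usual assumptions (frozen self-supervised encoder, a single linear classifier trained with cross-entropy on its features). This is an assumed setup for illustration, not the paper's exact evaluation pipeline.

import torch
import torch.nn as nn

def linear_evaluation_step(encoder, classifier, optimizer, images, labels):
    """One training step of a linear probe on top of frozen features."""
    encoder.eval()
    with torch.no_grad():                  # encoder weights stay fixed
        feats = encoder(images)            # (batch, dim) features
    logits = classifier(feats)             # classifier = nn.Linear(dim, num_classes)
    loss = nn.functional.cross_entropy(logits, labels)
    optimizer.zero_grad()
    loss.backward()                        # gradients flow only into the classifier
    optimizer.step()
    return loss.item()
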
@misc{bachman2019learning,
author = {Bachman, Philip and Hjelm, R Devon and Buchwalter, William},
keywords = {representation_learning},
note = {cite arxiv:1906.00910},
title = {Learning Representations by Maximizing Mutual Information Across Views},
url = {http://arxiv.org/abs/1906.00910},
year = 2019
}