We define the relevant information in a signal $x\in X$ as being the
information that this signal provides about another signal $y\in \Y$. Examples
include the information that face images provide about the names of the people
portrayed, or the information that speech sounds provide about the words
spoken. Understanding the signal $x$ requires more than just predicting $y$, it
also requires specifying which features of $\X$ play a role in the prediction.
We formalize this problem as that of finding a short code for $\X$ that
preserves the maximum information about $\Y$. That is, we squeeze the
information that $\X$ provides about $\Y$ through a `bottleneck' formed by a
limited set of codewords $\tX$. This constrained optimization problem can be
seen as a generalization of rate distortion theory in which the distortion
measure $d(x,\x)$ emerges from the joint statistics of $\X$ and $\Y$. This
approach yields an exact set of self consistent equations for the coding rules
$X \to \tX$ and $\tX \to \Y$. Solutions to these equations can be found by a
convergent re-estimation method that generalizes the Blahut-Arimoto algorithm.
Our variational principle provides a surprisingly rich framework for discussing
a variety of problems in signal processing and learning, as will be described
in detail elsewhere.
Description
[physics/0004057v1] The information bottleneck method
%0 Generic
%1 tishby2000information
%A Tishby, Naftali
%A Pereira, Fernando C.
%A Bialek, William
%D 2000
%K SHOULDREAD fulltext information-bottleneck machine-learning
%T The information bottleneck method
%U http://arxiv.org/abs/physics/0004057
%X We define the relevant information in a signal $x\in X$ as being the
information that this signal provides about another signal $y\in \Y$. Examples
include the information that face images provide about the names of the people
portrayed, or the information that speech sounds provide about the words
spoken. Understanding the signal $x$ requires more than just predicting $y$, it
also requires specifying which features of $\X$ play a role in the prediction.
We formalize this problem as that of finding a short code for $\X$ that
preserves the maximum information about $\Y$. That is, we squeeze the
information that $\X$ provides about $\Y$ through a `bottleneck' formed by a
limited set of codewords $\tX$. This constrained optimization problem can be
seen as a generalization of rate distortion theory in which the distortion
measure $d(x,\x)$ emerges from the joint statistics of $\X$ and $\Y$. This
approach yields an exact set of self consistent equations for the coding rules
$X \to \tX$ and $\tX \to \Y$. Solutions to these equations can be found by a
convergent re-estimation method that generalizes the Blahut-Arimoto algorithm.
Our variational principle provides a surprisingly rich framework for discussing
a variety of problems in signal processing and learning, as will be described
in detail elsewhere.
@misc{tishby2000information,
  abstract      = {We define the relevant information in a signal $x\in X$ as being the
information that this signal provides about another signal $y\in \Y$. Examples
include the information that face images provide about the names of the people
portrayed, or the information that speech sounds provide about the words
spoken. Understanding the signal $x$ requires more than just predicting $y$, it
also requires specifying which features of $\X$ play a role in the prediction.
We formalize this problem as that of finding a short code for $\X$ that
preserves the maximum information about $\Y$. That is, we squeeze the
information that $\X$ provides about $\Y$ through a `bottleneck' formed by a
limited set of codewords $\tX$. This constrained optimization problem can be
seen as a generalization of rate distortion theory in which the distortion
measure $d(x,\x)$ emerges from the joint statistics of $\X$ and $\Y$. This
approach yields an exact set of self consistent equations for the coding rules
$X \to \tX$ and $\tX \to \Y$. Solutions to these equations can be found by a
convergent re-estimation method that generalizes the Blahut-Arimoto algorithm.
Our variational principle provides a surprisingly rich framework for discussing
a variety of problems in signal processing and learning, as will be described
in detail elsewhere.},
  added-at      = {2017-10-08T20:37:13.000+0200},
  author        = {Tishby, Naftali and Pereira, Fernando C. and Bialek, William},
  biburl        = {https://www.bibsonomy.org/bibtex/205458d11eac1270aa0b3648e2d4a0ed9/marcsaric},
  description   = {[physics/0004057v1] The information bottleneck method},
  eprint        = {physics/0004057},
  archiveprefix = {arXiv},
  primaryclass  = {physics.data-an},
  interhash     = {c961d043b8ecf5a5d46a9561ff771aec},
  intrahash     = {05458d11eac1270aa0b3648e2d4a0ed9},
  keywords      = {SHOULDREAD fulltext information-bottleneck machine-learning},
  note          = {cite arxiv:physics/0004057},
  timestamp     = {2017-10-08T20:37:13.000+0200},
  title         = {The information bottleneck method},
  url           = {http://arxiv.org/abs/physics/0004057},
  year          = {2000},
}