I. Dhillon, S. Mallela, и D. Modha. Proceedings of the ninth ACM SIGKDD international conference on Knowledge discovery and data mining, стр. 89--98. New York, NY, USA, ACM, (2003)
DOI: 10.1145/956750.956764
Аннотация
Two-dimensional contingency or co-occurrence tables arise frequently in important applications such as text, web-log and market-basket data analysis. A basic problem in contingency table analysis is co-clustering: simultaneous clustering of the rows and columns. A novel theoretical formulation views the contingency table as an empirical joint probability distribution of two discrete random variables and poses the co-clustering problem as an optimization problem in information theory — the optimal co-clustering maximizes the mutual information between the clustered random variables subject to constraints on the number of row and column clusters. We present an innovative co-clustering algorithm that monotonically increases the preserved mutual information by intertwining both the row and column clusterings at all stages. Using the practical example of simultaneous word-document clustering, we demonstrate that our algorithm works well in practice, especially in the presence of sparsity and high-dimensionality.
%0 Conference Paper
%1 Dhillon:2003:IC:956750.956764
%A Dhillon, Inderjit S.
%A Mallela, Subramanyam
%A Modha, Dharmendra S.
%B Proceedings of the ninth ACM SIGKDD international conference on Knowledge discovery and data mining
%C New York, NY, USA
%D 2003
%I ACM
%K co-clustering information main thema thema:co-clustering theoretic
%P 89--98
%R 10.1145/956750.956764
%T Information-theoretic co-clustering
%U http://doi.acm.org/10.1145/956750.956764
%X Two-dimensional contingency or co-occurrence tables arise frequently in important applications such as text, web-log and market-basket data analysis. A basic problem in contingency table analysis is co-clustering: simultaneous clustering of the rows and columns. A novel theoretical formulation views the contingency table as an empirical joint probability distribution of two discrete random variables and poses the co-clustering problem as an optimization problem in information theory — the optimal co-clustering maximizes the mutual information between the clustered random variables subject to constraints on the number of row and column clusters. We present an innovative co-clustering algorithm that monotonically increases the preserved mutual information by intertwining both the row and column clusterings at all stages. Using the practical example of simultaneous word-document clustering, we demonstrate that our algorithm works well in practice, especially in the presence of sparsity and high-dimensionality.
%@ 1-58113-737-0
@comment{Conference paper from the KDD '03 proceedings, as recorded on BibSonomy (see biburl). The address field holds the publisher's city; the venue field holds the city where the conference took place. Acronyms in booktitle and series are braced so that styles which recase titles keep them upper-case.}
@inproceedings{Dhillon:2003:IC:956750.956764,
  abstract = {Two-dimensional contingency or co-occurrence tables arise frequently in important applications such as text, web-log and market-basket data analysis. A basic problem in contingency table analysis is \emph{co-clustering: simultaneous clustering} of the rows and columns. A novel theoretical formulation views the contingency table as an empirical joint probability distribution of two discrete random variables and poses the co-clustering problem as an optimization problem in \emph{information theory}---the optimal co-clustering maximizes the mutual information between the clustered random variables subject to constraints on the number of row and column clusters. We present an innovative co-clustering algorithm that monotonically increases the preserved mutual information by intertwining both the row and column clusterings at all stages. Using the practical example of simultaneous word-document clustering, we demonstrate that our algorithm works well in practice, especially in the presence of sparsity and high-dimensionality.},
  acmid = {956764},
  added-at = {2013-04-08T11:03:49.000+0200},
  address = {New York, NY, USA},
  author = {Dhillon, Inderjit S. and Mallela, Subramanyam and Modha, Dharmendra S.},
  biburl = {https://www.bibsonomy.org/bibtex/209da4110fbee857edabb7ecc64f68e02/becker},
  booktitle = {Proceedings of the Ninth {ACM} {SIGKDD} International Conference on Knowledge Discovery and Data Mining},
  description = {Information-theoretic co-clustering},
  doi = {10.1145/956750.956764},
  interhash = {30fe4c22011ee3c5565d35709d9ce1f1},
  intrahash = {09da4110fbee857edabb7ecc64f68e02},
  isbn = {1-58113-737-0},
  keywords = {co-clustering information main thema thema:co-clustering theoretic},
  numpages = {10},
  pages = {89--98},
  publisher = {ACM},
  series = {{KDD} '03},
  timestamp = {2013-04-19T09:34:55.000+0200},
  title = {Information-Theoretic Co-clustering},
  url = {http://doi.acm.org/10.1145/956750.956764},
  venue = {Washington, D.C.},
  year = {2003},
}