% Bibliography of works by Rudi Cilibrasi, Paul Vitanyi and co-authors,
% exported from bibsonomy (several users' records, so the same paper can
% legitimately appear under more than one citation key).
%
% Conventions used in this file:
%   - one field per line; bibliographic fields first, bibsonomy bookkeeping
%     (added-at, biburl, interhash, intrahash) last;
%   - accented letters in names are written as brace-wrapped LaTeX accents so
%     that classic 8-bit BibTeX sorts and labels them correctly;
%   - page ranges use a double hyphen; months use the three-letter macros;
%   - only words that must keep their capitalisation are braced in titles.
%
% Two keys (cilibrasi_vitanyi_2006, vitanyi_et_al_2009) were each exported
% twice by the same user; each pair is merged into a single entry because a
% repeated key is an error for BibTeX.

% ---------------------------------------------------------------- dblp / CoRR

@article{journals/corr/cs-CV-0312044,
  author    = {Cilibrasi, Rudi and Vit{\'a}nyi, Paul M. B.},
  title     = {Clustering by compression},
  journal   = {CoRR},
  volume    = {cs.CV/0312044},
  year      = {2003},
  url       = {http://dblp.uni-trier.de/db/journals/corr/corr0312.html#cs-CV-0312044},
  ee        = {http://arxiv.org/abs/cs.CV/0312044},
  keywords  = {dblp},
  added-at  = {2011-12-05T00:00:00.000+0100},
  biburl    = {http://www.bibsonomy.org/bibtex/226f35d6a29010583da2a68eedaaf690e/dblp},
  interhash = {20d932eb13550f3d2ea91c8d2511bd7c},
  intrahash = {26f35d6a29010583da2a68eedaaf690e}
}

@article{journals/corr/abs-cs-0612043,
  author    = {Cilibrasi, Rudi and Lotker, Zvi and Navarra, Alfredo and P{\'e}rennes, St{\'e}phane and Vit{\'a}nyi, Paul M. B.},
  title     = {About the Lifespan of Peer to Peer Networks},
  journal   = {CoRR},
  volume    = {abs/cs/0612043},
  year      = {2006},
  url       = {http://dblp.uni-trier.de/db/journals/corr/corr0612.html#abs-cs-0612043},
  ee        = {http://arxiv.org/abs/cs/0612043},
  keywords  = {dblp},
  added-at  = {2011-12-05T00:00:00.000+0100},
  biburl    = {http://www.bibsonomy.org/bibtex/24ee6e45b8093dd9b85f4a7a7283c4073/dblp},
  interhash = {7e0c610e3a63b6122c4ed49fcbd16c4b},
  intrahash = {4ee6e45b8093dd9b85f4a7a7283c4073}
}

@article{journals/corr/abs-cs-0412098,
  author    = {Cilibrasi, Rudi and Vit{\'a}nyi, Paul M. B.},
  title     = {The {Google} Similarity Distance},
  journal   = {CoRR},
  volume    = {abs/cs/0412098},
  year      = {2004},
  url       = {http://dblp.uni-trier.de/db/journals/corr/corr0412.html#abs-cs-0412098},
  ee        = {http://arxiv.org/abs/cs/0412098},
  keywords  = {dblp},
  added-at  = {2011-12-05T00:00:00.000+0100},
  biburl    = {http://www.bibsonomy.org/bibtex/2104bb0bb1c60788ad33a2dcbb667a9c0/dblp},
  interhash = {5e8ba9cd1aa478f3aa3c6ad1d452349f},
  intrahash = {104bb0bb1c60788ad33a2dcbb667a9c0}
}

@article{journals/corr/cs-SD-0303025,
  author    = {Cilibrasi, Rudi and Vit{\'a}nyi, Paul M. B. and de Wolf, Ronald},
  title     = {Algorithmic Clustering of Music},
  journal   = {CoRR},
  volume    = {cs.SD/0303025},
  year      = {2003},
  url       = {http://dblp.uni-trier.de/db/journals/corr/corr0303.html#cs-SD-0303025},
  ee        = {http://arxiv.org/abs/cs.SD/0303025},
  keywords  = {dblp},
  added-at  = {2011-12-05T00:00:00.000+0100},
  biburl    = {http://www.bibsonomy.org/bibtex/2980e695898971b62190c24968d6dc4ba/dblp},
  interhash = {374f0e8ee7f66a0964c80c852ffaf1ec},
  intrahash = {980e695898971b62190c24968d6dc4ba}
}

@article{journals/corr/abs-cs-0602065,
  author    = {Cilibrasi, Rudi and Vit{\'a}nyi, Paul M. B.},
  title     = {Similarity of Objects and the Meaning of Words},
  journal   = {CoRR},
  volume    = {abs/cs/0602065},
  year      = {2006},
  url       = {http://dblp.uni-trier.de/db/journals/corr/corr0602.html#abs-cs-0602065},
  ee        = {http://arxiv.org/abs/cs/0602065},
  keywords  = {dblp},
  added-at  = {2011-12-05T00:00:00.000+0100},
  biburl    = {http://www.bibsonomy.org/bibtex/2843814ffb84515f3d8473d783fd92bc0/dblp},
  interhash = {5e7e8d31cb6cf6c2a9366ea7a671ec55},
  intrahash = {843814ffb84515f3d8473d783fd92bc0}
}

@article{journals/corr/abs-0905-4039,
  author    = {Cilibrasi, Rudi and Vit{\'a}nyi, Paul M. B.},
  title     = {Normalized Web Distance and Word Similarity},
  journal   = {CoRR},
  volume    = {abs/0905.4039},
  year      = {2009},
  url       = {http://dblp.uni-trier.de/db/journals/corr/corr0905.html#abs-0905-4039},
  ee        = {http://arxiv.org/abs/0905.4039},
  keywords  = {dblp},
  added-at  = {2011-12-05T00:00:00.000+0100},
  biburl    = {http://www.bibsonomy.org/bibtex/2437af6bd0bd018d8ca52d51ecc169d40/dblp},
  interhash = {30d7c49f748c2e3f824cbaa35cdad46b},
  intrahash = {437af6bd0bd018d8ca52d51ecc169d40}
}

@article{journals/corr/abs-cs-0606048,
  author    = {Cilibrasi, Rudi and Vit{\'a}nyi, Paul M. B.},
  title     = {A New Quartet Tree Heuristic for Hierarchical Clustering},
  journal   = {CoRR},
  volume    = {abs/cs/0606048},
  year      = {2006},
  url       = {http://dblp.uni-trier.de/db/journals/corr/corr0606.html#abs-cs-0606048},
  ee        = {http://arxiv.org/abs/cs/0606048},
  keywords  = {dblp},
  added-at  = {2011-12-05T00:00:00.000+0100},
  biburl    = {http://www.bibsonomy.org/bibtex/24097e4eb25f3f99979360e6e13c40256/dblp},
  interhash = {bd1b0c6ef15260c78943278c6ff6c5b9},
  intrahash = {4097e4eb25f3f99979360e6e13c40256}
}

@article{journals/corr/abs-cs-0502068,
  author    = {Tromp, John and Cilibrasi, Rudi},
  title     = {Limits of {Rush Hour} Logic Complexity},
  journal   = {CoRR},
  volume    = {abs/cs/0502068},
  year      = {2005},
  url       = {http://dblp.uni-trier.de/db/journals/corr/corr0502.html#abs-cs-0502068},
  ee        = {http://arxiv.org/abs/cs/0502068},
  keywords  = {dblp},
  added-at  = {2011-12-05T00:00:00.000+0100},
  biburl    = {http://www.bibsonomy.org/bibtex/2f83e6f389f2afe8bb829abf62db308f0/dblp},
  interhash = {45bbda9c6b05c6193c97b4198edb3ea3},
  intrahash = {f83e6f389f2afe8bb829abf62db308f0}
}

@article{journals/corr/abs-0809-2553,
  author    = {Vit{\'a}nyi, Paul M. B. and Balbach, Frank J. and Cilibrasi, Rudi and Li, Ming},
  title     = {Normalized Information Distance},
  journal   = {CoRR},
  volume    = {abs/0809.2553},
  year      = {2008},
  url       = {http://dblp.uni-trier.de/db/journals/corr/corr0809.html#abs-0809-2553},
  ee        = {http://arxiv.org/abs/0809.2553},
  keywords  = {dblp},
  added-at  = {2011-12-05T00:00:00.000+0100},
  biburl    = {http://www.bibsonomy.org/bibtex/2d9a48907c77e47a0097dfe2a6160e82c/dblp},
  interhash = {12cb7ef3318773a61b1a521600ee1277},
  intrahash = {d9a48907c77e47a0097dfe2a6160e82c}
}

% ------------------------------------------------------- user-contributed records

@article{cs.CL/0412098,
  author    = {Cilibrasi, Rudi and Vitanyi, Paul M. B.},
  title     = {Automatic Meaning Discovery Using {Google}},
  number    = {cs.CL/0412098},
  pages     = {370--383},
  year      = {2004},
  month     = mar,
  day       = {15},
  note      = {v2},
  url       = {http://homepages.cwi.nl/~paulv/papers/amdug.pdf},
  keywords  = {distance-measure google machine-learning},
  abstract  = {We have found a method to automatically extract the meaning of words and phrases from the world-wide-web using Google page counts. The approach is novel in its unrestricted problem domain, simplicity of implementation, and manifestly ontological underpinnings. The world-wide-web is the largest database on earth, and the latent semantic context information entered by millions of independent users averages out to provide automatic meaning of useful quality. We demonstrate positive correlations, evidencing an underlying semantic structure, in both numerical symbol notations and number-name words in a variety of natural languages and contexts. Next, we demonstrate the ability to distinguish between colours and numbers, and to distinguish between 17th century Dutch painters; the ability to understand electrical terms, religious terms, and emergency incidents; we conduct a massive experiment in understanding WordNet categories; and finally we demonstrate the ability to do a simple automatic English-Spanish translation.},
  notes     = {ACM-class: I.2.4; I.2.7 Date (v1): Tue, 21 Dec 2004 16:05:36 GMT (127kb,S) Date (revised v2): Tue, 15 Mar 2005 16:53:43 GMT (58kb) cited by \cite{graham-rowe:2005:complearn} Code http://www.complearn.org/},
  size      = {31 pages},
  added-at  = {2011-11-23T16:24:13.000+0100},
  biburl    = {http://www.bibsonomy.org/bibtex/250308d5168f519ce89a71fa67574ac25/gromgull},
  interhash = {d0a6d81e08a236b41c69d12bad6406de},
  intrahash = {50308d5168f519ce89a71fa67574ac25}
}

% Merged from two exports of the same key (alternate bibsonomy record:
% http://www.bibsonomy.org/bibtex/2049b2bfae97bca112c4153f4742f3ca1/wyswilson).
@inproceedings{cilibrasi_vitanyi_2006,
  author    = {Cilibrasi, R. and Vitanyi, P.},
  title     = {Automatic Extraction of Meaning from the Web},
  booktitle = {Proceedings of the {IEEE} International Symposium on Information Theory},
  address   = {Seattle, USA},
  year      = {2006},
  keywords  = {imported},
  added-at  = {2011-04-14T07:06:28.000+0200},
  biburl    = {http://www.bibsonomy.org/bibtex/2aadd39f3c0413848cbc72092e9ac4fb1/wyswilson},
  interhash = {bf83f1151b49b769195862a47f82e61d},
  intrahash = {aadd39f3c0413848cbc72092e9ac4fb1}
}

@article{cilibrasi_vitanyi_2007,
  author    = {Cilibrasi, R. and Vitanyi, P.},
  title     = {The {Google} Similarity Distance},
  journal   = {{IEEE} Transactions on Knowledge and Data Engineering},
  volume    = {19},
  number    = {3},
  pages     = {370--383},
  year      = {2007},
  url       = {http://publication.wilsonwong.me},
  keywords  = {imported},
  added-at  = {2011-04-14T07:06:28.000+0200},
  biburl    = {http://www.bibsonomy.org/bibtex/2cb260a59280a6312cdd532222ec10930/wyswilson},
  interhash = {8fc73a93c327ea9a45ef793242ac3508},
  intrahash = {cb260a59280a6312cdd532222ec10930}
}

% Merged from two exports of the same key (alternate bibsonomy record:
% http://www.bibsonomy.org/bibtex/25797b28e3963b54f8e748b764a3c0896/wyswilson).
% The url is taken from the alternate record; the first export only carried
% a link that bibsonomy had flagged as broken.
@incollection{vitanyi_et_al_2009,
  author    = {Vitanyi, P. and Balbach, F. and Cilibrasi, R. and Li, M.},
  title     = {Normalized Information Distance},
  booktitle = {Information Theory and Statistical Learning},
  editor    = {Emmert-Streib, F. and Dehmer, M.},
  publisher = {Springer},
  address   = {New York},
  year      = {2009},
  url       = {http://ontology.csse.uwa.edu.au/reference/browse_paper.php?pid=233282036},
  keywords  = {imported},
  added-at  = {2011-04-14T07:06:28.000+0200},
  biburl    = {http://www.bibsonomy.org/bibtex/2d7887bf271cc02fda6713e9d64c7f0bf/wyswilson},
  interhash = {0e545ce78fa82c2e55e56e251aafc882},
  intrahash = {d7887bf271cc02fda6713e9d64c7f0bf}
}

@article{google-sim,
  author    = {Cilibrasi, Rudi L. and Vit{\'a}nyi, Paul M. B.},
  title     = {The {Google} Similarity Distance},
  year      = {2007},
  keywords  = {google similarity},
  added-at  = {2011-03-08T08:43:28.000+0100},
  biburl    = {http://www.bibsonomy.org/bibtex/2e2ae8887137141b9d5f895b5333cf226/lina.wolf},
  interhash = {04d3215af099ecec2b3cd04e81f4c629},
  intrahash = {e2ae8887137141b9d5f895b5333cf226}
}

@article{cilibrasi2007google,
  author      = {Cilibrasi, Rudi and Vitanyi, Paul M. B.},
  title       = {The {Google} Similarity Distance},
  journal     = {{IEEE} Transactions on Knowledge and Data Engineering},
  volume      = {19},
  number      = {3},
  pages       = {370--383},
  year        = {2007},
  url         = {http://www.citebase.org/abstract?id=oai:arXiv.org:cs/0412098},
  keywords    = {google relatedness_measures web_based imported},
  description = {[cs/0412098] The Google Similarity Distance},
  abstract    = {Words and phrases acquire meaning from the way they are used in society, from their relative semantics to other words and phrases. For computers the equivalent of `society' is `database,' and the equivalent of `use' is `way to search the database.' We present a new theory of similarity between words and phrases based on information distance and Kolmogorov complexity. To fix thoughts we use the world-wide-web as database, and Google as search engine. The method is also applicable to other search engines and databases. This theory is then applied to construct a method to automatically extract similarity, the Google similarity distance, of words and phrases from the world-wide-web using Google page counts. The world-wide-web is the largest database on earth, and the context information entered by millions of independent users averages out to provide automatic semantics of useful quality. We give applications in hierarchical clustering, classification, and language translation. We give examples to distinguish between colors and numbers, cluster names of paintings by 17th century Dutch masters and names of books by English novelists, the ability to understand emergencies, and primes, and we demonstrate the ability to do a simple automatic English-Spanish translation. Finally, we use the WordNet database as an objective baseline against which to judge the performance of our method. We conduct a massive randomized trial in binary classification using support vector machines to learn categories based on our Google distance, resulting in an a mean agreement of 87\% with the expert crafted WordNet categories.},
  added-at    = {2011-01-28T11:34:03.000+0100},
  biburl      = {http://www.bibsonomy.org/bibtex/200ba496f53767b92d5965db71eeea8bf/dbenz},
  interhash   = {8fc73a93c327ea9a45ef793242ac3508},
  intrahash   = {00ba496f53767b92d5965db71eeea8bf}
}

@article{cilibrasi2005clustering,
  author    = {Cilibrasi, R. and Vitanyi, P. M. B.},
  title     = {Clustering by compression},
  journal   = {{IEEE} Transactions on Information Theory},
  volume    = {51},
  number    = {4},
  pages     = {1523--1545},
  year      = {2005},
  month     = apr,
  doi       = {10.1109/TIT.2005.844059},
  issn      = {0018-9448},
  keywords  = {toread compression clustering},
  abstract  = {We present a new method for clustering based on compression. The method does not use subject-specific features or background knowledge, and works as follows: First, we determine a parameter-free, universal, similarity distance, the normalized compression distance or NCD, computed from the lengths of compressed data files (singly and in pairwise concatenation). Second, we apply a hierarchical clustering method. The NCD is not restricted to a specific application area, and works across application area boundaries. A theoretical precursor, the normalized information distance, co-developed by one of the authors, is provably optimal. However, the optimality comes at the price of using the noncomputable notion of Kolmogorov complexity. We propose axioms to capture the real-world setting, and show that the NCD approximates optimality. To extract a hierarchy of clusters from the distance matrix, we determine a dendrogram (ternary tree) by a new quartet method and a fast heuristic to implement it. The method is implemented and available as public software, and is robust under choice of different compressors. To substantiate our claims of universality and robustness, we report evidence of successful application in areas as diverse as genomics, virology, languages, literature, music, handwritten digits, astronomy, and combinations of objects from completely different domains, using statistical, dictionary, and block sorting compressors. In genomics, we presented new evidence for major questions in Mammalian evolution, based on whole-mitochondrial genomic analysis: the Eutherian orders and the Marsupionta hypothesis against the Theria hypothesis.},
  added-at  = {2011-01-28T11:32:21.000+0100},
  biburl    = {http://www.bibsonomy.org/bibtex/25156d51daa332b82b27cc4665dbff1f5/dbenz},
  interhash = {2016d3da3ebb9d17fdf0be152c2f2069},
  intrahash = {5156d51daa332b82b27cc4665dbff1f5}
}

@article{citeulike:4487,
  author               = {Cilibrasi, Rudi and Vitanyi, Paul M. B.},
  title                = {The {Google} Similarity Distance},
  year                 = {2007},
  month                = may,
  day                  = {30},
  eprint               = {cs.CL/0412098},
  archiveprefix        = {arXiv},
  url                  = {http://arxiv.org/abs/cs.CL/0412098},
  keywords             = {automatic-learning, google, linguistics, ontology, semantic},
  abstract             = {Words and phrases acquire meaning from the way they are used in society, from their relative semantics to other words and phrases. For computers the equivalent of `society' is `database,' and the equivalent of `use' is `way to search the database.' We present a new theory of similarity between words and phrases based on information distance and Kolmogorov complexity. To fix thoughts we use the world-wide-web as database, and Google as search engine. The method is also applicable to other search engines and databases. This theory is then applied to construct a method to automatically extract similarity, the Google similarity distance, of words and phrases from the world-wide-web using Google page counts. The world-wide-web is the largest database on earth, and the context information entered by millions of independent users averages out to provide automatic semantics of useful quality. We give applications in hierarchical clustering, classification, and language translation. We give examples to distinguish between colors and numbers, cluster names of paintings by 17th century Dutch masters and names of books by English novelists, the ability to understand emergencies, and primes, and we demonstrate the ability to do a simple automatic English-Spanish translation. Finally, we use the WordNet database as an objective baseline against which to judge the performance of our method. We conduct a massive randomized trial in binary classification using support vector machines to learn categories based on our Google distance, resulting in an a mean agreement of 87\% with the expert crafted WordNet categories.},
  citeulike-article-id = {4487},
  citeulike-linkout-0  = {http://arxiv.org/abs/cs.CL/0412098},
  citeulike-linkout-1  = {http://arxiv.org/pdf/cs.CL/0412098},
  posted-at            = {2004-12-28 20:46:48},
  priority             = {4},
  added-at             = {2010-12-17T18:47:41.000+0100},
  biburl               = {http://www.bibsonomy.org/bibtex/24e823daa890d0bafff91045fd4bedb0b/mortimer_m8},
  interhash            = {8fc73a93c327ea9a45ef793242ac3508},
  intrahash            = {4e823daa890d0bafff91045fd4bedb0b}
}

@article{journals/pr/CilibrasiV11,
  author    = {Cilibrasi, Rudi and Vit{\'a}nyi, Paul M. B.},
  title     = {A Fast Quartet tree heuristic for hierarchical clustering},
  journal   = {Pattern Recognition},
  volume    = {44},
  number    = {3},
  pages     = {662--677},
  year      = {2011},
  doi       = {10.1016/j.patcog.2010.08.033},
  url       = {http://dblp.uni-trier.de/db/journals/pr/pr44.html#CilibrasiV11},
  ee        = {http://dx.doi.org/10.1016/j.patcog.2010.08.033},
  keywords  = {dblp},
  added-at  = {2010-12-06T00:00:00.000+0100},
  biburl    = {http://www.bibsonomy.org/bibtex/29d1d18e90d3b3c38d04e78d0471800b8/dblp},
  interhash = {312a702ac7903cb4725b7063fe022afc},
  intrahash = {9d1d18e90d3b3c38d04e78d0471800b8}
}