% Bibliography: semantic similarity / semantic closeness measures.
%
% Entries were exported from BibSonomy. The fields added-at, timestamp,
% biburl, interhash and intrahash are BibSonomy bookkeeping metadata; standard
% bibliography styles ignore them. They are kept verbatim so the entries can
% be traced back to the original posts.
%
% Conventions used in this file:
%   - one field per line, values brace-delimited, equals signs aligned;
%   - page ranges use a double hyphen (en dash);
%   - the compound surname Montes-y-G{\'o}mez is wrapped in an extra pair of
%     braces so BibTeX does not mistake the lowercase "y" for a von particle;
%   - words in titles that must keep their capitalisation are brace-protected.

% Journal article introducing the Google similarity distance.
% The arXiv identifier was taken from the original (citebase) URL.
@article{Cilibrasi:2007,
  author     = {Cilibrasi, Rudi and Vit{\'a}nyi, Paul M. B.},
  title      = {The {Google} Similarity Distance},
  journal    = {IEEE Transactions on Knowledge and Data Engineering},
  volume     = {19},
  number     = {3},
  pages      = {370--383},
  year       = {2007},
  url        = {http://www.citebase.org/abstract?id=oai:arXiv.org:cs/0412098},
  eprint     = {cs/0412098},
  eprinttype = {arXiv},
  keywords   = {semantic_closeness web},
  abstract   = {Words and phrases acquire meaning from the way they are used
                in society, from their relative semantics to other words and
                phrases. For computers the equivalent of `society' is
                `database,' and the equivalent of `use' is `way to search the
                database.' We present a new theory of similarity between words
                and phrases based on information distance and Kolmogorov
                complexity. To fix thoughts we use the world-wide-web as
                database, and Google as search engine. The method is also
                applicable to other search engines and databases. This theory
                is then applied to construct a method to automatically extract
                similarity, the Google similarity distance, of words and
                phrases from the world-wide-web using Google page counts. The
                world-wide-web is the largest database on earth, and the
                context information entered by millions of independent users
                averages out to provide automatic semantics of useful quality.
                We give applications in hierarchical clustering,
                classification, and language translation. We give examples to
                distinguish between colors and numbers, cluster names of
                paintings by 17th century Dutch masters and names of books by
                English novelists, the ability to understand emergencies, and
                primes, and we demonstrate the ability to do a simple
                automatic English-Spanish translation. Finally, we use the
                WordNet database as an objective baseline against which to
                judge the performance of our method. We conduct a massive
                randomized trial in binary classification using support vector
                machines to learn categories based on our Google distance,
                resulting in a mean agreement of 87\% with the expert crafted
                WordNet categories.},
  added-at   = {2008-02-14T07:15:55.000+0100},
  timestamp  = {2008-02-14T07:15:55.000+0100},
  biburl     = {http://www.bibsonomy.org/bibtex/200ba496f53767b92d5965db71eeea8bf/diego_ma},
  interhash  = {8fc73a93c327ea9a45ef793242ac3508},
  intrahash  = {00ba496f53767b92d5965db71eeea8bf},
}

% Information-content based similarity in an IS-A taxonomy.
% The year matches the citation key and the "resnik99a" abstract URL.
@article{Resnik:1999,
  author     = {Resnik, Philip},
  title      = {Semantic Similarity in a Taxonomy: An Information-Based
                Measure and its Application to Problems of Ambiguity in
                Natural Language},
  journal    = {Journal of Artificial Intelligence Research},
  volume     = {11},
  pages      = {95--130},
  year       = {1999},
  url        = {http://www.cs.washington.edu/research/jair/abstracts/resnik99a.html},
  keywords   = {WordNet semantic_closeness},
  abstract   = {This article presents a measure of semantic similarity in an
                IS-A taxonomy based on the notion of shared information
                content. Experimental evaluation against a benchmark set of
                human similarity judgments demonstrates that the measure
                performs better than the traditional edge-counting approach.
                The article presents algorithms that take advantage of
                taxonomic similarity in resolving syntactic and semantic
                ambiguity, along with experimental results demonstrating their
                effectiveness.},
  added-at   = {2007-12-14T02:45:37.000+0100},
  timestamp  = {2007-12-14T02:45:37.000+0100},
  biburl     = {http://www.bibsonomy.org/bibtex/23d2969db6df305b60ee5ce220045c0cc/diego_ma},
  interhash  = {f10519367ccaa8a06ce5bc02ccec3270},
  intrahash  = {3d2969db6df305b60ee5ce220045c0cc},
}

% Conceptual-graph matching for information retrieval (first of three related
% conceptual-graph papers sharing the first author).
@inproceedings{Montes:2000,
  author     = {{Montes-y-G{\'o}mez}, Manuel and L{\'o}pez-L{\'o}pez, Aurelio and Gelbukh, Alexander},
  title      = {Information Retrieval with Conceptual Graph Matching},
  booktitle  = {Proc. DEXA-2000},
  series     = {Lecture Notes in Computer Science},
  volume     = {1873},
  pages      = {312--321},
  publisher  = {Springer-Verlag},
  year       = {2000},
  keywords   = {inf_retrieval semantic_closeness},
  abstract   = {The use of conceptual graphs for the representation of text
                contents in information retrieval is discussed. A method for
                measuring the similarity between two texts represented as
                conceptual graphs is presented. The method is based on
                well-known strategies of text comparison, such as Dice
                coefficient, with new elements introduced due to the bipartite
                nature of the conceptual graphs. Examples of the
                representation and comparison of the phrases are given. The
                structure of an information retrieval system using two-level
                document representation, traditional keywords and conceptual
                graphs, is presented.},
  added-at   = {2007-12-14T02:44:10.000+0100},
  timestamp  = {2007-12-14T02:44:10.000+0100},
  biburl     = {http://www.bibsonomy.org/bibtex/246ca669c1a9508c05bb687945dbc5407/diego_ma},
  interhash  = {ad82a928180ef0ca8f7bcdb285c5f042},
  intrahash  = {46ca669c1a9508c05bb687945dbc5407},
}

% Approximate matching of conceptual graphs using synonymy and
% subtype/supertype relations.
@inproceedings{Montes:2001,
  author     = {{Montes-y-G{\'o}mez}, Manuel and Gelbukh, Alexander and Baeza-Yates, Ricardo},
  title      = {Flexible Comparison of Conceptual Graphs},
  booktitle  = {Proc. DEXA-2001},
  series     = {Lecture Notes in Computer Science},
  volume     = {2113},
  pages      = {102--111},
  publisher  = {Springer-Verlag},
  year       = {2001},
  keywords   = {inf_retrieval semantic_closeness},
  abstract   = {Conceptual graphs allow for powerful and computationally
                affordable representation of the semantic contents of natural
                language texts. We propose a method of comparison (approximate
                matching) of conceptual graphs. The method takes into account
                synonymy and subtype/supertype relationships between the
                concepts and relations used in the conceptual graphs, thus
                allowing for greater flexibility of approximate matching. The
                method also allows the user to choose the desirable aspect of
                similarity in the cases when the two graphs can be generalized
                in different ways. The algorithm and examples of its
                application are presented. The results are potentially useful
                in a range of tasks requiring approximate semantic or another
                structural matching --- among them, information retrieval and
                text mining.},
  added-at   = {2007-12-14T02:44:08.000+0100},
  timestamp  = {2007-12-14T02:44:08.000+0100},
  biburl     = {http://www.bibsonomy.org/bibtex/256fc5f404aea37d8e99b2ae8fa068f58/diego_ma},
  interhash  = {9856d735d3660a527040d4eb6801cc8f},
  intrahash  = {56fc5f404aea37d8e99b2ae8fa068f58},
}

% Text mining over conceptual graphs.
% NOTE(review): the series/volume (LNAI 2393) may belong to a different
% conference than the DEXA-2002 named in booktitle -- confirm the venue
% against the publisher record before relying on this entry.
@inproceedings{Montes:2002,
  author     = {{Montes-y-G{\'o}mez}, Manuel and Gelbukh, Alexander and L{\'o}pez-L{\'o}pez, Aurelio},
  title      = {Text Mining at Detail Level Using Conceptual Graphs},
  booktitle  = {Proc. DEXA-2002},
  series     = {Lecture Notes in Artificial Intelligence},
  volume     = {2393},
  pages      = {122--136},
  publisher  = {Springer-Verlag},
  year       = {2002},
  keywords   = {semantic_closeness},
  abstract   = {Text mining is defined as knowledge discovery in large text
                collections. It detects interesting patterns such as clusters,
                associations, deviations, similarities, and differences in
                sets of texts. Current text mining methods use simplistic
                representations of text contents, such as keyword vectors,
                which imply serious limitations on the kind and meaningfulness
                of possible discoveries. We show how to do some typical mining
                tasks using conceptual graphs as formal but meaningful
                representation of texts. Our methods involve qualitative and
                quantitative comparison of conceptual graphs, conceptual
                clustering, building a conceptual hierarchy, and application
                of data mining techniques to this hierarchy in order to detect
                interesting associations and deviations. Our experiments show
                that, despite widespread misbelief, detailed meaningful mining
                with conceptual graphs is computationally affordable.},
  added-at   = {2007-12-14T02:44:07.000+0100},
  timestamp  = {2007-12-14T02:44:07.000+0100},
  biburl     = {http://www.bibsonomy.org/bibtex/275561b4bb59407a24c6af5d62fb4d04c/diego_ma},
  interhash  = {e071c5cd86614ac00f388268973e245d},
  intrahash  = {75561b4bb59407a24c6af5d62fb4d04c},
}

% Comparison of lexical resources; no page numbers were recorded in the
% original export.
@inproceedings{Litkowski:1999,
  author     = {Litkowski, Kenneth C.},
  title      = {Towards a Meaning-Full Comparison of Lexical Resources},
  booktitle  = {Proc. ACL-SIGLEX99},
  year       = {1999},
  url        = {http://www.clres.com},
  keywords   = {lexical_resources semantic_closeness},
  added-at   = {2007-12-14T02:42:38.000+0100},
  timestamp  = {2007-12-14T02:42:38.000+0100},
  biburl     = {http://www.bibsonomy.org/bibtex/25fdb9c18941f7a0880b94054e029f0e3/diego_ma},
  interhash  = {c1fbe74e8823becc2fc91041b6458d4a},
  intrahash  = {5fdb9c18941f7a0880b94054e029f0e3},
}