% Journal article (World Wide Web 9(4), 2006).
% NOTE(review): the citation key looks like an export placeholder; it is left
% unchanged so that existing citations of this entry keep resolving.
@article{keyhere,
  author      = {Maguitman, Ana and Menczer, Filippo and Erdinc, Fulya and Roinestad, Heather and Vespignani, Alessandro},
  title       = {Algorithmic Computation and Approximation of Semantic Similarity},
  journal     = {World Wide Web},
  volume      = {9},
  number      = {4},
  pages       = {431--456},
  month       = dec,
  year        = {2006},
  doi         = {10.1007/s11280-006-8562-2},
  url         = {http://dx.doi.org/10.1007/s11280-006-8562-2},
  description = {SpringerLink - Journal Article},
  abstract    = {Automatic extraction of semantic information from text and links in Web pages is key to improving the quality of search results. However, the assessment of automatic semantic measures is limited by the coverage of user studies, which do not scale with the size, heterogeneity, and growth of the Web. Here we propose to leverage human-generated metadata---namely topical directories---to measure semantic relationships among massive numbers of pairs of Web pages or topics. The Open Directory Project classifies millions of URLs in a topical ontology, providing a rich source from which semantic relationships between Web pages can be derived. While semantic similarity measures based on taxonomies (trees) are well studied, the design of well-founded similarity measures for objects stored in the nodes of arbitrary ontologies (graphs) is an open problem. This paper defines an information-theoretic measure of semantic similarity that exploits both the hierarchical and non-hierarchical structure of an ontology. An experimental study shows that this measure improves significantly on the traditional taxonomy-based approach. This novel measure allows us to address the general question of how text and link analyses can be combined to derive measures of relevance that are in good agreement with semantic similarity. Surprisingly, the traditional use of text similarity turns out to be ineffective for relevance ranking.},
  biburl      = {http://www.bibsonomy.org/bibtex/2bd9b446bb86a758290ca2107abeb3de3/andreab},
  keywords    = {opendirectory text semantic similarity validation imported},
}

% Journal article (Glottometrics 11, 2005).
% Authors are given in "Last, First" form so that the compound surname
% "Ferrer i Cancho" is not split at the lowercase particle.
@article{vito2005,
  author   = {Ferrer i Cancho, Ramon and Servedio, Vito D. P.},
  title    = {Can simple models explain {Zipf's} law for all exponents?},
  journal  = {Glottometrics},
  volume   = {11},
  pages    = {1--8},
  year     = {2005},
  abstract = {H. Simon proposed a simple stochastic process for explaining Zipf's law for word frequencies. Here we introduce two similar generalizations of Simon's model that cover the same range of exponents as the standard Simon model. The mathematical approach followed minimizes the amount of mathematical background needed for deriving the exponent, compared to previous approaches to the standard Simon's model. Reviewing what is known from other simple explanations of Zipf's law, we conclude there is no single radically simple explanation covering the whole range of variation of the exponent of Zipf's law in humans. The meaningfulness of Zipf's law for word frequencies remains an open question.},
  biburl   = {http://www.bibsonomy.org/bibtex/2e33a94a50df2557c8dade8b77ee76e21/andreab},
  keywords = {linguistics zipf vito text d4.1 statistics tagora},
}

% Whole volume of Glottometrics recorded as a book.
% The editor is a corporate name, so it is double-braced to be kept as one unit.
% NOTE(review): publisher was absent in the export and is filled in from the
% editor field; confirm against the printed volume.
@book{glottometrics4,
  editor    = {{RAM Verlag}},
  title     = {Glottometrics 4, To honor {G. K. Zipf}},
  booktitle = {To honor {G. K. Zipf}},
  journal   = {Glottometrics},
  volume    = {4},
  publisher = {RAM Verlag},
  year      = {2002},
  biburl    = {http://www.bibsonomy.org/bibtex/278a78ea7fc7442e1ef280c7bfcf94b71/andreab},
  keywords  = {d4.1 book statistics zipf text tagora linguistics},
}

% Journal article (Language 33(4), 1957). The JSTOR URL is kept verbatim,
% including its percent-encoded characters.
@article{koutsoudas1957,
  author    = {Koutsoudas, Andreas},
  title     = {Mechanical Translation and {Zipf's} Law},
  journal   = {Language},
  volume    = {33},
  number    = {4},
  pages     = {545--552},
  publisher = {Linguistic Society of America},
  year      = {1957},
  url       = {http://links.jstor.org/sici?sici=0097-8507%28195710%2F12%2933%3A4%3C545%3AMTAZL%3E2.0.CO%3B2-Q},
  abstract  = {A problem which arises in the course of research on mechanical translation is the prediction of dictionary size. This article investigates the relation between empirical frequency laws and the function V(n)---the expected number of different words in an n-word sample of text. It is found that the probability-law proposed by Joos (1936) yields results which do not check well with experiments, and it is concluded that some modification of it is necessary for the purpose of vocabulary prediction.},
  biburl    = {http://www.bibsonomy.org/bibtex/2d2041174b1dca948c7183399f9a76a20/andreab},
  keywords  = {zipf dictionary words statistics size tagora text d4.1 linguistics law},
}