% Bibliography records that all carry the keyword "distributional".
% Layout: one field per line, authors in "Last, First" form, month given
% as the standard three-letter macro. The fields added, modified, group,
% description, id, priority, day, biburl and biburl-alt are bookkeeping
% fields that standard styles ignore.

@phdthesis{Curran:2003,
  author   = {Curran, James Richard},
  title    = {From Distributional to Semantic Similarity},
  school   = {Institute for Communicating and Collaborative Systems, School of Informatics, University of Edinburgh},
  year     = {2003},
  url      = {http://www.era.lib.ed.ac.uk/bitstream/1842/563/2/IP030023.pdf},
  added    = {2007-12-03 15:18:56 -0500},
  modified = {2008-07-04 12:38:50 -0400},
  group    = {Data Mining; Knowledge Organization},
  abstract = {Lexical-semantic resources, including thesauri and WordNet, have
              been successfully incorporated into a wide range of applications
              in Natural Language Processing. However they are very difficult
              and expensive to create and maintain, and their usefulness has
              been severely hampered by their limited coverage, bias and
              inconsistency. Automated and semi-automated methods for
              developing such resources are therefore crucial for further
              resource development and improved application performance.
              Systems that extract thesauri often identify similar words using
              the distributional hypothesis that similar words appear in
              similar contexts. This approach involves using corpora to
              examine the contexts each word appears in and then calculating
              the similarity between context distributions. Different
              definitions of context can be used, and I begin by examining how
              different types of extracted context influence similarity. To be
              of most benefit these systems must be capable of finding
              synonyms for rare words. Reliable context counts for rare events
              can only be extracted from vast collections of text. In this
              dissertation I describe how to extract contexts from a corpus of
              over 2 billion words. I describe techniques for processing text
              on this scale and examine the trade-off between context
              accuracy, information content and quantity of text analysed.
              Distributional similarity is at best an approximation to
              semantic similarity. I develop improved approximations motivated
              by the intuition that some events in the context distribution
              are more indicative of meaning than others. For instance, the
              object-of-verb context wear is far more indicative of a clothing
              noun than get. However, existing distributional techniques do
              not effectively utilise this information. The new
              context-weighted similarity metric I propose in this
              dissertation significantly outperforms every distributional
              similarity metric described in the literature. Nearest-neighbour
              similarity algorithms scale poorly with vocabulary and context
              vector size. To overcome this problem I introduce a new
              context-weighted approximation algorithm with bounded complexity
              in context vector size that significantly reduces the system
              runtime with only a minor performance penalty. I also describe a
              parallelized version of the system that runs on a Beowulf
              cluster for the 2 billion word experiments. To evaluate the
              context-weighted similarity measure I compare ranked similarity
              lists against gold-standard resources using precision and
              recall-based measures from Information Retrieval, since the
              alternative, application-based evaluation, can often be
              influenced by distributional as well as semantic similarity. I
              also perform a detailed analysis of the final results using
              WordNet. Finally, I apply my similarity metric to the task of
              assigning words to WordNet semantic categories. I demonstrate
              that this new approach outperforms existing methods and
              overcomes some of their weaknesses.},
  biburl   = {http://www.bibsonomy.org/bibtex/257de9154de9e4848eb5989f9ca7fdcbb/hotho},
  keywords = {similarity semantic wordnet toread distributional},
}

@article{Fullerton2007,
  author   = {Fullerton, Don and Heutel, Garth},
  title    = {The general equilibrium incidence of environmental taxes},
  journal  = {Journal of Public Economics},
  volume   = {91},
  number   = {3--4},
  pages    = {571--591},
  year     = {2007},
  month    = apr,
  url      = {http://www.sciencedirect.com/science/article/B6V76-4KW5W80-1/1/7fed32fd75efad965e20e353e86b45df},
  biburl   = {http://www.bibsonomy.org/bibtex/2998a566fa99515e5b4dc67d00f9dab72/smicha},
  keywords = {Distributional burdens},
}

@article{Saleh1987,
  author   = {Saleh, A. K. Md. Ehsanes and Sen, Pranab K.},
  title    = {On the asymptotic distributional risk properties of pre-test and shrinkage {L1}-estimators},
  journal  = {Computational Statistics \& Data Analysis},
  volume   = {5},
  number   = {4},
  pages    = {289--299},
  year     = {1987},
  month    = sep,
  url      = {http://www.sciencedirect.com/science/article/B6V8V-476DTGG-6/1/5eee7efa615471cbc7791560d25cf7fe},
  biburl   = {http://www.bibsonomy.org/bibtex/2559f715f3ec79e5fd39a3630bfbef6bd/smicha},
  keywords = {risk Asymptotic distributional},
}

@article{Caliebe2003,
  author      = {Caliebe, Amke and R{\"o}sler, Uwe},
  title       = {Fixed points with finite variance of a smoothing transformation},
  journal     = {Stochastic Processes and their Applications},
  volume      = {107},
  number      = {1},
  pages       = {105--129},
  year        = {2003},
  month       = sep,
  url         = {http://www.sciencedirect.com/science/article/B6V1B-48TK8JH-1/1/f7176102c1f427b9c5fbc3336ed557f9},
  description = {Stochastic Processes and their Applications},
  biburl      = {http://www.bibsonomy.org/bibtex/26df9af947020e5f9dfd40cfed5e1f791/smicha},
  keywords    = {point Distributional equations fixed},
}

@article{Alexeev1995,
  author      = {Alexeev, Michael and Kaganovich, Michael},
  title       = {Distributional constraints on the speed of privatization},
  journal     = {Economics Letters},
  volume      = {48},
  number      = {2},
  pages       = {213--219},
  year        = {1995},
  month       = may,
  url         = {http://www.sciencedirect.com/science/article/B6V84-3YVD01D-1R/2/ca9b6037868b514842f030d68c1139d1},
  description = {Economics Letters},
  biburl      = {http://www.bibsonomy.org/bibtex/24188d2f9a0c16293114351330188c9c4/smicha},
  keywords    = {Distributional constraints},
}

@article{Bennett1998,
  author      = {Bennett, John},
  title       = {Prices versus quantities and distributional inefficiency},
  journal     = {Economics Letters},
  volume      = {58},
  number      = {1},
  pages       = {63--67},
  year        = {1998},
  month       = jan,
  day         = {01},
  url         = {http://www.sciencedirect.com/science/article/B6V84-3T51RH8-1J/2/f22b50c2909f0c7df63f0473c00ecdf3},
  description = {Economics Letters},
  biburl      = {http://www.bibsonomy.org/bibtex/2cf4f9385cbea7d64c741f0b3e1e336f7/smicha},
  keywords    = {inefficiency Distributional},
}

@article{Yang2003b,
  author      = {Yang, Zhenlin and Abeysinghe, Tilak},
  title       = {A score test for {Box-Cox} functional form},
  journal     = {Economics Letters},
  volume      = {79},
  number      = {1},
  pages       = {107--115},
  year        = {2003},
  month       = apr,
  url         = {http://www.sciencedirect.com/science/article/B6V84-47PPDDJ-7/2/6c95d0f62cc83afe70d4605cbae49204},
  description = {Economics Letters},
  biburl      = {http://www.bibsonomy.org/bibtex/24e60022c0965ba06f322000404efb1db/smicha},
  keywords    = {Distributional properties},
}

@article{Somanathan2006,
  author      = {Somanathan, E.},
  title       = {Valuing lives equally: Distributional weights for welfare analysis},
  journal     = {Economics Letters},
  volume      = {90},
  number      = {1},
  pages       = {122--125},
  year        = {2006},
  month       = jan,
  url         = {http://www.sciencedirect.com/science/article/B6V84-4H6XKRX-2/2/b829fd3f65bc1d6cacd93559e23c51dd},
  description = {Economics Letters},
  biburl      = {http://www.bibsonomy.org/bibtex/27e28e29d95b182d76a969e753460b870/smicha},
  keywords    = {weights Distributional},
}

% This record was present twice under the same key (a repeated-entry error
% in BibTeX); both copies described the same manuscript and are merged here.
% The second copy's BibSonomy link is kept in biburl-alt and the keyword
% list is the union of both copies.
% "Submitted for publication" was stored in the year field; it now lives in
% note, and the entry type is unpublished because no journal is recorded.
% NOTE(review): year 2005 is taken from the PDF file name in url - confirm.
@unpublished{mohammadSubmittedDistributional,
  author     = {Mohammad, Saif and Hirst, Graeme},
  title      = {Distributional measures as proxies for semantic relatedness},
  note       = {Submitted for publication},
  year       = {2005},
  url        = {http://ftp.cs.toronto.edu/pub/gh/Mohammad+Hirst-2005.pdf},
  biburl     = {http://www.bibsonomy.org/bibtex/2fe1ed4dfc0e42165de44853564c7f6af/stefano},
  biburl-alt = {http://www.bibsonomy.org/bibtex/2fe1ed4dfc0e42165de44853564c7f6af/stumme},
  keywords   = {text relatedness measure similarity semantic measures distributional},
}

% The record previously carried both address and location with the same
% value; only address is kept.
@book{harris68mathematical,
  author    = {Harris, Z. S.},
  title     = {Mathematical Structures of Language},
  publisher = {Wiley},
  address   = {New York},
  year      = {1968},
  biburl    = {http://www.bibsonomy.org/bibtex/2f834ac9131f49062a5f1e362c35c8de8/stumme},
  keywords  = {measure similarity semantic measures hypothesis distributional},
}

% Typed as incollection because the record has booktitle and publisher but
% no journal. The former volume value "1952-59" was a fragment of the
% reprint title quoted in note, so it is not repeated as a volume number.
% The reprint information was stored in abstract; it is a note, not an
% abstract.
@incollection{firth57synopsis,
  author    = {Firth, J. R.},
  title     = {A synopsis of linguistic theory 1930--55},
  booktitle = {Studies in Linguistic Analysis (special volume of the Philological Society)},
  publisher = {The Philological Society},
  address   = {Oxford},
  pages     = {1--32},
  year      = {1957},
  note      = {Reprinted in: Palmer, F. R. (ed.) (1968). Selected Papers of J. R. Firth 1952-59, pages 168-205. Longmans, London.},
  biburl    = {http://www.bibsonomy.org/bibtex/25e3d6c72cdd123a638f71886d78f3c1e/stumme},
  keywords  = {relatedness measure similarity theory measures distributional},
}

% arXiv preprint: archiveprefix marks the eprint identifier as an arXiv id
% so that eprint-aware styles can format and link it.
@misc{citeulike:407968,
  author        = {Kesseb{\"o}hmer, Marc and Slassi, Mehdi},
  title         = {A distributional limit law for the continued fraction digit sum},
  year          = {2005},
  month         = sep,
  eprint        = {math.NT/0509559},
  archiveprefix = {arXiv},
  url           = {http://arxiv.org/abs/math.NT/0509559},
  id            = {407968},
  priority      = {2},
  description   = {citeulike},
  abstract      = {We consider the continued fraction digits as random variables
                   measured with respect to Lebesgue measure. The logarithmically
                   scaled and normalized fluctuation process of the digit sums
                   converges strongly distributional to a random variable
                   uniformly distributed on the unit interval. For this process
                   normalized linearly we determine a large deviation asymptotic.},
  biburl        = {http://www.bibsonomy.org/bibtex/2b6d9b5296f35ff6d724291faf0812773/a_olympia},
  keywords      = {limit law fraction distributional continued},
}