Given a large collection of sparse vector data in a high dimensional space, we investigate the problem of finding all pairs of vectors whose similarity score (as determined by a function such as cosine distance) is above a given threshold. We propose a simple algorithm based on novel indexing and optimization strategies that solves this problem without relying on approximation methods or extensive parameter tuning. We show the approach efficiently handles a variety of datasets across a wide setting of similarity thresholds, with large speedups over previous state-of-the-art approaches.
Description
BibSonomy :: bibtex :: Scaling up all pairs similarity search
%0 Conference Paper
%1 citeulike:1288910
%A Bayardo, Roberto J.
%A Ma, Yiming
%A Srikant, Ramakrishnan
%B WWW '07: Proceedings of the 16th international conference on World Wide Web
%C New York, NY, USA
%D 2007
%I ACM
%K cosinus similarity google
%P 131--140
%R 10.1145/1242572.1242591
%T Scaling up all pairs similarity search
%U http://dx.doi.org/10.1145/1242572.1242591
%X Given a large collection of sparse vector data in a high dimensional space, we investigate the problem of finding all pairs of vectors whose similarity score (as determined by a function such as cosine distance) is above a given threshold. We propose a simple algorithm based on novel indexing and optimization strategies that solves this problem without relying on approximation methods or extensive parameter tuning. We show the approach efficiently handles a variety of datasets across a wide setting of similarity thresholds, with large speedups over previous state-of-the-art approaches.
%@ 978-1-59593-654-7
@inproceedings{citeulike:1288910,
abstract = {Given a large collection of sparse vector data in a high dimensional space, we investigate the problem of finding all pairs of vectors whose similarity score (as determined by a function such as cosine distance) is above a given threshold. We propose a simple algorithm based on novel indexing and optimization strategies that solves this problem without relying on approximation methods or extensive parameter tuning. We show the approach efficiently handles a variety of datasets across a wide setting of similarity thresholds, with large speedups over previous state-of-the-art approaches.},
added-at = {2010-01-25T13:46:25.000+0100},
address = {New York, NY, USA},
author = {Bayardo, Roberto J. and Ma, Yiming and Srikant, Ramakrishnan},
biburl = {https://www.bibsonomy.org/bibtex/22509f7c5214e727109440f052c8d94e9/cscholz},
booktitle = {WWW '07: Proceedings of the 16th international conference on World Wide Web},
citeulike-article-id = {1288910},
citeulike-linkout-0 = {http://portal.acm.org/citation.cfm?id=1242572.1242591},
citeulike-linkout-1 = {http://dx.doi.org/10.1145/1242572.1242591},
description = {BibSonomy :: bibtex :: Scaling up all pairs similarity search},
doi = {10.1145/1242572.1242591},
interhash = {e3dff704c5de1eab3a6cd2fda9cfdf53},
intrahash = {2509f7c5214e727109440f052c8d94e9},
isbn = {978-1-59593-654-7},
keywords = {cosinus similarity google},
location = {Banff, Alberta, Canada},
pages = {131--140},
posted-at = {2007-05-10 22:53:53},
priority = {5},
publisher = {ACM},
timestamp = {2010-10-15T09:33:46.000+0200},
title = {Scaling up all pairs similarity search},
url = {http://dx.doi.org/10.1145/1242572.1242591},
year = 2007
}