We have developed an efficient way to determine the syntactic similarity of files and have
applied it to every document on the World Wide Web. Using this mechanism, we built a
clustering of all the documents that are syntactically similar. Possible applications include a
"Lost and Found" service, filtering the results of Web searches, updating widely distributed
web-pages, and identifying violations of intellectual property rights.
%0 Journal Article
%1 shingling-broder
%A Broder, Andrei Z.
%A Glassman, Steven C.
%A Manasse, Mark S.
%A Zweig, Geoffrey
%B Selected papers from the sixth international conference on World Wide Web
%C Amsterdam, The Netherlands
%D 1997
%I Elsevier Science Publishers B. V.
%J Comput. Netw. ISDN Syst.
%K qual text-mining web-mining
%P 1157--1166
%R 10.1016/S0169-7552(97)00031-7
%T Syntactic clustering of the Web
%U http://dx.doi.org/10.1016/S0169-7552(97)00031-7
%V 29
%X We have developed an efficient way to determine the syntactic similarity of files and have
applied it to every document on the World Wide Web. Using this mechanism, we built a
clustering of all the documents that are syntactically similar. Possible applications include a
"Lost and Found" service, filtering the results of Web searches, updating widely distributed
web-pages, and identifying violations of intellectual property rights.
@comment{shingling-broder: conference paper (sixth international conference
  on World Wide Web) that was published in a journal volume, hence the
  article entry type. The fields booktitle, venue and the bookmarking
  metadata (added-at, biburl, citeulike-*, interhash, intrahash, posted-at,
  priority, timestamp) are not used by standard bibliography styles and are
  kept only as annotations.}
@article{shingling-broder,
  author               = {Broder, Andrei Z. and Glassman, Steven C. and Manasse, Mark S. and Zweig, Geoffrey},
  title                = {Syntactic Clustering of the {Web}},
  journal              = {Computer Networks and {ISDN} Systems},
  booktitle            = {Selected papers from the sixth international conference on World Wide Web},
  volume               = {29},
  pages                = {1157--1166},
  month                = sep,
  year                 = {1997},
  publisher            = {Elsevier Science Publishers B. V.},
  address              = {Amsterdam, The Netherlands},
  venue                = {Santa Clara, California, United States},
  issn                 = {0169-7552},
  doi                  = {10.1016/S0169-7552(97)00031-7},
  url                  = {http://dx.doi.org/10.1016/S0169-7552(97)00031-7},
  keywords             = {qual text-mining web-mining},
  abstract             = {We have developed an efficient way to determine the syntactic similarity of files and have
    applied it to every document on the World Wide Web. Using this mechanism, we built a
    clustering of all the documents that are syntactically similar. Possible applications include a
    "Lost and Found" service, filtering the results of Web searches, updating widely distributed
    web-pages, and identifying violations of intellectual property rights.},
  added-at             = {2011-09-28T23:59:32.000+0200},
  biburl               = {https://www.bibsonomy.org/bibtex/22fd4640e1b2c96a587ac58a07b03c1be/dimitargn},
  citeulike-article-id = {379517},
  citeulike-linkout-0  = {http://portal.acm.org/citation.cfm?id=283370},
  citeulike-linkout-1  = {http://dx.doi.org/10.1016/S0169-7552(97)00031-7},
  interhash            = {424cdc36335873e4d8c0bed6e07e872e},
  intrahash            = {2fd4640e1b2c96a587ac58a07b03c1be},
  posted-at            = {2011-09-09 19:47:36},
  priority             = {2},
  timestamp            = {2011-10-13T17:46:13.000+0200},
}