Near-duplicate web documents are abundant. Two such documents differ from each other in a very small portion that displays advertisements, for example. Such differences are irrelevant for web search. So the quality of a web crawler increases if it can assess whether a newly crawled web page is a near-duplicate of a previously crawled web page or not. In the course of developing a near-duplicate detection system for a multi-billion page repository, we make two research contributions. First, we demonstrate that Charikar's fingerprinting technique is appropriate for this goal. Second, we present an algorithmic technique for identifying existing f-bit fingerprints that differ from a given fingerprint in at most k bit-positions, for small k. Our technique is useful for both online queries (single fingerprints) and batch queries (multiple fingerprints). Experimental evaluation over real data confirms the practicality of our design.
%0 Conference Paper
%1 1242592
%A Manku, Gurmeet Singh
%A Jain, Arvind
%A Das Sarma, Anish
%B WWW '07: Proceedings of the 16th international conference on World Wide Web
%C New York, NY, USA
%D 2007
%I ACM
%K crawler searchengine searching sw0809-02 uri
%P 141--150
%R http://doi.acm.org/10.1145/1242572.1242592
%T Detecting near-duplicates for web crawling
%U http://portal.acm.org/citation.cfm?id=1242592#
%X Near-duplicate web documents are abundant. Two such documents differ from each other in a very small portion that displays advertisements, for example. Such differences are irrelevant for web search. So the quality of a web crawler increases if it can assess whether a newly crawled web page is a near-duplicate of a previously crawled web page or not. In the course of developing a near-duplicate detection system for a multi-billion page repository, we make two research contributions. First, we demonstrate that Charikar's fingerprinting technique is appropriate for this goal. Second, we present an algorithmic technique for identifying existing f-bit fingerprints that differ from a given fingerprint in at most k bit-positions, for small k. Our technique is useful for both online queries (single fingerprints) and batch queries (multiple fingerprints). Experimental evaluation over real data confirms the practicality of our design.
%@ 978-1-59593-654-7
@comment{Manku, Jain, and Das Sarma, WWW 2007. The doi field stores the bare DOI (no resolver prefix), and the compound surname "Das Sarma" is written in "Last, First" form so that it is not split.}
@inproceedings{1242592,
  abstract    = {Near-duplicate web documents are abundant. Two such documents differ from each other in a very small portion that displays advertisements, for example. Such differences are irrelevant for web search. So the quality of a web crawler increases if it can assess whether a newly crawled web page is a near-duplicate of a previously crawled web page or not. In the course of developing a near-duplicate detection system for a multi-billion page repository, we make two research contributions. First, we demonstrate that Charikar's fingerprinting technique is appropriate for this goal. Second, we present an algorithmic technique for identifying existing f-bit fingerprints that differ from a given fingerprint in at most k bit-positions, for small k. Our technique is useful for both online queries (single fingerprints) and batch queries (multiple fingerprints). Experimental evaluation over real data confirms the practicality of our design.},
  added-at    = {2008-11-27T09:28:51.000+0100},
  address     = {New York, NY, USA},
  author      = {Manku, Gurmeet Singh and Jain, Arvind and Das Sarma, Anish},
  biburl      = {https://www.bibsonomy.org/bibtex/256ef299eac200b1c36a0300a8e619451/lysander07},
  booktitle   = {WWW '07: Proceedings of the 16th international conference on World Wide Web},
  description = {DAS IST EIN KOMMENTAR},
  doi         = {10.1145/1242572.1242592},
  interhash   = {610b65c975aecd87e4dccc0ca0a2cbee},
  intrahash   = {56ef299eac200b1c36a0300a8e619451},
  isbn        = {978-1-59593-654-7},
  keywords    = {crawler searchengine searching sw0809-02 uri},
  location    = {Banff, Alberta, Canada},
  pages       = {141--150},
  publisher   = {ACM},
  timestamp   = {2009-01-27T15:24:50.000+0100},
  title       = {Detecting near-duplicates for web crawling},
  url         = {http://portal.acm.org/citation.cfm?id=1242592},
  year        = {2007},
}