We compare several algorithms for identifying mirrored
hosts on the World Wide Web. The algorithms operate on
the basis of URL strings and linkage data: the type of
information easily available from web proxies and
crawlers. Identification of mirrored hosts can improve
web-based information retrieval in several ways: First,
by identifying mirrored hosts, search engines can avoid
storing and returning duplicate documents. Second,
several new information retrieval techniques for the
Web make inferences based on the explicit links among
hypertext documents -- mirroring perturbs their graph
model and degrades performance. Third, mirroring
information can be used to redirect users to alternate
mirror sites to compensate for various failures, and
can thus improve the performance of web browsers and
proxies.
%0 Journal Article
%1 BBD+00
%A Bharat, Krishna
%A Broder, Andrei Z.
%A Dean, Jeffrey
%A Henzinger, Monika Rauch
%D 2000
%J JASIS
%K searchengine www03 wwwbook wwwkap17
%N 12
%P 1114--1122
%T A comparison of techniques to find mirrored hosts on the WWW
%V 51
%X We compare several algorithms for identifying mirrored
hosts on the World Wide Web. The algorithms operate on
the basis of URL strings and linkage data: the type of
information easily available from web proxies and
crawlers. Identification of mirrored hosts can improve
web-based information retrieval in several ways: First,
by identifying mirrored hosts, search engines can avoid
storing and returning duplicate documents. Second,
several new information retrieval techniques for the
Web make inferences based on the explicit links among
hypertext documents -- mirroring perturbs their graph
model and degrades performance. Third, mirroring
information can be used to redirect users to alternate
mirror sites to compensate for various failures, and
can thus improve the performance of web browsers and
proxies.
@article{BBD+00,
  author     = {Bharat, Krishna and Broder, Andrei Z. and Dean, Jeffrey and Henzinger, Monika Rauch},
  title      = {A comparison of techniques to find mirrored hosts on the {WWW}},
  journal    = {Journal of the American Society for Information Science},
  volume     = {51},
  number     = {12},
  pages      = {1114--1122},
  year       = {2000},
  abstract   = {We compare several algorithms for identifying mirrored
                hosts on the World Wide Web. The algorithms operate on
                the basis of URL strings and linkage data: the type of
                information easily available from web proxies and
                crawlers. Identification of mirrored hosts can improve
                web-based information retrieval in several ways: First,
                by identifying mirrored hosts, search engines can avoid
                storing and returning duplicate documents. Second,
                several new information retrieval techniques for the
                Web make inferences based on the explicit links among
                hypertext documents -- mirroring perturbs their graph
                model and degrades performance. Third, mirroring
                information can be used to redirect users to alternate
                mirror sites to compensate for various failures, and
                can thus improve the performance of web browsers and
                proxies.},
  keywords   = {searchengine www03 wwwbook wwwkap17},
  biburl     = {https://www.bibsonomy.org/bibtex/22e9fd73843945474196965d9151963ca/lysander07},
  interhash  = {76b58f6ca91fb7beccab1a9d0a7a3e4c},
  intrahash  = {2e9fd73843945474196965d9151963ca},
  added-at   = {2008-12-05T15:35:32.000+0100},
  timestamp  = {2009-01-27T15:24:50.000+0100},
}