We report our experience in implementing UbiCrawler, a
scalable distributed web crawler, using the Java
programming language. The main features of UbiCrawler
are platform independence, fault tolerance, a very
effective assignment function for partitioning the
domain to crawl, and more in general the complete
decentralization of every task. The necessity of
handling very large sets of data has highlighted some
limitations of the Java APIs, which prompted the authors
to partially reimplement them.
%0 Journal Article
%1 Boldi:2004:USF
%A Boldi, Paolo
%A Codenotti, Bruno
%A Santini, Massimo
%A Vigna, Sebastiano
%D 2004
%J Software: Practice and Experience
%K searchengine www03 wwwbook wwwkap17
%N 8
%P 711--726
%T UbiCrawler: a scalable fully distributed Web
crawler
%V 34
%X We report our experience in implementing UbiCrawler, a
scalable distributed web crawler, using the Java
programming language. The main features of UbiCrawler
are platform independence, fault tolerance, a very
effective assignment function for partitioning the
domain to crawl, and more in general the complete
decentralization of every task. The necessity of
handling very large sets of data has highlighted some
limitations of the Java APIs, which prompted the authors
to partially reimplement them.
@article{Boldi:2004:USF,
  author    = {Boldi, Paolo and Codenotti, Bruno and Santini, Massimo and Vigna, Sebastiano},
  title     = {{UbiCrawler}: a scalable fully distributed {Web} crawler},
  journal   = {Software: Practice and Experience},
  volume    = {34},
  number    = {8},
  pages     = {711--726},
  year      = {2004},
  month     = jul,
  day       = {10},
  abstract  = {We report our experience in implementing UbiCrawler, a
               scalable distributed web crawler, using the Java
               programming language. The main features of UbiCrawler
               are platform independence, fault tolerance, a very
               effective assignment function for partitioning the
               domain to crawl, and more in general the complete
               decentralization of every task. The necessity of
               handling very large sets of data has highlighted some
               limitations of the Java APIs, which prompted the authors
               to partially reimplement them.},
  keywords  = {searchengine www03 wwwbook wwwkap17},
  added-at  = {2008-12-05T15:57:18.000+0100},
  timestamp = {2009-01-27T15:24:50.000+0100},
  biburl    = {https://www.bibsonomy.org/bibtex/2e6d82d0c391ad9b3fb9a86039655bece/lysander07},
  interhash = {c16df1c30407241815f78988acf72bfe},
  intrahash = {e6d82d0c391ad9b3fb9a86039655bece},
}