Researchers in the Digital Humanities and journalists need to monitor, collect and analyze fresh online content regarding current events such as the Ebola outbreak or the Ukraine crisis on demand. However, existing focused crawling approaches only consider topical aspects while ignoring temporal aspects and therefore cannot achieve thematically coherent and fresh Web collections. Especially Social Media provide a rich source of fresh content, which is not used by state-of-the-art focused crawlers. In this paper we address the issues of enabling the collection of fresh and relevant Web and Social Web content for a topic of interest through seamless integration of Web and Social Media in a novel integrated focused crawler. The crawler collects Web and Social Media content in a single system and exploits the stream of fresh Social Media content for guiding the crawler.
%0 Conference Paper
%1 Gossen:2015:IIF:2756406.2756925
%A Gossen, Gerhard
%A Demidova, Elena
%A Risse, Thomas
%B Proceedings of the 15th ACM/IEEE-CS Joint Conference on Digital Libraries
%C New York, NY, USA
%D 2015
%I ACM
%K alexandria sobigdata
%P 75--84
%R 10.1145/2756406.2756925
%T iCrawl: Improving the Freshness of Web Collections by Integrating Social Web and Focused Web Crawling
%U https://arxiv.org/abs/1612.06202
%X Researchers in the Digital Humanities and journalists need to monitor, collect and analyze fresh online content regarding current events such as the Ebola outbreak or the Ukraine crisis on demand. However, existing focused crawling approaches only consider topical aspects while ignoring temporal aspects and therefore cannot achieve thematically coherent and fresh Web collections. Especially Social Media provide a rich source of fresh content, which is not used by state-of-the-art focused crawlers. In this paper we address the issues of enabling the collection of fresh and relevant Web and Social Web content for a topic of interest through seamless integration of Web and Social Media in a novel integrated focused crawler. The crawler collects Web and Social Media content in a single system and exploits the stream of fresh Social Media content for guiding the crawler.
%@ 978-1-4503-3594-2
@comment{
  iCrawl paper, JCDL 2015 (ACM Digital Library record plus arXiv preprint).
  Notes for maintainers:
  - Braced words in the title keep their capitalisation under styles that
    convert titles to sentence case.
  - address holds the publisher city; the conference city is stored in venue
    so it cannot collide with address under biblatex, where address is an
    alias of location.
  - eprint and eprinttype carry the arXiv identifier that is also present in url.
  - acmid, added-at, biburl, interhash, intrahash, keywords, numpages and
    timestamp are export metadata that standard styles ignore.
}
@inproceedings{Gossen:2015:IIF:2756406.2756925,
  abstract   = {Researchers in the Digital Humanities and journalists need to monitor, collect and analyze fresh online content regarding current events such as the Ebola outbreak or the Ukraine crisis on demand. However, existing focused crawling approaches only consider topical aspects while ignoring temporal aspects and therefore cannot achieve thematically coherent and fresh Web collections. Especially Social Media provide a rich source of fresh content, which is not used by state-of-the-art focused crawlers. In this paper we address the issues of enabling the collection of fresh and relevant Web and Social Web content for a topic of interest through seamless integration of Web and Social Media in a novel integrated focused crawler. The crawler collects Web and Social Media content in a single system and exploits the stream of fresh Social Media content for guiding the crawler.},
  acmid      = {2756925},
  added-at   = {2016-08-24T11:11:45.000+0200},
  address    = {New York, NY, USA},
  author     = {Gossen, Gerhard and Demidova, Elena and Risse, Thomas},
  biburl     = {https://www.bibsonomy.org/bibtex/231ba5dc80b909408f3e72f56cbc34620/alexandriaproj},
  booktitle  = {Proceedings of the 15th {ACM/IEEE-CS} Joint Conference on Digital Libraries},
  doi        = {10.1145/2756406.2756925},
  eprint     = {1612.06202},
  eprinttype = {arXiv},
  interhash  = {fd8aeee8dc84ce1e3c09e7aad610c014},
  intrahash  = {31ba5dc80b909408f3e72f56cbc34620},
  isbn       = {978-1-4503-3594-2},
  keywords   = {alexandria sobigdata},
  numpages   = {10},
  pages      = {75--84},
  publisher  = {ACM},
  series     = {JCDL '15},
  timestamp  = {2017-04-10T12:01:39.000+0200},
  title      = {{iCrawl}: Improving the Freshness of {Web} Collections by Integrating {Social Web} and Focused {Web} Crawling},
  url        = {https://arxiv.org/abs/1612.06202},
  venue      = {Knoxville, Tennessee, USA},
  year       = {2015},
}