With the rapidly increasing pace at which Web content is evolving, particularly social media, preserving the Web and its evolution over time becomes an important challenge. Meaningful analysis of Web content lends itself to an entity-centric view to organise Web resources according to the information objects related to them. Therefore, the crucial challenge is to extract, detect and correlate entities from a vast number of heterogeneous Web resources where the nature and quality of the content may vary heavily. While a wealth of information extraction tools aid this process, we believe that, the consolidation of automatically extracted data has to be treated as an equally important step in order to ensure high quality and non-ambiguity of generated data. In this paper we present an approach which is based on an iterative cycle exploiting Web data for (1) targeted archiving/crawling of Web objects, (2) entity extraction, and detection, and (3) entity correlation. The long-term goal is to preserve Web content over time and allow its navigation and analysis based on well-formed structured RDF data about entities.
%0 Conference Paper
%1 conf/ercimdl/DietzeMDRPDS12
%A Dietze, Stefan
%A Maynard, Diana
%A Demidova, Elena
%A Risse, Thomas
%A Peters, Wim
%A Doka, Katerina
%A Stavrakas, Yannis
%B 2nd International Workshop on Semantic Digital Archives
%D 2012
%E Mitschick, Annett
%E Loizides, Fernando
%E Predoiu, Livia
%E Nürnberger, Andreas
%E Ross, Seamus
%I CEUR-WS.org
%K arcomem dbpedia enrichment freebase myown terence
%P 18-29
%T Entity Extraction and Consolidation for Social Web Content Preservation
%U http://dblp.uni-trier.de/db/conf/ercimdl/sda2012.html#DietzeMDRPDS12
%V 912
%X With the rapidly increasing pace at which Web content is evolving, particularly social media, preserving the Web and its evolution over time becomes an important challenge. Meaningful analysis of Web content lends itself to an entity-centric view to organise Web resources according to the information objects related to them. Therefore, the crucial challenge is to extract, detect and correlate entities from a vast number of heterogeneous Web resources where the nature and quality of the content may vary heavily. While a wealth of information extraction tools aid this process, we believe that, the consolidation of automatically extracted data has to be treated as an equally important step in order to ensure high quality and non-ambiguity of generated data. In this paper we present an approach which is based on an iterative cycle exploiting Web data for (1) targeted archiving/crawling of Web objects, (2) entity extraction, and detection, and (3) entity correlation. The long-term goal is to preserve Web content over time and allow its navigation and analysis based on well-formed structured RDF data about entities.
@inproceedings{conf/ercimdl/DietzeMDRPDS12,
abstract = {With the rapidly increasing pace at which Web content is evolving, particularly social media, preserving the Web and its evolution over time becomes an important challenge. Meaningful analysis of Web content lends itself to an entity-centric view to organise Web resources according to the information objects related to them. Therefore, the crucial challenge is to extract, detect and correlate entities from a vast number of heterogeneous Web resources where the nature and quality of the content may vary heavily. While a wealth of information extraction tools aid this process, we believe that, the consolidation of automatically extracted data has to be treated as an equally important step in order to ensure high quality and non-ambiguity of generated data. In this paper we present an approach which is based on an iterative cycle exploiting Web data for (1) targeted archiving/crawling of Web objects, (2) entity extraction, and detection, and (3) entity correlation. The long-term goal is to preserve Web content over time and allow its navigation and analysis based on well-formed structured RDF data about entities. },
added-at = {2012-12-05T23:30:38.000+0100},
author = {Dietze, Stefan and Maynard, Diana and Demidova, Elena and Risse, Thomas and Peters, Wim and Doka, Katerina and Stavrakas, Yannis},
biburl = {https://www.bibsonomy.org/bibtex/2120e281fc31faf6b8a2d5a2ac6ed3ef7/demidova},
booktitle = {2nd International Workshop on Semantic Digital Archives},
crossref = {conf/ercimdl/2012sda},
editor = {Mitschick, Annett and Loizides, Fernando and Predoiu, Livia and Nürnberger, Andreas and Ross, Seamus},
ee = {http://ceur-ws.org/Vol-912/paper1.pdf},
interhash = {444586959c487f283fd657dbfa433f43},
intrahash = {120e281fc31faf6b8a2d5a2ac6ed3ef7},
keywords = {arcomem dbpedia enrichment freebase myown terence},
pages = {18-29},
publisher = {CEUR-WS.org},
series = {CEUR Workshop Proceedings},
timestamp = {2013-11-29T22:28:30.000+0100},
title = {Entity Extraction and Consolidation for Social Web Content Preservation.},
url = {http://dblp.uni-trier.de/db/conf/ercimdl/sda2012.html#DietzeMDRPDS12},
volume = 912,
year = 2012
}