% Conference paper by Jussi Myllymaki in the proceedings of the Tenth
% International World Wide Web Conference (WWW10), 2001.
% Exported from BibSonomy: the added-at, biburl, interhash, intrahash and
% timestamp fields are that service's bookkeeping; they are not standard
% BibTeX fields and standard bibliography styles ignore them.
% NOTE(review): the url points at a legacy CiteSeer host -- confirm it still
% resolves, or add a doi/urldate once verified against the publisher record.
% NOTE(review): the abstract is truncated ("...") and reads "such roadblocks
% such as"; left verbatim here -- check against the published abstract.
@inproceedings{Myllymaki:2001,
  author     = {Myllymaki, Jussi},
  title      = {Effective {Web} Data Extraction with Standard {XML} Technologies},
  booktitle  = {Proceedings of the Tenth International World Wide Web Conference ({WWW10})},
  year       = {2001},
  url        = {http://citeseer.nj.nec.com/452335.html},
  keywords   = {web_data_extraction},
  abstract   = {We discuss the problem of Web data extraction and describe an XML-based methodology whose goal extends far beyond simple ``screen scraping.'' An ideal data extraction process is able to digest target Web databases that are visible only as HTML pages, and create a local, identical replica of those databases as a result. What is needed in this process is much more than a Web crawler and set of Web site wrappers. A comprehensive data extraction process needs to deal with such roadblocks such as session identifiers, HTML forms, and client-side JavaScript, and data integration problems such as incompatible datasets and vocabularies, and missing and conflicting data. Proper data extraction also requires a solid data validation and error recovery service to handle data extraction failures, which are unavoidable...},
  added-at   = {2007-12-14T02:44:25.000+0100},
  timestamp  = {2007-12-14T02:44:25.000+0100},
  biburl     = {http://www.bibsonomy.org/bibtex/2527f1d41b4598dfc788a61c31c21e37e/diego_ma},
  interhash  = {113f373e3ed643c15857b21ee51fef27},
  intrahash  = {527f1d41b4598dfc788a61c31c21e37e},
}