Traditionally, information extraction from web tables has focused on small, more or less homogeneous corpora, often based on assumptions about the use of <table> tags. A multitude of different HTML implementations of web tables make these approaches difficult to scale. In this paper, we approach the problem of domain-independent information extraction from web tables by shifting our attention from the tree-based representation of webpages to a variation of the two-dimensional visual box model used by web browsers to display the information on the screen. The thereby obtained topological and style information allows us to fill the gap created by missing domain-specific knowledge about content and table templates. We believe that, in a future step, this approach can become the basis for a new way of large-scale knowledge acquisition from the current "Visual Web."
%0 Conference Paper
%1 gatterbauer2007towards
%A Gatterbauer, Wolfgang
%A Bohunsky, Paul
%A Herzog, Marcus
%A Krüpl, Bernhard
%A Pollak, Bernhard
%B Proceedings of the 16th international conference on World Wide Web
%C New York, NY, USA
%D 2007
%I ACM
%K INWERO domain extraction independent information table web
%P 71--80
%R 10.1145/1242572.1242583
%T Towards domain-independent information extraction from web tables
%U http://doi.acm.org/10.1145/1242572.1242583
%X Traditionally, information extraction from web tables has focused on small, more or less homogeneous corpora, often based on assumptions about the use of <table> tags. A multitude of different HTML implementations of web tables make these approaches difficult to scale. In this paper, we approach the problem of domain-independent information extraction from web tables by shifting our attention from the tree-based representation of webpages to a variation of the two-dimensional visual box model used by web browsers to display the information on the screen. The thereby obtained topological and style information allows us to fill the gap created by missing domain-specific knowledge about content and table templates. We believe that, in a future step, this approach can become the basis for a new way of large-scale knowledge acquisition from the current "Visual Web."
%@ 978-1-59593-654-7
% Gatterbauer et al., WWW '07: web-table extraction based on the browser's visual box model rather than the HTML tag tree.
@inproceedings{gatterbauer2007towards,
  abstract  = {Traditionally, information extraction from web tables has focused on small, more or less homogeneous corpora, often based on assumptions about the use of <table> tags. A multitude of different HTML implementations of web tables make these approaches difficult to scale. In this paper, we approach the problem of domain-independent information extraction from web tables by shifting our attention from the tree-based representation of webpages to a variation of the two-dimensional visual box model used by web browsers to display the information on the screen. The thereby obtained topological and style information allows us to fill the gap created by missing domain-specific knowledge about content and table templates. We believe that, in a future step, this approach can become the basis for a new way of large-scale knowledge acquisition from the current ``Visual Web.''},
  acmid     = {1242583},
  added-at  = {2012-09-20T14:56:03.000+0200},
  address   = {New York, NY, USA},
  author    = {Gatterbauer, Wolfgang and Bohunsky, Paul and Herzog, Marcus and Kr{\"u}pl, Bernhard and Pollak, Bernhard},
  biburl    = {https://www.bibsonomy.org/bibtex/2a5be13781838c20be5ec3bc4ad72556b/porta},
  booktitle = {Proceedings of the 16th international conference on {World Wide Web}},
  doi       = {10.1145/1242572.1242583},
  groups    = {public},
  interhash = {61bd631988fe5a7495e3c54586d794f9},
  intrahash = {a5be13781838c20be5ec3bc4ad72556b},
  isbn      = {978-1-59593-654-7},
  keywords  = {INWERO domain extraction independent information table web},
  location  = {Banff, Alberta, Canada},
  numpages  = {10},
  pages     = {71--80},
  publisher = {ACM},
  series    = {WWW '07},
  timestamp = {2013-03-01T23:26:16.000+0100},
  title     = {Towards domain-independent information extraction from web tables},
  url       = {https://doi.org/10.1145/1242572.1242583},
  username  = {porta},
  year      = {2007},
}