A. Tengli, Y. Yang, and N. Ma. Proceedings of the 20th international conference on Computational Linguistics, Stroudsburg, PA, USA, Association for Computational Linguistics, (2004)
DOI: 10.3115/1220355.1220497
Abstract
Information extraction from tables in web pages is a challenging problem due to the diverse nature of table formats and the vocabulary variants in attribute names. This paper presents a new approach to automated table extraction that exploits formatting cues in semi-structured HTML tables, learns lexical variants from training examples and uses a vector space model to deal with non-exact matches among labels. We conducted experiments with this method on a set of tables collected from 157 university web sites, and obtained the information extraction performance of 91.4% in the F1-measure, showing the effectiveness of the combined use of structural table parsing and example-based label learning.
%0 Conference Paper
%1 tengli2004learning
%A Tengli, Ashwin
%A Yang, Yiming
%A Ma, Nian Li
%B Proceedings of the 20th international conference on Computational Linguistics
%C Stroudsburg, PA, USA
%D 2004
%I Association for Computational Linguistics
%K INWERO extraction information learning table web
%R 10.3115/1220355.1220497
%T Learning table extraction from examples
%U http://dx.doi.org/10.3115/1220355.1220497
%X Information extraction from tables in web pages is a challenging problem due to the diverse nature of table formats and the vocabulary variants in attribute names. This paper presents a new approach to automated table extraction that exploits formatting cues in semi-structured HTML tables, learns lexical variants from training examples and uses a vector space model to deal with non-exact matches among labels. We conducted experiments with this method on a set of tables collected from 157 university web sites, and obtained the information extraction performance of 91.4% in the F1-measure, showing the effectiveness of the combined use of structural table parsing and example-based label learning.
@comment{tengli2004learning: conference paper in the COLING '04 proceedings.
  Fields are grouped as core bibliographic data, identifiers, then BibSonomy bookkeeping.
  NOTE(review): address appears to be the publisher's city and location the
  conference venue -- confirm before normalising for biblatex, where both
  names map to the same field.}
@inproceedings{tengli2004learning,
  author     = {Tengli, Ashwin and Yang, Yiming and Ma, Nian Li},
  title      = {Learning Table Extraction from Examples},
  booktitle  = {Proceedings of the 20th {International Conference on Computational Linguistics}},
  series     = {COLING '04},
  location   = {Geneva, Switzerland},
  publisher  = {Association for Computational Linguistics},
  address    = {Stroudsburg, PA, USA},
  year       = {2004},
  articleno  = {987},
  doi        = {10.3115/1220355.1220497},
  url        = {https://doi.org/10.3115/1220355.1220497},
  acmid      = {1220497},
  abstract   = {Information extraction from tables in web pages is a challenging problem due to the diverse nature of table formats and the vocabulary variants in attribute names. This paper presents a new approach to automated table extraction that exploits formatting cues in semi-structured HTML tables, learns lexical variants from training examples and uses a vector space model to deal with non-exact matches among labels. We conducted experiments with this method on a set of tables collected from 157 university web sites, and obtained the information extraction performance of 91.4\% in the F1-measure, showing the effectiveness of the combined use of structural table parsing and example-based label learning.},
  keywords   = {INWERO extraction information learning table web},
  added-at   = {2012-09-20T14:54:59.000+0200},
  timestamp  = {2013-03-01T23:28:03.000+0100},
  biburl     = {https://www.bibsonomy.org/bibtex/29b4568ebe3e9995185a37e5de2846053/porta},
  interhash  = {c4d46d3f1fed4c9d8830181b9c02d73c},
  intrahash  = {9b4568ebe3e9995185a37e5de2846053},
  groups     = {public},
  username   = {porta},
}