Web-page classification is much more difficult than pure-text classification due to a large variety of noisy information embedded in Web pages. In this paper, we propose a new Web-page classification algorithm based on Web summarization for improving the accuracy. We first give empirical evidence that ideal Web-page summaries generated by human editors can indeed improve the performance of Web-page classification algorithms. We then propose a new Web summarization-based classification algorithm and evaluate it along with several other state-of-the-art text summarization algorithms on the LookSmart Web directory. Experimental results show that our proposed summarization-based classification algorithm achieves an approximately 8.8% improvement as compared to pure-text-based classification algorithm. We further introduce an ensemble classifier using the improved summarization algorithm and show that it achieves about 12.9% improvement over pure-text based methods.
%0 Conference Paper
%1 Shen:2004:WCT:1008992.1009035
%A Shen, Dou
%A Chen, Zheng
%A Yang, Qiang
%A Zeng, Hua-Jun
%A Zhang, Benyu
%A Lu, Yuchang
%A Ma, Wei-Ying
%B Proceedings of the 27th annual international ACM SIGIR conference on Research and development in information retrieval
%C New York, NY, USA
%D 2004
%I ACM
%K 2004 bachelor:2011:bachmann classification summaration webpage
%P 242-249
%R 10.1145/1008992.1009035
%T Web-page classification through summarization
%U http://doi.acm.org/10.1145/1008992.1009035
%X Web-page classification is much more difficult than pure-text classification due to a large variety of noisy information embedded in Web pages. In this paper, we propose a new Web-page classification algorithm based on Web summarization for improving the accuracy. We first give empirical evidence that ideal Web-page summaries generated by human editors can indeed improve the performance of Web-page classification algorithms. We then propose a new Web summarization-based classification algorithm and evaluate it along with several other state-of-the-art text summarization algorithms on the LookSmart Web directory. Experimental results show that our proposed summarization-based classification algorithm achieves an approximately 8.8% improvement as compared to pure-text-based classification algorithm. We further introduce an ensemble classifier using the improved summarization algorithm and show that it achieves about 12.9% improvement over pure-text based methods.
%@ 1-58113-881-4
@inproceedings{Shen:2004:WCT:1008992.1009035,
abstract = {Web-page classification is much more difficult than pure-text classification due to a large variety of noisy information embedded in Web pages. In this paper, we propose a new Web-page classification algorithm based on Web summarization for improving the accuracy. We first give empirical evidence that ideal Web-page summaries generated by human editors can indeed improve the performance of Web-page classification algorithms. We then propose a new Web summarization-based classification algorithm and evaluate it along with several other state-of-the-art text summarization algorithms on the LookSmart Web directory. Experimental results show that our proposed summarization-based classification algorithm achieves an approximately 8.8\% improvement as compared to pure-text-based classification algorithm. We further introduce an ensemble classifier using the improved summarization algorithm and show that it achieves about 12.9\% improvement over pure-text based methods.},
acmid = {1009035},
added-at = {2011-12-02T12:44:30.000+0100},
address = {New York, NY, USA},
author = {Shen, Dou and Chen, Zheng and Yang, Qiang and Zeng, Hua-Jun and Zhang, Benyu and Lu, Yuchang and Ma, Wei-Ying},
biburl = {https://www.bibsonomy.org/bibtex/2b83fca9d43e5afdea78b9791cc07890c/telekoma},
booktitle = {Proceedings of the 27th annual international {ACM} {SIGIR} conference on Research and development in information retrieval},
description = {Web-page classification through summarization},
doi = {10.1145/1008992.1009035},
interhash = {328ff5b51cb573cd1d253f339892c029},
intrahash = {b83fca9d43e5afdea78b9791cc07890c},
isbn = {1-58113-881-4},
keywords = {2004 bachelor:2011:bachmann classification summaration webpage},
location = {Sheffield, United Kingdom},
numpages = {8},
pages = {242--249},
publisher = {ACM},
series = {SIGIR '04},
timestamp = {2012-01-04T14:56:46.000+0100},
title = {Web-page classification through summarization},
url = {http://doi.acm.org/10.1145/1008992.1009035},
year = 2004
}