We propose a hierarchical approach to document categorization that requires no pre-configuration and maps the semantic document space to a predefined taxonomy. The utilization of search engines to train a hierarchical classifier makes our approach more flexible than existing solutions which rely on (human) labeled data and are bound to a specific domain. We show that the structural information given by the taxonomy allows for a context aware construction of search queries and leads to higher tagging accuracy. We test our approach on different benchmark datasets and evaluate its performance on the single- and multi-tag assignment tasks. The experimental results show that our solution is as accurate as supervised classifiers for web page classification and still performs well when categorizing domain specific documents.
%0 Conference Paper
%1 1331742
%A Wetzker, Robert
%A Alpcan, Tansu
%A Bauckhage, Christian
%A Umbrath, Winfried
%A Albayrak, Sahin
%B WI '07: Proceedings of the IEEE/WIC/ACM International Conference on Web Intelligence
%C Washington, DC, USA
%D 2007
%I IEEE Computer Society
%K 2007 WI paper reuters textclassification
%P 482--486
%R http://dx.doi.org/10.1109/WI.2007.21
%T An unsupervised hierarchical approach to document categorization
%U http://portal.acm.org/citation.cfm?id=1331740.1331742&coll=Portal&dl=GUIDE&CFID=79977425&CFTOKEN=76456833#
%X We propose a hierarchical approach to document categorization that requires no pre-configuration and maps the semantic document space to a predefined taxonomy. The utilization of search engines to train a hierarchical classifier makes our approach more flexible than existing solutions which rely on (human) labeled data and are bound to a specific domain. We show that the structural information given by the taxonomy allows for a context aware construction of search queries and leads to higher tagging accuracy. We test our approach on different benchmark datasets and evaluate its performance on the single- and multi-tag assignment tasks. The experimental results show that our solution is as accurate as supervised classifiers for web page classification and still performs well when categorizing domain specific documents.
%@ 0-7695-3026-5
@inproceedings{1331742,
  abstract  = {We propose a hierarchical approach to document categorization that requires no pre-configuration and maps the semantic document space to a predefined taxonomy. The utilization of search engines to train a hierarchical classifier makes our approach more flexible than existing solutions which rely on (human) labeled data and are bound to a specific domain. We show that the structural information given by the taxonomy allows for a context aware construction of search queries and leads to higher tagging accuracy. We test our approach on different benchmark datasets and evaluate its performance on the single- and multi-tag assignment tasks. The experimental results show that our solution is as accurate as supervised classifiers for web page classification and still performs well when categorizing domain specific documents.},
  added-at  = {2008-07-29T07:53:46.000+0200},
  address   = {Washington, DC, USA},
  author    = {Wetzker, Robert and Alpcan, Tansu and Bauckhage, Christian and Umbrath, Winfried and Albayrak, Sahin},
  biburl    = {https://www.bibsonomy.org/bibtex/27b54735e4c3209fcb45b6f1bd05b0db0/rwdai},
  booktitle = {{WI} '07: Proceedings of the {IEEE/WIC/ACM} International Conference on Web Intelligence},
  doi       = {10.1109/WI.2007.21},
  interhash = {c36c574db6b294a8548535da6c63e516},
  intrahash = {7b54735e4c3209fcb45b6f1bd05b0db0},
  isbn      = {0-7695-3026-5},
  keywords  = {2007 WI paper reuters textclassification},
  pages     = {482--486},
  publisher = {IEEE Computer Society},
  timestamp = {2008-07-29T08:12:10.000+0200},
  title     = {An unsupervised hierarchical approach to document categorization},
  url       = {http://portal.acm.org/citation.cfm?id=1331740.1331742&coll=Portal&dl=GUIDE},
  year      = {2007},
}