| Authors: |
Yiming Yang
and S. Slattery
and Rayid Ghani
|
| URL: |
http://www.cs.cmu.edu/~rayid/mypapers/hypertext-jiis01.ps |
| Tags: |
Classification
WWW
|
| Abstract: |
Hypertext poses new research challenges for text classification. Hyperlinks,
HTML tags, category labels distributed over linked documents, and
meta data extracted from related Web sites all provide rich information
for classifying hypertext documents. How to appropriately represent
that information and automatically learn statistical patterns for
solving hypertext classification problems is an open question. This
paper seeks a principled approach to providing the answers. Specifically,
we define five hypertext regularities which may (or may not) hold
in a particular application domain, and whose presence (or absence)
may significantly influence the optimal design of a classifier. Using
three hypertext datasets and three well-known learning algorithms
(Naive Bayes, Nearest Neighbor, and First Order Inductive Learner),
we examine these regularities in different domains, and compare alternative
ways to exploit them. Our results show that the identification of
hypertext regularities in the data and the selection of appropriate
representations for hypertext in particular domains are crucial,
but seldom obvious, in real-world problems. We find that adding the
words in the linked neighborhood to the page having those links (both
inlinks and outlinks) were helpful for all our classifiers on one
data set, but more harmful than helpful for two out of the three
classifiers on the remaining datasets. We also observed that extracting
meta data from related Web sites was extremely useful for improving
classification accuracy in some of those domains. Finally, the relative
performance of the classifiers being tested provided insights into
their strengths and limitations for solving classification problems
involving diverse and often noisy Web pages. |
% Journal article record. Fields are one per line, ordered: names, title,
% venue, locators, year, identifiers, annotations, then bookkeeping fields.
@article{Yang2002,
  author    = {Yang, Yiming and Slattery, S. and Ghani, Rayid},
  title     = {A Study of Approaches to Hypertext Categorization},
  journal   = {Journal of Intelligent Information Systems},
  volume    = {18},
  number    = {2-3},
  pages     = {219--241},
  year      = {2002},
  url       = {http://www.cs.cmu.edu/~rayid/mypapers/hypertext-jiis01.ps},
  keywords  = {Classification WWW},
  abstract  = {Hypertext poses new research challenges for text classification. Hyperlinks,
HTML tags, category labels distributed over linked documents, and
meta data extracted from related Web sites all provide rich information
for classifying hypertext documents. How to appropriately represent
that information and automatically learn statistical patterns for
solving hypertext classification problems is an open question. This
paper seeks a principled approach to providing the answers. Specifically,
we define five hypertext regularities which may (or may not) hold
in a particular application domain, and whose presence (or absence)
may significantly influence the optimal design of a classifier. Using
three hypertext datasets and three well-known learning algorithms
(Naive Bayes, Nearest Neighbor, and First Order Inductive Learner),
we examine these regularities in different domains, and compare alternative
ways to exploit them. Our results show that the identification of
hypertext regularities in the data and the selection of appropriate
representations for hypertext in particular domains are crucial,
but seldom obvious, in real-world problems. We find that adding the
words in the linked neighborhood to the page having those links (both
inlinks and outlinks) were helpful for all our classifiers on one
data set, but more harmful than helpful for two out of the three
classifiers on the remaining datasets. We also observed that extracting
meta data from related Web sites was extremely useful for improving
classification accuracy in some of those domains. Finally, the relative
performance of the classifiers being tested provided insights into
their strengths and limitations for solving classification problems
involving diverse and often noisy Web pages.},
  owner     = {Marco},
  timestamp = {2007.05.18},
}