The automated categorisation (or classification) of texts into topical categories has a long history,
dating back at least to the early ’60s. Until the late ’80s, the most effective approach to the
problem seemed to be that of manually building automatic classifiers by means of knowledge-engineering
techniques, i.e. manually defining a set of rules encoding expert knowledge on how
to classify documents under a given set of categories. In the ’90s, with the booming production
and availability of on-line documents, automated text categorisation has witnessed an increased
and renewed interest, prompted by which the machine learning paradigm to automatic classifier
construction has emerged and definitely superseded the knowledge-engineering approach. Within
the machine learning paradigm, a general inductive process (called the learner) automatically
builds a classifier (also called the rule, or the hypothesis) by “learning”, from a set of previously
classified documents, the characteristics of one or more categories. The advantages of this approach
are a very good effectiveness, a considerable savings in terms of expert manpower, and domain
independence. In this survey we look at the main approaches that have been taken towards
automatic text categorisation within the general machine learning paradigm. Issues pertaining to
document indexing, classifier construction, and classifier evaluation, will be discussed in detail. A
final section will be devoted to the techniques that have specifically been devised for an emerging
application such as the automatic classification of Web pages into “Yahoo!-like” hierarchically
structured sets of categories.
%0 Journal Article
%1 citeulike:478973
%A Sebastiani, Fabrizio
%D 2002
%J ACM Computing Surveys
%K learning machine da
%T Machine learning in automated text categorization
%U http://portal.acm.org/ft_gateway.cfm?id=505283&type=pdf&dl=ACM&dl=ACM&CFID=11111111&CFTOKEN=2222222
%X The automated categorisation (or classification) of texts into topical categories has a long history,
dating back at least to the early ’60s. Until the late ’80s, the most effective approach to the
problem seemed to be that of manually building automatic classifiers by means of knowledge-engineering
techniques, i.e. manually defining a set of rules encoding expert knowledge on how
to classify documents under a given set of categories. In the ’90s, with the booming production
and availability of on-line documents, automated text categorisation has witnessed an increased
and renewed interest, prompted by which the machine learning paradigm to automatic classifier
construction has emerged and definitely superseded the knowledge-engineering approach. Within
the machine learning paradigm, a general inductive process (called the learner) automatically
builds a classifier (also called the rule, or the hypothesis) by “learning”, from a set of previously
classified documents, the characteristics of one or more categories. The advantages of this approach
are a very good effectiveness, a considerable savings in terms of expert manpower, and domain
independence. In this survey we look at the main approaches that have been taken towards
automatic text categorisation within the general machine learning paradigm. Issues pertaining to
document indexing, classifier construction, and classifier evaluation, will be discussed in detail. A
final section will be devoted to the techniques that have specifically been devised for an emerging
application such as the automatic classification of Web pages into “Yahoo!-like” hierarchically
structured sets of categories.
@comment{
Survey article record exported from BibSonomy (originally from CiteULike).
The citation key and the interhash/intrahash/biburl fields are kept exactly
as exported so the entry can still be matched against the online record.
volume, number, pages, month and doi were added to complete the journal
citation. NOTE(review): the url field appears to carry session parameters
(CFID/CFTOKEN) and a duplicated dl=ACM; prefer the doi field as the stable
identifier and confirm before relying on the url.
}
@article{citeulike:478973,
abstract = {The automated categorisation (or classification) of texts into topical categories has a long history,
dating back at least to the early ’60s. Until the late ’80s, the most effective approach to the
problem seemed to be that of manually building automatic classifiers by means of knowledge-engineering
techniques, i.e. manually defining a set of rules encoding expert knowledge on how
to classify documents under a given set of categories. In the ’90s, with the booming production
and availability of on-line documents, automated text categorisation has witnessed an increased
and renewed interest, prompted by which the machine learning paradigm to automatic classifier
construction has emerged and definitely superseded the knowledge-engineering approach. Within
the machine learning paradigm, a general inductive process (called the learner) automatically
builds a classifier (also called the rule, or the hypothesis) by “learning”, from a set of previously
classified documents, the characteristics of one or more categories. The advantages of this approach
are a very good effectiveness, a considerable savings in terms of expert manpower, and domain
independence. In this survey we look at the main approaches that have been taken towards
automatic text categorisation within the general machine learning paradigm. Issues pertaining to
document indexing, classifier construction, and classifier evaluation, will be discussed in detail. A
final section will be devoted to the techniques that have specifically been devised for an emerging
application such as the automatic classification of Web pages into “Yahoo!-like” hierarchically
structured sets of categories.},
added-at = {2007-02-22T18:27:17.000+0100},
author = {Sebastiani, Fabrizio},
biburl = {https://www.bibsonomy.org/bibtex/2be7ac6440d1b65334811201b70c376eb/apo},
citeulike-article-id = {478973},
comment = {survey recommended by claudia niederee},
doi = {10.1145/505282.505283},
interhash = {d945d9218673dad37dc2a06cbf9e554c},
intrahash = {be7ac6440d1b65334811201b70c376eb},
journal = {ACM Computing Surveys},
keywords = {learning machine da},
month = mar,
number = {1},
pages = {1--47},
priority = {2},
timestamp = {2007-02-22T18:27:18.000+0100},
title = {Machine learning in automated text categorization},
url = {http://portal.acm.org/ft_gateway.cfm?id=505283\&type=pdf\&dl=ACM\&dl=ACM\&CFID=11111111\&CFTOKEN=2222222},
volume = {34},
year = {2002},
}