In this paper we present a class of general methods for information
extraction and automatic categorization. These methods exploit the features of
data compression techniques in order to define a measure of syntactic
remoteness between pairs of sequences of characters (e.g. texts) based on their
relative informatic content. Using this elementary tool it is possible to
implement several algorithms to address problems of information retrieval in
very different domains. We address in particular several linguistic motivated
problems and we present results for automatic language recognition, authorship
attribution, context-based classification as well as automatic universal
classification. We also discuss in detail how specific features of data
compression techniques could be used to introduce the notion of ``dictionary''
of a given sequence and of ``Artificial Text'' and we show how these new tools
can be used for information retrieval purposes. We finally discuss the
relevance of our results in non-linguistic fields, i.e. whenever the
information is codified in generic sequences of characters.
%0 Generic
%1 citeulike:515
%A Baronchelli, Andrea
%A Loreto, Vittorio
%D 2004
%K classification, information-extraction, linguistics
%T Data Compression approach to Information Extraction and Classification
%U http://arxiv.org/abs/cond-mat/0403233
%X In this paper we present a class of general methods for information
extraction and automatic categorization. These methods exploit the features of
data compression techniques in order to define a measure of syntactic
remoteness between pairs of sequences of characters (e.g. texts) based on their
relative informatic content. Using this elementary tool it is possible to
implement several algorithms to address problems of information retrieval in
very different domains. We address in particular several linguistic motivated
problems and we present results for automatic language recognition, authorship
attribution, context-based classification as well as automatic universal
classification. We also discuss in detail how specific features of data
compression techniques could be used to introduce the notion of ``dictionary''
of a given sequence and of ``Artificial Text'' and we show how these new tools
can be used for information retrieval purposes. We finally discuss the
relevance of our results in non-linguistic fields, i.e. whenever the
information is codified in generic sequences of characters.
@comment{arXiv preprint cond-mat/0403233. Core bibliographic fields come first; the trailing fields are CiteULike/BibSonomy export metadata that standard styles ignore.}
@electronic{citeulike:515,
  author               = {Baronchelli, Andrea and Loreto, Vittorio},
  title                = {Data Compression approach to Information Extraction and Classification},
  year                 = 2004,
  month                = mar,
  day                  = 9,
  eprint               = {cond-mat/0403233},
  archiveprefix        = {arXiv},
  url                  = {http://arxiv.org/abs/cond-mat/0403233},
  keywords             = {classification, information-extraction, linguistics},
  abstract             = {In this paper we present a class of general methods for information
extraction and automatic categorization. These methods exploit the features of
data compression techniques in order to define a measure of syntactic
remoteness between pairs of sequences of characters (e.g. texts) based on their
relative informatic content. Using this elementary tool it is possible to
implement several algorithms to address problems of information retrieval in
very different domains. We address in particular several linguistic motivated
problems and we present results for automatic language recognition, authorship
attribution, context-based classification as well as automatic universal
classification. We also discuss in detail how specific features of data
compression techniques could be used to introduce the notion of ``dictionary''
of a given sequence and of ``Artificial Text'' and we show how these new tools
can be used for information retrieval purposes. We finally discuss the
relevance of our results in non-linguistic fields, i.e. whenever the
information is codified in generic sequences of characters.},
  added-at             = {2010-12-17T18:47:41.000+0100},
  biburl               = {https://www.bibsonomy.org/bibtex/27203eca675e281d002deb199eb53b4d5/mortimer_m8},
  citeulike-article-id = {515},
  citeulike-linkout-0  = {http://arxiv.org/abs/cond-mat/0403233},
  citeulike-linkout-1  = {http://arxiv.org/pdf/cond-mat/0403233},
  interhash            = {56b620bafaebf3ebdd6352d5b3ea1286},
  intrahash            = {7203eca675e281d002deb199eb53b4d5},
  posted-at            = {2004-12-28 21:20:54},
  priority             = {2},
  timestamp            = {2010-12-20T11:11:25.000+0100}
}