The basic approach in text categorization is to represent documents by single words. However, often other features are utilized to achieve better classification results. In this paper, our attention is focused on bigrams and 2-itemsets. We compare the performance improvement in terms of classification accuracy when these features are used to extend the single words-based document representation on two standard text corpora: Reuters-21578 and 20 Newsgroups. For this comparison we use the multinomial Naive Bayes classifier and five different feature selection approaches. Algorithms for bigrams and 2-itemsets discovery are presented as well. Our results show a statistically significant improvement when bigrams and also 2-itemsets are incorporated. However, in the case of 2-itemsets it is important to use an appropriate feature selection method. On the other hand, even when a simple feature selection approach is applied to discover bigrams the classification accuracy improves. The conclusion is that, in our case, it is not very effective to extend document representation with 2-itemsets because bigrams achieve better results and discovering them is less resource-consuming.
%0 Conference Paper
%1 Tesar2006
%A Tesar, Roman
%A Strnad, Vaclav
%A Jezek, Karel
%A Poesio, Massimo
%B DocEng '06: Proceedings of the 2006 ACM symposium on Document engineering
%C New York, NY, USA
%D 2006
%I ACM
%K bigram textcateg
%P 138--146
%R 10.1145/1166160.1166197
%T Extending the single words-based document model: a comparison of bigrams and 2-itemsets
%U http://portal.acm.org/citation.cfm?id=1166160.1166197
%X The basic approach in text categorization is to represent documents by single words. However, often other features are utilized to achieve better classification results. In this paper, our attention is focused on bigrams and 2-itemsets. We compare the performance improvement in terms of classification accuracy when these features are used to extend the single words-based document representation on two standard text corpora: Reuters-21578 and 20 Newsgroups. For this comparison we use the multinomial Naive Bayes classifier and five different feature selection approaches. Algorithms for bigrams and 2-itemsets discovery are presented as well. Our results show a statistically significant improvement when bigrams and also 2-itemsets are incorporated. However, in the case of 2-itemsets it is important to use an appropriate feature selection method. On the other hand, even when a simple feature selection approach is applied to discover bigrams the classification accuracy improves. The conclusion is that, in our case, it is not very effective to extend document representation with 2-itemsets because bigrams achieve better results and discovering them is less resource-consuming.
%@ 1-59593-515-0
@inproceedings{Tesar2006,
  abstract    = {The basic approach in text categorization is to represent documents by single words. However, often other features are utilized to achieve better classification results. In this paper, our attention is focused on bigrams and 2-itemsets. We compare the performance improvement in terms of classification accuracy when these features are used to extend the single words-based document representation on two standard text corpora: Reuters-21578 and 20 Newsgroups. For this comparison we use the multinomial Naive Bayes classifier and five different feature selection approaches. Algorithms for bigrams and 2-itemsets discovery are presented as well. Our results show a statistically significant improvement when bigrams and also 2-itemsets are incorporated. However, in the case of 2-itemsets it is important to use an appropriate feature selection method. On the other hand, even when a simple feature selection approach is applied to discover bigrams the classification accuracy improves. The conclusion is that, in our case, it is not very effective to extend document representation with 2-itemsets because bigrams achieve better results and discovering them is less resource-consuming.},
  added-at    = {2009-05-14T07:56:06.000+0200},
  address     = {New York, NY, USA},
  author      = {Tesar, Roman and Strnad, Vaclav and Jezek, Karel and Poesio, Massimo},
  biburl      = {https://www.bibsonomy.org/bibtex/248079e2741af01306bc91583f028be30/jamesh},
  booktitle   = {DocEng '06: Proceedings of the 2006 ACM symposium on Document engineering},
  description = {Extending the single words-based document model},
  doi         = {10.1145/1166160.1166197},
  interhash   = {ee2cf973053b39bb099ecccdda0e1385},
  intrahash   = {48079e2741af01306bc91583f028be30},
  isbn        = {1-59593-515-0},
  keywords    = {bigram textcateg},
  pages       = {138--146},
  publisher   = {ACM},
  timestamp   = {2009-05-14T07:56:06.000+0200},
  title       = {Extending the single words-based document model: a comparison of bigrams and 2-itemsets},
  url         = {http://portal.acm.org/citation.cfm?id=1166160.1166197},
  venue       = {Amsterdam, The Netherlands},
  year        = 2006
}