Massive textual data management and mining usually rely on automatic text classification technology. Term weighting is a basic problem in text classification and directly affects the classification accuracy. Since the traditional TF-IDF (term frequency & inverse document frequency) is not fully effective for text classification, various alternatives have been proposed by researchers. In this paper we make comparative studies on different term weighting schemes and propose a new term weighting scheme, TF-IGM (term frequency & inverse gravity moment), as well as its variants. TF-IGM incorporates a new statistical model to precisely measure the class distinguishing power of a term. Particularly, it makes full use of the fine-grained term distribution across different classes of text. The effectiveness of TF-IGM is validated by extensive experiments of text classification using SVM (support vector machine) and kNN (k nearest neighbors) classifiers on three commonly used corpora. The experimental results show that TF-IGM outperforms the famous TF-IDF and the state-of-the-art supervised term weighting schemes. In addition, some new findings different from previous studies are obtained and analyzed in depth in the paper.
ScienceDirect Full Text PDF:C\:\\Users\\klaus\\Zotero\\storage\\2LJTR86G\\Chen et al. - 2016 - Turning from TF-IDF to TF-IGM for term weighting i.pdf:application/pdf;ScienceDirect Snapshot:C\:\\Users\\klaus\\Zotero\\storage\\JNSJGKLV\\S0957417416304870.html:text/html
%0 Journal Article
%1 chen_turning_2016
%A Chen, Kewen
%A Zhang, Zuping
%A Long, Jun
%A Zhang, Hao
%D 2016
%J Expert Systems with Applications
%K termgewichtung
%P 245-260
%R 10.1016/j.eswa.2016.09.009
%T Turning from TF-IDF to TF-IGM for term weighting in text classification
%U http://www.sciencedirect.com/science/article/pii/S0957417416304870
%V 66
%X Massive textual data management and mining usually rely on automatic text classification technology. Term weighting is a basic problem in text classification and directly affects the classification accuracy. Since the traditional TF-IDF (term frequency & inverse document frequency) is not fully effective for text classification, various alternatives have been proposed by researchers. In this paper we make comparative studies on different term weighting schemes and propose a new term weighting scheme, TF-IGM (term frequency & inverse gravity moment), as well as its variants. TF-IGM incorporates a new statistical model to precisely measure the class distinguishing power of a term. Particularly, it makes full use of the fine-grained term distribution across different classes of text. The effectiveness of TF-IGM is validated by extensive experiments of text classification using SVM (support vector machine) and kNN (k nearest neighbors) classifiers on three commonly used corpora. The experimental results show that TF-IGM outperforms the famous TF-IDF and the state-of-the-art supervised term weighting schemes. In addition, some new findings different from previous studies are obtained and analyzed in depth in the paper.
@article{chen_turning_2016,
  author    = {Chen, Kewen and Zhang, Zuping and Long, Jun and Zhang, Hao},
  title     = {Turning from {TF}-{IDF} to {TF}-{IGM} for term weighting in text classification},
  journal   = {Expert Systems with Applications},
  volume    = {66},
  pages     = {245--260},
  year      = {2016},
  month     = dec,
  issn      = {0957-4174},
  doi       = {10.1016/j.eswa.2016.09.009},
  url       = {http://www.sciencedirect.com/science/article/pii/S0957417416304870},
  urldate   = {2019-05-20},
  keywords  = {termgewichtung},
  abstract  = {Massive textual data management and mining usually rely on automatic text classification technology. Term weighting is a basic problem in text classification and directly affects the classification accuracy. Since the traditional TF-IDF (term frequency \& inverse document frequency) is not fully effective for text classification, various alternatives have been proposed by researchers. In this paper we make comparative studies on different term weighting schemes and propose a new term weighting scheme, TF-IGM (term frequency \& inverse gravity moment), as well as its variants. TF-IGM incorporates a new statistical model to precisely measure the class distinguishing power of a term. Particularly, it makes full use of the fine-grained term distribution across different classes of text. The effectiveness of TF-IGM is validated by extensive experiments of text classification using SVM (support vector machine) and kNN (k nearest neighbors) classifiers on three commonly used corpora. The experimental results show that TF-IGM outperforms the famous TF-IDF and the state-of-the-art supervised term weighting schemes. In addition, some new findings different from previous studies are obtained and analyzed in depth in the paper.},
  file      = {ScienceDirect Full Text PDF:C\:\\Users\\klaus\\Zotero\\storage\\2LJTR86G\\Chen et al. - 2016 - Turning from TF-IDF to TF-IGM for term weighting i.pdf:application/pdf;ScienceDirect Snapshot:C\:\\Users\\klaus\\Zotero\\storage\\JNSJGKLV\\S0957417416304870.html:text/html},
  added-at  = {2019-05-21T15:25:32.000+0200},
  timestamp = {2019-05-21T15:25:32.000+0200},
  biburl    = {https://www.bibsonomy.org/bibtex/221cda90090f81809f8bb489884cd968b/lepsky},
  interhash = {433575b2e039f11b35d916850d12ef00},
  intrahash = {21cda90090f81809f8bb489884cd968b},
}