Most research in speeding up text mining involves algorithmic improvements to induction algorithms, and yet for many large scale applications, such as classifying or indexing large document repositories, the time spent extracting word features from texts can itself greatly exceed the initial training time. This paper describes a fast method for text feature extraction that folds together Unicode conversion, forced lowercasing, word boundary detection, and string hash computation. We show empirically that our integer hash features result in classifiers with equivalent statistical performance to those built using string word features, but require far less computation and less memory.
Description
Extremely fast text feature extraction for classification and indexing
%0 Conference Paper
%1 Forman08fastExtraction
%A Forman, George
%A Kirshenbaum, Evan
%B CIKM '08: Proceedings of the 17th ACM conference on Information and knowledge management
%C New York, NY, USA
%D 2008
%I ACM
%K feature-vector hashing indexing machine-learning
%P 1221--1230
%R http://doi.acm.org/10.1145/1458082.1458243
%T Extremely fast text feature extraction for classification and indexing
%U http://portal.acm.org/citation.cfm?id=1458243
%X Most research in speeding up text mining involves algorithmic improvements to induction algorithms, and yet for many large scale applications, such as classifying or indexing large document repositories, the time spent extracting word features from texts can itself greatly exceed the initial training time. This paper describes a fast method for text feature extraction that folds together Unicode conversion, forced lowercasing, word boundary detection, and string hash computation. We show empirically that our integer hash features result in classifiers with equivalent statistical performance to those built using string word features, but require far less computation and less memory.
%@ 978-1-59593-991-3
@inproceedings{Forman08fastExtraction,
  author      = {Forman, George and Kirshenbaum, Evan},
  title       = {Extremely fast text feature extraction for classification and indexing},
  booktitle   = {{CIKM} '08: Proceedings of the 17th {ACM} conference on Information and knowledge management},
  location    = {Napa Valley, California, USA},
  pages       = {1221--1230},
  publisher   = {ACM},
  address     = {New York, NY, USA},
  year        = {2008},
  isbn        = {978-1-59593-991-3},
  doi         = {10.1145/1458082.1458243},
  url         = {http://portal.acm.org/citation.cfm?id=1458243},
  keywords    = {feature-vector hashing indexing machine-learning},
  description = {Extremely fast text feature extraction for classification and indexing},
  abstract    = {Most research in speeding up text mining involves algorithmic improvements to induction algorithms, and yet for many large scale applications, such as classifying or indexing large document repositories, the time spent extracting word features from texts can itself greatly exceed the initial training time. This paper describes a fast method for text feature extraction that folds together Unicode conversion, forced lowercasing, word boundary detection, and string hash computation. We show empirically that our integer hash features result in classifiers with equivalent statistical performance to those built using string word features, but require far less computation and less memory.},
  added-at    = {2012-01-24T10:22:46.000+0100},
  timestamp   = {2012-01-24T10:22:46.000+0100},
  biburl      = {https://www.bibsonomy.org/bibtex/21b6ec91f0fcab665d9dde00579bfdd52/gromgull},
  interhash   = {e636f23fb943af90ef2b2c665e2a5d5f},
  intrahash   = {1b6ec91f0fcab665d9dde00579bfdd52}
}