Classification of search queries is a complex and computationally challenging task. Typically, search queries are short, reveal very few features per single query and are therefore a weak source for traditional machine learning. In this paper, we present a method that combines limited manual labeling, computational linguistics and information retrieval to classify a large collection of web search queries. A short set of manually chosen terms that are known a priori to be of interest to a particular class is used to cull a small number of actual queries from a commercial search engine log. These queries are then submitted to a commercial search engine and the returned search results are used to find more class related terms. We examine classification proficiency of the proposed method on a large web search engine query log and show that up to 48% of the unlabeled set could be classified using this method. We discuss results of this research and its implications on the advancement of short text classification.
Beschreibung
Using Web Search Logs to Identify Query Classification Terms
%0 Conference Paper
%1 Taksa2007
%A Taksa, Isak
%A Zelikovitz, Sarah
%A Spink, Amanda
%B ITNG '07: Proceedings of the International Conference on Information Technology
%C Washington, DC, USA
%D 2007
%I IEEE Computer Society
%K IUI09 query_log_analysis
%P 469--474
%R 10.1109/ITNG.2007.202
%T Using Web Search Logs to Identify Query Classification Terms
%U http://portal.acm.org/citation.cfm?id=1262257.1262320&coll=Portal&dl=GUIDE&CFID=2150660&CFTOKEN=37817046
%X Classification of search queries is a complex and computationally challenging task. Typically, search queries are short, reveal very few features per single query and are therefore a weak source for traditional machine learning. In this paper, we present a method that combines limited manual labeling, computational linguistics and information retrieval to classify a large collection of web search queries. A short set of manually chosen terms that are known a priori to be of interest to a particular class is used to cull a small number of actual queries from a commercial search engine log. These queries are then submitted to a commercial search engine and the returned search results are used to find more class related terms. We examine classification proficiency of the proposed method on a large web search engine query log and show that up to 48% of the unlabeled set could be classified using this method. We discuss results of this research and its implications on the advancement of short text classification.
%@ 0-7695-2776-0
@comment{Taksa2007: conference paper from the ITNG '07 proceedings; entry exported from BibSonomy (see biburl). The doi field holds the bare DOI without a resolver prefix.}
@inproceedings{Taksa2007,
  author      = {Taksa, Isak and Zelikovitz, Sarah and Spink, Amanda},
  title       = {Using Web Search Logs to Identify Query Classification Terms},
  booktitle   = {ITNG '07: Proceedings of the International Conference on Information Technology},
  pages       = {469--474},
  publisher   = {IEEE Computer Society},
  address     = {Washington, DC, USA},
  year        = {2007},
  isbn        = {0-7695-2776-0},
  doi         = {10.1109/ITNG.2007.202},
  url         = {http://portal.acm.org/citation.cfm?id=1262257.1262320&coll=Portal&dl=GUIDE&CFID=2150660&CFTOKEN=37817046},
  abstract    = {Classification of search queries is a complex and computationally challenging task. Typically, search queries are short, reveal very few features per single query and are therefore a weak source for traditional machine learning. In this paper, we present a method that combines limited manual labeling, computational linguistics and information retrieval to classify a large collection of web search queries. A short set of manually chosen terms that are known a priori to be of interest to a particular class is used to cull a small number of actual queries from a commercial search engine log. These queries are then submitted to a commercial search engine and the returned search results are used to find more class related terms. We examine classification proficiency of the proposed method on a large web search engine query log and show that up to 48\% of the unlabeled set could be classified using this method. We discuss results of this research and its implications on the advancement of short text classification.},
  description = {Using Web Search Logs to Identify Query Classification Terms},
  keywords    = {IUI09 query_log_analysis},
  added-at    = {2008-09-09T13:01:06.000+0200},
  timestamp   = {2008-09-09T13:01:06.000+0200},
  biburl      = {https://www.bibsonomy.org/bibtex/2a3789a42fefa2b2439ead6bcba744ff4/chriskoerner},
  interhash   = {d0d661dffb0e6f5d9f298f9203129e19},
  intrahash   = {a3789a42fefa2b2439ead6bcba744ff4},
}