We propose a simple statistical model for the frequency of occurrence of features in a stream of text. Adoption of this model allows us to use classical significance tests to filter the stream for interesting events. We tested the model by building a system and running it on a news corpus. By a subjective evaluation, the system worked remarkably well: almost all of the groups of identified tokens corresponded to news stories and were appropriately placed in time. A preliminary objective evaluation was also used to measure the quality of the system and it showed some of the weaknesses and the power of our approach.
Description
Extracting significant time varying features from text
%0 Conference Paper
%1 swan99extracting
%A Swan, Russell
%A Allan, James
%B CIKM '99: Proceedings of the eighth international conference on Information and knowledge management
%C New York, NY, USA
%D 1999
%I ACM
%K information_extraction nlp text_mining time_series
%P 38--45
%R 10.1145/319950.319956
%T Extracting significant time varying features from text
%U http://portal.acm.org/citation.cfm?id=319956#
%X We propose a simple statistical model for the frequency of occurrence of features in a stream of text. Adoption of this model allows us to use classical significance tests to filter the stream for interesting events. We tested the model by building a system and running it on a news corpus. By a subjective evaluation, the system worked remarkably well: almost all of the groups of identified tokens corresponded to news stories and were appropriately placed in time. A preliminary objective evaluation was also used to measure the quality of the system and it showed some of the weaknesses and the power of our approach.
%@ 1-58113-146-1
% Swan and Allan, conference paper in the CIKM '99 proceedings (pp. 38--45).
% address  : publisher's city (ACM); the conference city is kept separately in venue.
% doi      : bare DOI without a resolver prefix; styles add the resolver themselves.
% added-at, biburl, interhash, intrahash, timestamp: BibSonomy bookkeeping fields.
@inproceedings{swan99extracting,
  author      = {Swan, Russell and Allan, James},
  title       = {Extracting significant time varying features from text},
  booktitle   = {{CIKM} '99: Proceedings of the eighth international conference on Information and knowledge management},
  pages       = {38--45},
  publisher   = {ACM},
  address     = {New York, NY, USA},
  venue       = {Kansas City, Missouri, United States},
  year        = {1999},
  isbn        = {1-58113-146-1},
  doi         = {10.1145/319950.319956},
  url         = {http://portal.acm.org/citation.cfm?id=319956},
  abstract    = {We propose a simple statistical model for the frequency of occurrence of features in a stream of text. Adoption of this model allows us to use classical significance tests to filter the stream for interesting events. We tested the model by building a system and running it on a news corpus. By a subjective evaluation, the system worked remarkably well: almost all of the groups of identified tokens corresponded to news stories and were appropriately placed in time. A preliminary objective evaluation was also used to measure the quality of the system and it showed some of the weaknesses and the power of our approach.},
  description = {Extracting significant time varying features from text},
  keywords    = {information_extraction nlp text_mining time_series},
  added-at    = {2008-06-10T11:30:55.000+0200},
  timestamp   = {2008-09-30T11:47:41.000+0200},
  biburl      = {https://www.bibsonomy.org/bibtex/262021e5e9c5f56cef8474b9431076005/hkorte},
  interhash   = {948f930ca71b28c690753291b98b0560},
  intrahash   = {62021e5e9c5f56cef8474b9431076005},
}