Most data mining algorithms require the setting of many input parameters. Two main dangers of working with parameter-laden algorithms are the following. First, incorrect settings may cause an algorithm to fail in finding the true patterns. Second, a perhaps more insidious problem is that the algorithm may report spurious patterns that do not really exist, or greatly overestimate the significance of the reported patterns. This is especially likely when the user fails to understand the role of parameters in the data mining process. Data mining algorithms should have as few parameters as possible, ideally none. A parameter-free algorithm would limit our ability to impose our prejudices, expectations, and presumptions on the problem at hand, and would let the data itself speak to us. In this work, we show that recent results in bioinformatics and computational theory hold great promise for a parameter-free data-mining paradigm. The results are motivated by observations in Kolmogorov complexity theory. However, as a practical matter, they can be implemented using any off-the-shelf compression algorithm with the addition of just a dozen or so lines of code. We will show that this approach is competitive or superior to the state-of-the-art approaches in anomaly/interestingness detection, classification, and clustering with empirical tests on time series/DNA/text/video datasets.
%0 Conference Paper
%1 1014077
%A Keogh, Eamonn
%A Lonardi, Stefano
%A Ratanamahatana, Chotirat Ann
%B KDD '04: Proceedings of the tenth ACM SIGKDD international conference on Knowledge discovery and data mining
%C New York, NY, USA
%D 2004
%I ACM
%K anomaly-detection compression-distance imported machine-learning zip
%P 206--215
%R http://doi.acm.org/10.1145/1014052.1014077
%T Towards parameter-free data mining
%U http://www.cs.ucr.edu/~eamonn/SIGKDD_2004_long.pdf
%X Most data mining algorithms require the setting of many input parameters. Two main dangers of working with parameter-laden algorithms are the following. First, incorrect settings may cause an algorithm to fail in finding the true patterns. Second, a perhaps more insidious problem is that the algorithm may report spurious patterns that do not really exist, or greatly overestimate the significance of the reported patterns. This is especially likely when the user fails to understand the role of parameters in the data mining process. Data mining algorithms should have as few parameters as possible, ideally none. A parameter-free algorithm would limit our ability to impose our prejudices, expectations, and presumptions on the problem at hand, and would let the data itself speak to us. In this work, we show that recent results in bioinformatics and computational theory hold great promise for a parameter-free data-mining paradigm. The results are motivated by observations in Kolmogorov complexity theory. However, as a practical matter, they can be implemented using any off-the-shelf compression algorithm with the addition of just a dozen or so lines of code. We will show that this approach is competitive or superior to the state-of-the-art approaches in anomaly/interestingness detection, classification, and clustering with empirical tests on time series/DNA/text/video datasets.
%@ 1-58113-888-1
@inproceedings{1014077,
  abstract    = {Most data mining algorithms require the setting of many input parameters. Two main dangers of working with parameter-laden algorithms are the following. First, incorrect settings may cause an algorithm to fail in finding the true patterns. Second, a perhaps more insidious problem is that the algorithm may report spurious patterns that do not really exist, or greatly overestimate the significance of the reported patterns. This is especially likely when the user fails to understand the role of parameters in the data mining process. Data mining algorithms should have as few parameters as possible, ideally none. A parameter-free algorithm would limit our ability to impose our prejudices, expectations, and presumptions on the problem at hand, and would let the data itself speak to us. In this work, we show that recent results in bioinformatics and computational theory hold great promise for a parameter-free data-mining paradigm. The results are motivated by observations in Kolmogorov complexity theory. However, as a practical matter, they can be implemented using any off-the-shelf compression algorithm with the addition of just a dozen or so lines of code. We will show that this approach is competitive or superior to the state-of-the-art approaches in anomaly/interestingness detection, classification, and clustering with empirical tests on time series/DNA/text/video datasets.},
  added-at    = {2009-01-26T19:08:37.000+0100},
  address     = {New York, NY, USA},
  author      = {Keogh, Eamonn and Lonardi, Stefano and Ratanamahatana, Chotirat Ann},
  biburl      = {https://www.bibsonomy.org/bibtex/2ac040da30506021c4aef71dbce9eaac1/gromgull},
  booktitle   = {{KDD} '04: Proceedings of the tenth {ACM} {SIGKDD} international conference on Knowledge discovery and data mining},
  description = {Towards parameter-free data mining},
  doi         = {10.1145/1014052.1014077},
  interhash   = {4bc9dcd553113ec9c103b8252e9d9bed},
  intrahash   = {ac040da30506021c4aef71dbce9eaac1},
  isbn        = {1-58113-888-1},
  keywords    = {anomaly-detection compression-distance imported machine-learning zip},
  location    = {Seattle, WA, USA},
  pages       = {206--215},
  publisher   = {ACM},
  timestamp   = {2009-08-12T12:24:33.000+0200},
  title       = {Towards parameter-free data mining},
  url         = {http://www.cs.ucr.edu/~eamonn/SIGKDD_2004_long.pdf},
  year        = {2004},
}