A simple and computationally fast procedure is proposed for screening a large number of variables prior to cluster analysis. Each variable is considered in turn, the sample is divided into the two groups that maximise the ratio of between-group to within-group sum of squares for that variable, and the achieved value of this ratio is tested to see if it is significantly greater than what would be expected when partitioning a sample from a single homogeneous population. Those variables that achieve significance are then used in the cluster analysis. It is suggested that significance levels be assessed using a Monte Carlo computational procedure; by assuming within-group normality an analytical approximation is derived, but caution in its use is advocated. Computational details are provided for both the partitioning and the testing. The procedure is applied to several microarray data sets, showing that it can often achieve good results both quickly and simply.
Description
ScienceDirect - Computational Statistics & Data Analysis : A simple method for screening variables before clustering microarray data
%0 Journal Article
%1 Krzanowski20092747
%A Krzanowski, Wojtek J.
%A Hand, David J.
%D 2009
%J Computational Statistics & Data Analysis
%K classification clustering krzanowski multivariate robust screening statistics
%N 7
%P 2747 - 2753
%R 10.1016/j.csda.2009.02.001
%T A simple method for screening variables before clustering microarray data
%U http://www.sciencedirect.com/science/article/pii/S016794730900036X
%V 53
%X A simple and computationally fast procedure is proposed for screening a large number of variables prior to cluster analysis. Each variable is considered in turn, the sample is divided into the two groups that maximise the ratio of between-group to within-group sum of squares for that variable, and the achieved value of this ratio is tested to see if it is significantly greater than what would be expected when partitioning a sample from a single homogeneous population. Those variables that achieve significance are then used in the cluster analysis. It is suggested that significance levels be assessed using a Monte Carlo computational procedure; by assuming within-group normality an analytical approximation is derived, but caution in its use is advocated. Computational details are provided for both the partitioning and the testing. The procedure is applied to several microarray data sets, showing that it can often achieve good results both quickly and simply.
@comment{Journal article by Krzanowski and Hand (2009), Computational
  Statistics and Data Analysis 53(7). Fields added-at, biburl, description,
  interhash, intrahash, keywords and timestamp are bookmarking-export metadata
  that standard bibliography styles ignore.}
@article{Krzanowski20092747,
  author      = {Krzanowski, Wojtek J. and Hand, David J.},
  title       = {A simple method for screening variables before clustering microarray data},
  journal     = {Computational Statistics \& Data Analysis},
  volume      = {53},
  number      = {7},
  pages       = {2747--2753},
  year        = {2009},
  issn        = {0167-9473},
  doi         = {10.1016/j.csda.2009.02.001},
  url         = {http://www.sciencedirect.com/science/article/pii/S016794730900036X},
  abstract    = {A simple and computationally fast procedure is proposed for screening a large number of variables prior to cluster analysis. Each variable is considered in turn, the sample is divided into the two groups that maximise the ratio of between-group to within-group sum of squares for that variable, and the achieved value of this ratio is tested to see if it is significantly greater than what would be expected when partitioning a sample from a single homogeneous population. Those variables that achieve significance are then used in the cluster analysis. It is suggested that significance levels be assessed using a Monte Carlo computational procedure; by assuming within-group normality an analytical approximation is derived, but caution in its use is advocated. Computational details are provided for both the partitioning and the testing. The procedure is applied to several microarray data sets, showing that it can often achieve good results both quickly and simply.},
  keywords    = {classification clustering krzanowski multivariate robust screening statistics},
  description = {ScienceDirect - Computational Statistics \& Data Analysis : A simple method for screening variables before clustering microarray data},
  added-at    = {2011-08-25T12:01:56.000+0200},
  timestamp   = {2011-08-25T12:06:52.000+0200},
  biburl      = {https://www.bibsonomy.org/bibtex/2f195d6ed12aea52123bcc2164b47726d/vivion},
  interhash   = {b132cb2d40448f995e3eb33f77986cb9},
  intrahash   = {f195d6ed12aea52123bcc2164b47726d},
}