Data sets with multiple, heterogeneous feature spaces occur frequently. We present an abstract framework for integrating multiple feature spaces in the k-means clustering algorithm. Our main ideas are (i) to represent each data object as a tuple of multiple feature vectors, (ii) to assign a suitable (and possibly different) distortion measure to each feature space, (iii) to combine distortions on different feature spaces, in a convex fashion, by assigning (possibly) different relative weights to each, (iv) for a fixed weighting, to cluster using the proposed convex k-means algorithm, and (v) to determine the optimal feature weighting to be the one that yields the clustering that simultaneously minimizes the average within-cluster dispersion and maximizes the average between-cluster dispersion along all the feature spaces. Using precision/recall evaluations and known ground truth classifications, we empirically demonstrate the effectiveness of feature weighting in clustering on several different application domains.
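The abstract outlines the algorithm at a high level. Below is a minimal Python sketch of steps (i)-(v), not the authors' implementation: it assumes squared-Euclidean distortion in both feature spaces, finds the convex weight by grid search, and scores each weighting by the product of per-feature-space within/between dispersion ratios, which is one plausible reading of step (v). The names convex_kmeans and dispersion_ratio and the synthetic data are illustrative assumptions.

import numpy as np

def convex_kmeans(X1, X2, k, alpha, n_iter=100, seed=0):
    """Cluster objects represented as tuples (X1[i], X2[i]) under the
    convex distortion alpha*||x1 - c1||^2 + (1 - alpha)*||x2 - c2||^2."""
    rng = np.random.default_rng(seed)
    n = X1.shape[0]
    idx = rng.choice(n, size=k, replace=False)  # random initial centroids
    C1, C2 = X1[idx].copy(), X2[idx].copy()
    labels = np.full(n, -1)
    for _ in range(n_iter):
        # Distortion of every object to every centroid in each feature space.
        d1 = ((X1[:, None, :] - C1[None, :, :]) ** 2).sum(axis=-1)
        d2 = ((X2[:, None, :] - C2[None, :, :]) ** 2).sum(axis=-1)
        new_labels = np.argmin(alpha * d1 + (1.0 - alpha) * d2, axis=1)
        if np.array_equal(new_labels, labels):
            break  # assignments stable: converged
        labels = new_labels
        for j in range(k):
            members = labels == j
            if members.any():  # leave an empty cluster's centroids unchanged
                C1[j] = X1[members].mean(axis=0)
                C2[j] = X2[members].mean(axis=0)
    return labels

def dispersion_ratio(X, labels, k):
    """Average within-cluster over between-cluster dispersion in one
    feature space; smaller means tighter, better-separated clusters."""
    mu = X.mean(axis=0)
    within = between = 0.0
    for j in range(k):
        Xj = X[labels == j]
        if len(Xj) == 0:
            continue
        cj = Xj.mean(axis=0)
        within += ((Xj - cj) ** 2).sum()
        between += len(Xj) * ((cj - mu) ** 2).sum()
    return within / max(between, 1e-12)

# Step (v) as a grid search: score each candidate weighting by the product
# of the per-feature-space dispersion ratios and keep the best one.
rng = np.random.default_rng(1)
X1 = rng.normal(size=(200, 5))  # stand-in for, e.g., word-count features
X2 = rng.normal(size=(200, 3))  # stand-in for, e.g., link features
best_alpha, best_score = None, np.inf
for a in np.linspace(0.1, 0.9, 9):
    labels = convex_kmeans(X1, X2, k=4, alpha=a)
    score = dispersion_ratio(X1, labels, 4) * dispersion_ratio(X2, labels, 4)
    if score < best_score:
        best_alpha, best_score = a, score
print(f"selected weighting alpha = {best_alpha:.2f}")

The framework in the paper permits a different distortion measure per feature space (e.g. a cosine-type distortion for text); in this sketch that would change only the computation of d1/d2 and the corresponding centroid updates.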
@article{Modha2002Data,
author = {Modha, Dharmendra and Spangler, Scott},
journal = {Machine Learning},
keywords = {clustering machine-learning},
title = {Feature Weighting in {k-Means} Clustering},
url = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.23.5686},
volume = 52,
year = 2002
}