We consider the problem of determining the structure of clustered data, without prior
knowledge of the number of clusters or any other information about their composition. Data
are represented by a mixture model in which each component corresponds to a different
cluster. Models with varying geometric properties are obtained through Gaussian components
with different parameterizations and cross-cluster constraints. Noise and outliers can
be modeled by adding a Poisson process component. Partitions are determined by the EM
(expectation-maximization) algorithm for maximum likelihood, with initial values from agglomerative
hierarchical clustering.
Models are compared using an approximation to the Bayes factor based on the Bayesian
Information Criterion (BIC); unlike significance tests, this allows comparison of more than
two models at the same time, and removes the restriction that the models compared be
nested. The problems of determining the number of clusters and the clustering method
are solved simultaneously by choosing the best model. Moreover, the EM result provides a
measure of uncertainty about the associated classification of each data point.
Examples are given, showing that this approach can give performance that is much better
than standard procedures, which often fail to identify groups that are either overlapping or
of varying sizes and shapes.
%0 Journal Article
%1 fraley199801
%A Fraley, Chris
%A Raftery, Adrian E.
%D 1998
%I Oxford University Press
%J The Computer Journal
%K analysis cluster model-based
%N 8
%P 578-588
%T How Many Clusters? Which Clustering Method? Answers Via Model-Based Cluster Analysis
%U http://www3.oup.co.uk/computer_journal/hdb/Volume_41/Issue_08/Fraley.pdf
%V 41
%X We consider the problem of determining the structure of clustered data, without prior
knowledge of the number of clusters or any other information about their composition. Data
are represented by a mixture model in which each component corresponds to a different
cluster. Models with varying geometric properties are obtained through Gaussian components
with different parameterizations and cross-cluster constraints. Noise and outliers can
be modeled by adding a Poisson process component. Partitions are determined by the EM
(expectation-maximization) algorithm for maximum likelihood, with initial values from agglomerative
hierarchical clustering.
Models are compared using an approximation to the Bayes factor based on the Bayesian
Information Criterion (BIC); unlike significance tests, this allows comparison of more than
two models at the same time, and removes the restriction that the models compared be
nested. The problems of determining the number of clusters and the clustering method
are solved simultaneously by choosing the best model. Moreover, the EM result provides a
measure of uncertainty about the associated classification of each data point.
Examples are given, showing that this approach can give performance that is much better
than standard procedures, which often fail to identify groups that are either overlapping or
of varying sizes and shapes.
@article{fraley199801,
  abstract  = {We consider the problem of determining the structure of clustered data, without prior
knowledge of the number of clusters or any other information about their composition. Data
are represented by a mixture model in which each component corresponds to a different
cluster. Models with varying geometric properties are obtained through Gaussian components
with different parameterizations and cross-cluster constraints. Noise and outliers can
be modeled by adding a Poisson process component. Partitions are determined by the EM
(expectation-maximization) algorithm for maximum likelihood, with initial values from agglomerative
hierarchical clustering.
Models are compared using an approximation to the Bayes factor based on the Bayesian
Information Criterion (BIC); unlike significance tests, this allows comparison of more than
two models at the same time, and removes the restriction that the models compared be
nested. The problems of determining the number of clusters and the clustering method
are solved simultaneously by choosing the best model. Moreover, the EM result provides a
measure of uncertainty about the associated classification of each data point.
Examples are given, showing that this approach can give performance that is much better
than standard procedures, which often fail to identify groups that are either overlapping or
of varying sizes and shapes.},
  added-at  = {2009-08-12T14:36:19.000+0200},
  author    = {Fraley, Chris and Raftery, Adrian E.},
  biburl    = {https://www.bibsonomy.org/bibtex/2d330ced53c47b9e1e89ea2e016644fb0/neongod},
  doi       = {10.1093/comjnl/41.8.578},
  interhash = {37917204d7d116a040b432704fd5be09},
  intrahash = {d330ced53c47b9e1e89ea2e016644fb0},
  journal   = {The Computer Journal},
  keywords  = {analysis cluster model-based},
  language  = {english},
  number    = {8},
  pages     = {578--588},
  publisher = {Oxford University Press},
  timestamp = {2009-08-12T14:36:19.000+0200},
  title     = {How Many Clusters? Which Clustering Method? Answers Via Model-Based Cluster Analysis},
  url       = {http://www3.oup.co.uk/computer_journal/hdb/Volume_41/Issue_08/Fraley.pdf},
  volume    = {41},
  year      = {1998},
}