@article{Zhang1996,
title = {Birch: an efficient data clustering method for very large databases},
author = {Tian Zhang and Raghu Ramakrishnan and Miron Livny},
journal = {ACM SIGMOD Record},
month = {June},
number = {2},
pages = {103--114},
url = {http://www.cs.uiuc.edu/homes/hanj/refs/papers/zhang96.pdf},
volume = {25},
year = {1996},
abstract = {Finding useful patterns in large datasets has attracted considerable
interest recently, and one of the most widely studied problems in
this area is the identification of clusters, or densely populated
regions, in a multi-dimensional dataset. Prior work does not adequately
address the problem of large datasets and minimization of I/O costs.This
paper presents a data clustering method named BIRCH (Balanced Iterative
Reducing and Clustering using Hierarchies), and demonstrates that
it is especially suitable for very large databases. BIRCH incrementally
and dynamically clusters incoming multi-dimensional metric data points
to try to produce the best quality clustering with the available
resources (i.e., available memory and time constraints). BIRCH can
typically find a good clustering with a single scan of the data,
and improve the quality further with a few additional scans. BIRCH
is also the first clustering algorithm proposed in the database area
to handle "noise" (data points that are not part of the underlying
pattern) effectively.We evaluate BIRCH's time/space efficiency, data
input order sensitivity, and clustering quality through several experiments.
We also present a performance comparisons of BIRCH versus CLARANS,
a clustering method proposed recently for large datasets, and show
that BIRCH is consistently superior.},
keywords = {Clustering }
}