We present a generalization of frequent itemsets allowing for the notion of errors in the itemset definition. We motivate the problem and present an efficient algorithm that identifies error-tolerant frequent clusters of items in transactional data (customer-purchase data, web browsing data, text, etc.). The algorithm exploits sparseness of the underlying data to find large groups of items that are correlated over database records (rows). The notion of transaction coverage allows us to extend the algorithm and view it as a fast clustering algorithm for discovering segments of similar transactions in binary sparse data. We evaluate the new algorithm on three real-world applications: clustering high-dimensional data, query selectivity estimation and collaborative filtering. Results show that the algorithm consistently uncovers structure in large sparse databases that other traditional clustering algorithms fail to find.
Description
Efficient discovery of error-tolerant frequent itemsets in high dimensions
%0 Conference Paper
%1 Yang01
%A Yang, Cheng
%A Fayyad, Usama
%A Bradley, Paul S.
%B KDD '01: Proceedings of the seventh ACM SIGKDD international conference on Knowledge discovery and data mining
%C New York, NY, USA
%D 2001
%I ACM
%K faultTolerant frequentSetMining listing patternMining
%P 194--203
%R http://doi.acm.org/10.1145/502512.502539
%T Efficient discovery of error-tolerant frequent itemsets in high dimensions
%U http://portal.acm.org/citation.cfm?id=502512.502539
%X We present a generalization of frequent itemsets allowing for the notion of errors in the itemset definition. We motivate the problem and present an efficient algorithm that identifies error-tolerant frequent clusters of items in transactional data (customer-purchase data, web browsing data, text, etc.). The algorithm exploits sparseness of the underlying data to find large groups of items that are correlated over database records (rows). The notion of transaction coverage allows us to extend the algorithm and view it as a fast clustering algorithm for discovering segments of similar transactions in binary sparse data. We evaluate the new algorithm on three real-world applications: clustering high-dimensional data, query selectivity estimation and collaborative filtering. Results show that the algorithm consistently uncovers structure in large sparse databases that other traditional clustering algorithms fail to find.
%@ 1-58113-391-X
@inproceedings{Yang01,
abstract = {We present a generalization of frequent itemsets allowing for the notion of errors in the itemset definition. We motivate the problem and present an efficient algorithm that identifies error-tolerant frequent clusters of items in transactional data (customer-purchase data, web browsing data, text, etc.). The algorithm exploits sparseness of the underlying data to find large groups of items that are correlated over database records (rows). The notion of transaction coverage allows us to extend the algorithm and view it as a fast clustering algorithm for discovering segments of similar transactions in binary sparse data. We evaluate the new algorithm on three real-world applications: clustering high-dimensional data, query selectivity estimation and collaborative filtering. Results show that the algorithm consistently uncovers structure in large sparse databases that other traditional clustering algorithms fail to find.},
added-at = {2008-09-11T00:35:46.000+0200},
address = {New York, NY, USA},
author = {Yang, Cheng and Fayyad, Usama and Bradley, Paul S.},
biburl = {https://www.bibsonomy.org/bibtex/28322833c022fed1df83753d173b7cc67/mboley},
booktitle = {KDD '01: Proceedings of the seventh ACM SIGKDD international conference on Knowledge discovery and data mining},
description = {Efficient discovery of error-tolerant frequent itemsets in high dimensions},
doi = {http://doi.acm.org/10.1145/502512.502539},
interhash = {a5ae8ffa530d74635fd6e6857d7a63c3},
intrahash = {8322833c022fed1df83753d173b7cc67},
isbn = {1-58113-391-X},
keywords = {faultTolerant frequentSetMining listing patternMining},
location = {San Francisco, California},
pages = {194--203},
publisher = {ACM},
timestamp = {2008-09-11T00:38:18.000+0200},
title = {Efficient discovery of error-tolerant frequent itemsets in high dimensions},
url = {http://portal.acm.org/citation.cfm?id=502512.502539},
year = 2001
}