Deduplication is a key operation in integrating data from multiple sources. The main challenge in this task is designing a function that can resolve when a pair of records refer to the same entity in spite of various data inconsistencies. Most existing systems use hand-coded functions. One way to overcome the tedium of hand-coding is to train a classifier to distinguish between duplicates and non-duplicates. The success of this method critically hinges on being able to provide a covering and challenging set of training pairs that bring out the subtlety of deduplication function. This is non-trivial because it requires manually searching for various data inconsistencies between any two records spread apart in large lists. We present our design of a learning-based deduplication system that uses a novel method of interactively discovering challenging training pairs using active learning. Our experiments on real-life datasets show that active learning significantly reduces the number of instances needed to achieve high accuracy. We investigate various design issues that arise in building a system to provide interactive response, fast convergence, and interpretable output.
%0 Conference Paper
%1 775087
%A Sarawagi, Sunita
%A Bhamidipaty, Anuradha
%B KDD '02: Proceedings of the eighth ACM SIGKDD international conference on Knowledge discovery and data mining
%C New York, NY, USA
%D 2002
%I ACM
%K ActiveLearning Deduplication
%P 269--278
%R http://doi.acm.org/10.1145/775047.775087
%T Interactive deduplication using active learning
%U http://portal.acm.org/citation.cfm?id=775087#
%X Deduplication is a key operation in integrating data from multiple sources. The main challenge in this task is designing a function that can resolve when a pair of records refer to the same entity in spite of various data inconsistencies. Most existing systems use hand-coded functions. One way to overcome the tedium of hand-coding is to train a classifier to distinguish between duplicates and non-duplicates. The success of this method critically hinges on being able to provide a covering and challenging set of training pairs that bring out the subtlety of deduplication function. This is non-trivial because it requires manually searching for various data inconsistencies between any two records spread apart in large lists. We present our design of a learning-based deduplication system that uses a novel method of interactively discovering challenging training pairs using active learning. Our experiments on real-life datasets show that active learning significantly reduces the number of instances needed to achieve high accuracy. We investigate various design issues that arise in building a system to provide interactive response, fast convergence, and interpretable output.
%@ 1-58113-567-X
@inproceedings{775087,
abstract = {Deduplication is a key operation in integrating data from multiple sources. The main challenge in this task is designing a function that can resolve when a pair of records refer to the same entity in spite of various data inconsistencies. Most existing systems use hand-coded functions. One way to overcome the tedium of hand-coding is to train a classifier to distinguish between duplicates and non-duplicates. The success of this method critically hinges on being able to provide a covering and challenging set of training pairs that bring out the subtlety of deduplication function. This is non-trivial because it requires manually searching for various data inconsistencies between any two records spread apart in large lists.We present our design of a learning-based deduplication system that uses a novel method of interactively discovering challenging training pairs using active learning. Our experiments on real-life datasets show that active learning significantly reduces the number of instances needed to achieve high accuracy. We investigate various design issues that arise in building a system to provide interactive response, fast convergence, and interpretable output.},
added-at = {2008-07-01T15:40:12.000+0200},
address = {New York, NY, USA},
author = {Sarawagi, Sunita and Bhamidipaty, Anuradha},
biburl = {https://www.bibsonomy.org/bibtex/20e99f904bab8ee13d9549afe5cc0bf53/fhadiji},
booktitle = {KDD '02: Proceedings of the eighth ACM SIGKDD international conference on Knowledge discovery and data mining},
description = {Interactive deduplication using active learning},
doi = {http://doi.acm.org/10.1145/775047.775087},
interhash = {eee652079391af625630afd9bf584ac7},
intrahash = {0e99f904bab8ee13d9549afe5cc0bf53},
isbn = {1-58113-567-X},
keywords = {ActiveLearning Deduplication},
location = {Edmonton, Alberta, Canada},
pages = {269--278},
publisher = {ACM},
timestamp = {2009-08-19T09:39:31.000+0200},
title = {Interactive deduplication using active learning},
url = {http://portal.acm.org/citation.cfm?id=775087#},
year = 2002
}