Often, in the real world, entities have two or more representations
in databases. Duplicate records do not share a common key and/or
they contain errors that make duplicate matching a difficult task.
Errors are introduced as the result of transcription errors, incomplete
information, lack of standard formats, or any combination of these
factors. In this paper, we present a thorough analysis of the literature
on duplicate record detection. We cover similarity metrics that are
commonly used to detect similar field entries, and we present an
extensive set of duplicate detection algorithms that can detect approximately
duplicate records in a database. We also cover multiple techniques
for improving the efficiency and scalability of approximate duplicate
detection algorithms. We conclude with coverage of existing tools
and with a brief discussion of the big open problems in the area.
%0 Journal Article
%1 Elmagarmid2007
%A Elmagarmid, Ahmed K.
%A Ipeirotis, Panagiotis G.
%A Verykios, Vassilios S.
%C Piscataway, NJ, USA
%D 2007
%I IEEE Educational Activities Department
%J IEEE Trans. on Knowl. and Data Eng.
%K imported
%N 1
%P 1--16
%R 10.1109/TKDE.2007.9
%T Duplicate Record Detection: A Survey
%V 19
%X Often, in the real world, entities have two or more representations
in databases. Duplicate records do not share a common key and/or
they contain errors that make duplicate matching a difficult task.
Errors are introduced as the result of transcription errors, incomplete
information, lack of standard formats, or any combination of these
factors. In this paper, we present a thorough analysis of the literature
on duplicate record detection. We cover similarity metrics that are
commonly used to detect similar field entries, and we present an
extensive set of duplicate detection algorithms that can detect approximately
duplicate records in a database. We also cover multiple techniques
for improving the efficiency and scalability of approximate duplicate
detection algorithms. We conclude with coverage of existing tools
and with a brief discussion of the big open problems in the area.
% Survey article on duplicate record detection (record linkage / entity resolution).
% Fixes vs. export junk: bare DOI (no resolver prefix), full journal name instead of
% an ad-hoc abbreviation, and consistently braced field values.
@article{Elmagarmid2007,
  abstract   = {Often, in the real world, entities have two or more representations
                in databases. Duplicate records do not share a common key and/or
                they contain errors that make duplicate matching a difficult task.
                Errors are introduced as the result of transcription errors, incomplete
                information, lack of standard formats, or any combination of these
                factors. In this paper, we present a thorough analysis of the literature
                on duplicate record detection. We cover similarity metrics that are
                commonly used to detect similar field entries, and we present an
                extensive set of duplicate detection algorithms that can detect approximately
                duplicate records in a database. We also cover multiple techniques
                for improving the efficiency and scalability of approximate duplicate
                detection algorithms. We conclude with coverage of existing tools
                and with a brief discussion of the big open problems in the area.},
  added-at   = {2013-08-04T14:35:14.000+0200},
  address    = {Piscataway, NJ, USA},
  author     = {Elmagarmid, Ahmed K. and Ipeirotis, Panagiotis G. and Verykios, Vassilios S.},
  biburl     = {https://www.bibsonomy.org/bibtex/21658695b2bdc7d1bf748046a52eb291c/francesco.k},
  doi        = {10.1109/TKDE.2007.9},
  file       = {:dd-survey22.pdf:PDF},
  interhash  = {c8603198a5bd3d2e571462e08f50e12b},
  intrahash  = {1658695b2bdc7d1bf748046a52eb291c},
  issn       = {1041-4347},
  journal    = {IEEE Transactions on Knowledge and Data Engineering},
  keywords   = {imported},
  number     = {1},
  pages      = {1--16},
  publisher  = {IEEE Educational Activities Department},
  timestamp  = {2013-08-04T14:35:14.000+0200},
  title      = {Duplicate Record Detection: A Survey},
  volume     = {19},
  year       = {2007},
}