Often, in the real world, entities have two or more representations in databases. Duplicate records do not share a common key and/or they contain errors that make duplicate matching a difficult task. Errors are introduced as the result of transcription errors, incomplete information, lack of standard formats or any combination of these factors. In this article, we present a thorough analysis of the literature on duplicate record detection. We cover similarity metrics that are commonly used to detect similar field entries, and we present an extensive set of duplicate detection algorithms that can detect approximately duplicate records in a database. We also cover multiple techniques for improving the efficiency and scalability of approximate duplicate detection algorithms. We conclude with a coverage of existing tools and with a brief discussion of the big open problems in the area.
%0 Journal Article
%1 Elmagarmid2007
%A Elmagarmid, Ahmed K
%A Ipeirotis, Panagiotis G.
%A Verykios, Vassilios S
%C Los Alamitos, CA, USA
%D 2007
%I IEEE INSTITUTE OF ELECTRICAL AND ELECTRONICS
%J IEEE Trans. Knowl. Data Eng.
%K data_cleaning data_deduplication data_integration database_hardening duplicate_detection entity_matching entity_resolution fuzzy_duplicate_detection identity_uncertainty instance_identification name_matching record_linkage phd
%N 1
%P 1--16
%R 10.1109/TKDE.2007.9
%T Duplicate Record Detection: A Survey
%U http://doi.ieeecomputersociety.org/10.1109/TKDE.2007.9
%V 19
%X Often, in the real world, entities have two or more representations in databases. Duplicate records do not share a common key and/or they contain errors that make duplicate matching a difficult task. Errors are introduced as the result of transcription errors, incomplete information, lack of standard formats or any combination of these factors. In this article, we present a thorough analysis of the literature on duplicate record detection. We cover similarity metrics that are commonly used to detect similar field entries, and we present an extensive set of duplicate detection algorithms that can detect approximately duplicate records in a database. We also cover multiple techniques for improving the efficiency and scalability of approximate duplicate detection algorithms. We conclude with a coverage of existing tools and with a brief discussion of the big open problems in the area.
@article{Elmagarmid2007,
  abstract  = {Often, in the real world, entities have two or more representations in databases. Duplicate records do not share a common key and/or they contain errors that make duplicate matching a difficult task. Errors are introduced as the result of transcription errors, incomplete information, lack of standard formats or any combination of these factors. In this article, we present a thorough analysis of the literature on duplicate record detection. We cover similarity metrics that are commonly used to detect similar field entries, and we present an extensive set of duplicate detection algorithms that can detect approximately duplicate records in a database. We also cover multiple techniques for improving the efficiency and scalability of approximate duplicate detection algorithms. We conclude with a coverage of existing tools and with a brief discussion of the big open problems in the area.},
  added-at  = {2013-12-17T09:45:36.000+0100},
  address   = {Los Alamitos, CA, USA},
  author    = {Elmagarmid, Ahmed K. and Ipeirotis, Panagiotis G. and Verykios, Vassilios S.},
  biburl    = {https://www.bibsonomy.org/bibtex/20eba2756e188159f0d2247ddef86bbdc/jullybobble},
  doi       = {10.1109/TKDE.2007.9},
  file      = {:Users/julien.gaugaz/Dropbox/Papers/Mendeley Desktop/2007/Elmagarmid, Ipeirotis, Verykios - 2007 - Duplicate Record Detection A Survey.pdf:pdf},
  interhash = {c8603198a5bd3d2e571462e08f50e12b},
  intrahash = {0eba2756e188159f0d2247ddef86bbdc},
  issn      = {1041-4347},
  journal   = {IEEE Transactions on Knowledge and Data Engineering},
  keywords  = {data_cleaning data_deduplication data_integration database_hardening duplicate_detection entity_matching entity_resolution fuzzy_duplicate_detection identity_uncertainty instance_identification name_matching record_linkage phd},
  month     = jan,
  number    = {1},
  pages     = {1--16},
  publisher = {IEEE Computer Society},
  timestamp = {2014-07-27T15:43:19.000+0200},
  title     = {Duplicate Record Detection: A Survey},
  url       = {http://doi.ieeecomputersociety.org/10.1109/TKDE.2007.9},
  volume    = {19},
  year      = {2007},
}