| Authors: |
Moises G. {de Carvalho}
and Marcos Andre Goncalves
and Alberto H. F. Laender
and Altigran S. {da Silva}
|
| URL: |
http://delivery.acm.org/10.1145/1150000/1141760/p41-decarvalho.pdf?key1=1141760&key2=6906456911&coll=GUIDE&dl=GUIDE&CFID=45325455&CFTOKEN=75817203 |
| Tags: |
Deduplication,
Digital
Libraries
algorithms,
genetic
programming,
|
| Abstract: |
Identifying record replicas in digital libraries and
other types of digital repositories is fundamental to
improve the quality of their content and services as
well as to yield eventual sharing efforts. Several
deduplication strategies are available, but most of
them rely on manually chosen settings to combine
evidence used to identify records as being replicas. In
this paper, we present the results of experiments we
have carried out with a novel machine learning approach
we have proposed for the deduplication problem. This
approach, based on genetic programming (GP), is able to
automatically generate similarity functions to identify
record replicas in a given repository. The generated
similarity functions properly combine and weight the
best evidence available among the record fields in
order to tell when two distinct records represent the
same real-world entity. The results of the experiments
show that our approach outperforms the baseline method
by Fellegi and Sunter by more than 12% when
identifying replicas in a data set containing
researcher's personal data, and by more than 7%,
in a data set with article citation data. |
@inproceedings{deCarvalho:2006:JCDL,
  author    = {{de Carvalho}, Moises G. and Goncalves, Marcos Andre and Laender, Alberto H. F. and {da Silva}, Altigran S.},
  title     = {Learning to deduplicate},
  booktitle = {Proceedings of the 6th {ACM/IEEE-CS} Joint Conference on Digital Libraries, {JCDL} '06},
  address   = {Chapel Hill, NC, USA},
  publisher = {IEEE},
  year      = {2006},
  month     = jun,
  pages     = {41--50},
  isbn      = {1-59593-354-9},
  doi       = {10.1145/1141753.1141760},
  url       = {http://delivery.acm.org/10.1145/1150000/1141760/p41-decarvalho.pdf?key1=1141760&key2=6906456911&coll=GUIDE&dl=GUIDE&CFID=45325455&CFTOKEN=75817203},
  keywords  = {Deduplication, Digital Libraries algorithms, genetic programming},
  size      = {10 pages},
  notes     = {Comput. Sci. Dept., Fed. Univ. of Minas Gerais, Belo Horizonte},
  abstract  = {Identifying record replicas in digital libraries and
               other types of digital repositories is fundamental to
               improve the quality of their content and services as
               well as to yield eventual sharing efforts. Several
               deduplication strategies are available, but most of
               them rely on manually chosen settings to combine
               evidence used to identify records as being replicas. In
               this paper, we present the results of experiments we
               have carried out with a novel machine learning approach
               we have proposed for the deduplication problem. This
               approach, based on genetic programming (GP), is able to
               automatically generate similarity functions to identify
               record replicas in a given repository. The generated
               similarity functions properly combine and weight the
               best evidence available among the record fields in
               order to tell when two distinct records represent the
               same real-world entity. The results of the experiments
               show that our approach outperforms the baseline method
               by Fellegi and Sunter by more than 12\% when
               identifying replicas in a data set containing
               researcher's personal data, and by more than 7\%,
               in a data set with article citation data.},
}