A Comparison of String Distance Metrics for Name-Matching Tasks.
W. Cohen, P. Ravikumar, and S. Fienberg. Proceedings of IJCAI-03 Workshop on Information Integration, pages 73--78. (August 2003)
Abstract
Using an open-source, Java toolkit of name-matching methods, we experimentally compare string distance metrics on the task of matching entity names. We investigate a number of different metrics proposed by different communities, including edit-distance metrics, fast heuristic string comparators, token-based distance metrics, and hybrid methods. Overall, the best-performing method is a hybrid scheme combining a TFIDF weighting scheme, which is widely used in information retrieval, with the Jaro-Winkler string-distance scheme, which was developed in the probabilistic record linkage community.
%0 Conference Paper
%1 Cohen2003
%A Cohen, William W.
%A Ravikumar, Pradeep
%A Fienberg, Stephen E.
%B Proceedings of IJCAI-03 Workshop on Information Integration
%D 2003
%K ir visual-information-seeking
%P 73--78
%T A Comparison of String Distance Metrics for Name-Matching Tasks
%X Using an open-source, Java toolkit of name-matching methods, we experimentally compare string distance metrics on the task of matching entity names. We investigate a number of different metrics proposed by different communities, including edit-distance metrics, fast heuristic string comparators, token-based distance metrics, and hybrid methods. Overall, the best-performing method is a hybrid scheme combining a TFIDF weighting scheme, which is widely used in information retrieval, with the Jaro-Winkler string-distance scheme, which was developed in the probabilistic record linkage community.
@comment{Cohen2003: workshop paper comparing string distance metrics for name matching. Bibliographic fields come first, followed by the abstract and the BibSonomy/CiteULike bookkeeping fields, which standard styles ignore.}
@inproceedings{Cohen2003,
  author               = {Cohen, William W. and Ravikumar, Pradeep and Fienberg, Stephen E.},
  title                = {A Comparison of String Distance Metrics for Name-Matching Tasks},
  booktitle            = {Proceedings of {IJCAI-03} Workshop on Information Integration},
  pages                = {73--78},
  month                = aug,
  year                 = {2003},
  abstract             = {Using an open-source, Java toolkit of name-matching methods, we experimentally compare string distance metrics on the task of matching entity names. We investigate a number of different metrics proposed by different communities, including edit-distance metrics, fast heuristic string comparators, token-based distance metrics, and hybrid methods. Overall, the best-performing method is a hybrid scheme combining a TFIDF weighting scheme, which is widely used in information retrieval, with the Jaro-Winkler string-distance scheme, which was developed in the probabilistic record linkage community.},
  keywords             = {ir visual-information-seeking},
  biburl               = {https://www.bibsonomy.org/bibtex/23bb9c395b0e75b41b95947ca73b897f8/lillejul},
  citeulike-article-id = {1467939},
  interhash            = {b918a22c0ac156bcd7114e8361377773},
  intrahash            = {3bb9c395b0e75b41b95947ca73b897f8},
  priority             = {0},
  added-at             = {2009-03-12T15:42:50.000+0100},
  posted-at            = {2007-07-19 22:34:55},
  timestamp            = {2009-03-12T15:42:53.000+0100},
}