Near-duplicate Detection by Instance-level Constrained Clustering
H. Yang and J. Callan. Proceedings of the 29th Annual International ACM SIGIR Conference on Research and Development in Information Retrieval, pp. 421--428. New York, NY, USA, ACM, (2006)
DOI: 10.1145/1148170.1148243
Abstract
For the task of near-duplicated document detection, both traditional fingerprinting techniques used in database community and bag-of-word comparison approaches used in information retrieval community are not sufficiently accurate. This is due to the fact that the characteristics of near-duplicated documents are different from that of both "almost-identical" documents in the data cleaning task and "relevant" documents in the search task. This paper presents an instance-level constrained clustering approach for near-duplicate detection. The framework incorporates information such as document attributes and content structure into the clustering process to form near-duplicate clusters. Gathered from several collections of public comments sent to U.S. government agencies on proposed new regulations, the experimental results demonstrate that our approach outperforms other near-duplicate detection algorithms and as about as effective as human assessors.
%0 Conference Paper
%1 citeulike:2295267
%A Yang, Hui
%A Callan, Jamie
%B Proceedings of the 29th Annual International ACM SIGIR Conference on Research and Development in Information Retrieval
%C New York, NY, USA
%D 2006
%I ACM
%K clustering
%P 421--428
%R 10.1145/1148170.1148243
%T Near-duplicate Detection by Instance-level Constrained Clustering
%U http://dx.doi.org/10.1145/1148170.1148243
%X For the task of near-duplicated document detection, both traditional fingerprinting techniques used in database community and bag-of-word comparison approaches used in information retrieval community are not sufficiently accurate. This is due to the fact that the characteristics of near-duplicated documents are different from that of both "almost-identical" documents in the data cleaning task and "relevant" documents in the search task. This paper presents an instance-level constrained clustering approach for near-duplicate detection. The framework incorporates information such as document attributes and content structure into the clustering process to form near-duplicate clusters. Gathered from several collections of public comments sent to U.S. government agencies on proposed new regulations, the experimental results demonstrate that our approach outperforms other near-duplicate detection algorithms and as about as effective as human assessors.
%@ 1-59593-369-7
% Conference-paper record for Yang and Callan (SIGIR 2006); citation key kept exactly as exported.
% The title is single-braced so the bibliography style can apply its own casing.
% NOTE(review): address appears to hold the publisher city and location the conference city -- confirm against the target style.
@inproceedings{citeulike:2295267,
  abstract = {For the task of near-duplicated document detection, both traditional fingerprinting techniques used in database community and bag-of-word comparison approaches used in information retrieval community are not sufficiently accurate. This is due to the fact that the characteristics of near-duplicated documents are different from that of both "almost-identical" documents in the data cleaning task and "relevant" documents in the search task. This paper presents an instance-level constrained clustering approach for near-duplicate detection. The framework incorporates information such as document attributes and content structure into the clustering process to form near-duplicate clusters. Gathered from several collections of public comments sent to U.S. government agencies on proposed new regulations, the experimental results demonstrate that our approach outperforms other near-duplicate detection algorithms and as about as effective as human assessors.},
  added-at = {2018-03-19T12:24:51.000+0100},
  address = {New York, NY, USA},
  author = {Yang, Hui and Callan, Jamie},
  biburl = {https://www.bibsonomy.org/bibtex/2f447fc375c8675f5dc51f570ef52bddc/aho},
  booktitle = {Proceedings of the 29th Annual International ACM SIGIR Conference on Research and Development in Information Retrieval},
  citeulike-article-id = {2295267},
  citeulike-linkout-0 = {http://portal.acm.org/citation.cfm?id=1148243},
  citeulike-linkout-1 = {http://dx.doi.org/10.1145/1148170.1148243},
  doi = {10.1145/1148170.1148243},
  interhash = {0703044e3abd1580680e66f2355813c6},
  intrahash = {f447fc375c8675f5dc51f570ef52bddc},
  isbn = {1-59593-369-7},
  keywords = {clustering},
  location = {Seattle, Washington, USA},
  pages = {421--428},
  posted-at = {2011-01-11 17:15:07},
  priority = {2},
  publisher = {ACM},
  series = {SIGIR '06},
  timestamp = {2018-03-19T12:24:51.000+0100},
  title = {Near-duplicate Detection by Instance-level Constrained Clustering},
  url = {http://dx.doi.org/10.1145/1148170.1148243},
  year = {2006},
}