MOTIVATION: Duplicate publication impacts the quality of the scientific
corpus, has been difficult to detect, and studies this far have been
limited in scope and size. Using text similarity searches, we were
able to identify signatures of duplicate citations among a body of
abstracts. RESULTS: A sample of 62,213 Medline citations was examined
and a database of manually verified duplicate citations was created
to study author publication behavior. We found that 0.04% of the
citations with no shared authors were highly similar and are thus
potential cases of plagiarism. 1.35% with shared authors were sufficiently
similar to be considered a duplicate. Extrapolating, this would correspond
to 3500 and 117,500 duplicate citations in total, respectively. AVAILABILITY:
eTBLAST, an automated citation matching tool, and Déjà vu,
the duplicate citation database, are freely available at http://invention.swmed.edu/
and http://spore.swmed.edu/dejavu
%0 Journal Article
%1 Errami2008
%A Errami, Mounir
%A Hicks, Justin M
%A Fisher, Wayne
%A Trusty, David
%A Wren, Jonathan D
%A Long, Tara C
%A Garner, Harold R
%D 2008
%J Bioinformatics (Oxford, England)
%K Bibliometrics,MEDLINE,MEDLINE: statistics & numerical data,Medical Subject Headings,Natural Language Processing,Periodicals as Topic,Periodicals as Topic: statistics & numerical data,Plagiarism,Semantics,Vocabulary, Controlled
%N 2
%P 243-249
%R 10.1093/bioinformatics/btm574
%T Déjà vu—a study of duplicate citations in Medline
%U http://www.ncbi.nlm.nih.gov/pubmed/18056062
%V 24
%X MOTIVATION: Duplicate publication impacts the quality of the scientific
corpus, has been difficult to detect, and studies this far have been
limited in scope and size. Using text similarity searches, we were
able to identify signatures of duplicate citations among a body of
abstracts. RESULTS: A sample of 62,213 Medline citations was examined
and a database of manually verified duplicate citations was created
to study author publication behavior. We found that 0.04% of the
citations with no shared authors were highly similar and are thus
potential cases of plagiarism. 1.35% with shared authors were sufficiently
similar to be considered a duplicate. Extrapolating, this would correspond
to 3500 and 117,500 duplicate citations in total, respectively. AVAILABILITY:
eTBLAST, an automated citation matching tool, and Déjà vu,
the duplicate citation database, are freely available at http://invention.swmed.edu/
and http://spore.swmed.edu/dejavu
@comment{Errami2008: journal article on duplicate citations in Medline. The fields added-at, biburl, interhash, intrahash and timestamp are BibSonomy bookkeeping data and are kept verbatim.}
@article{Errami2008,
  author    = {Errami, Mounir and Hicks, Justin M and Fisher, Wayne and Trusty, David and Wren, Jonathan D and Long, Tara C and Garner, Harold R},
  title     = {D{\'e}j{\`a} vu---a study of duplicate citations in {Medline}},
  journal   = {Bioinformatics},
  year      = {2008},
  month     = jan,
  volume    = {24},
  number    = {2},
  pages     = {243--249},
  doi       = {10.1093/bioinformatics/btm574},
  issn      = {1367-4811},
  pmid      = {18056062},
  url       = {http://www.ncbi.nlm.nih.gov/pubmed/18056062},
  keywords  = {Bibliometrics,MEDLINE,MEDLINE: statistics \& numerical data,Medical Subject Headings,Natural Language Processing,Periodicals as Topic,Periodicals as Topic: statistics \& numerical data,Plagiarism,Semantics,Vocabulary, Controlled},
  abstract  = {MOTIVATION: Duplicate publication impacts the quality of the scientific
corpus, has been difficult to detect, and studies this far have been
limited in scope and size. Using text similarity searches, we were
able to identify signatures of duplicate citations among a body of
abstracts. RESULTS: A sample of 62,213 Medline citations was examined
and a database of manually verified duplicate citations was created
to study author publication behavior. We found that 0.04\% of the
citations with no shared authors were highly similar and are thus
potential cases of plagiarism. 1.35\% with shared authors were sufficiently
similar to be considered a duplicate. Extrapolating, this would correspond
to 3500 and 117,500 duplicate citations in total, respectively. AVAILABILITY:
eTBLAST, an automated citation matching tool, and D{\'e}j{\`a} vu,
the duplicate citation database, are freely available at http://invention.swmed.edu/
and http://spore.swmed.edu/dejavu},
  added-at  = {2011-03-27T17:20:41.000+0200},
  timestamp = {2011-03-27T17:20:49.000+0200},
  biburl    = {https://www.bibsonomy.org/bibtex/24c14499ee1cf764ea04103c87cfccfe4/yevb0},
  interhash = {b6e1fcd7d2e4724312d7c11d92874779},
  intrahash = {4c14499ee1cf764ea04103c87cfccfe4}
}