Wikidata has been increasingly adopted by many communities for a wide variety
of applications, which demand high-quality knowledge to deliver successful
results. In this paper, we develop a framework to detect and analyze
low-quality statements in Wikidata by shedding light on the current practices
exercised by the community. We explore three indicators of data quality in
Wikidata, based on: 1) community consensus on the currently recorded knowledge,
assuming that statements that have been removed and not added back are
implicitly agreed to be of low quality; 2) statements that have been
deprecated; and 3) constraint violations in the data. We combine these
indicators to detect low-quality statements, revealing challenges with
duplicate entities, missing triples, violated type rules, and taxonomic
distinctions. Our findings complement ongoing efforts by the Wikidata community
to improve data quality, aiming to make it easier for users and editors to find
and correct mistakes.
%0 Generic
%1 shenoy2021study
%A Shenoy, Kartik
%A Ilievski, Filip
%A Garijo, Daniel
%A Schwabe, Daniel
%A Szekely, Pedro
%D 2021
%K data dataset plk quality research wikidata wikipedia
%T A Study of the Quality of Wikidata
%U http://arxiv.org/abs/2107.00156
%X Wikidata has been increasingly adopted by many communities for a wide variety
of applications, which demand high-quality knowledge to deliver successful
results. In this paper, we develop a framework to detect and analyze
low-quality statements in Wikidata by shedding light on the current practices
exercised by the community. We explore three indicators of data quality in
Wikidata, based on: 1) community consensus on the currently recorded knowledge,
assuming that statements that have been removed and not added back are
implicitly agreed to be of low quality; 2) statements that have been
deprecated; and 3) constraint violations in the data. We combine these
indicators to detect low-quality statements, revealing challenges with
duplicate entities, missing triples, violated type rules, and taxonomic
distinctions. Our findings complement ongoing efforts by the Wikidata community
to improve data quality, aiming to make it easier for users and editors to find
and correct mistakes.
@misc{shenoy2021study,
  abstract      = {Wikidata has been increasingly adopted by many communities for a wide variety
of applications, which demand high-quality knowledge to deliver successful
results. In this paper, we develop a framework to detect and analyze
low-quality statements in Wikidata by shedding light on the current practices
exercised by the community. We explore three indicators of data quality in
Wikidata, based on: 1) community consensus on the currently recorded knowledge,
assuming that statements that have been removed and not added back are
implicitly agreed to be of low quality; 2) statements that have been
deprecated; and 3) constraint violations in the data. We combine these
indicators to detect low-quality statements, revealing challenges with
duplicate entities, missing triples, violated type rules, and taxonomic
distinctions. Our findings complement ongoing efforts by the Wikidata community
to improve data quality, aiming to make it easier for users and editors to find
and correct mistakes.},
  added-at      = {2021-07-15T10:19:11.000+0200},
  archiveprefix = {arXiv},
  author        = {Shenoy, Kartik and Ilievski, Filip and Garijo, Daniel and Schwabe, Daniel and Szekely, Pedro},
  biburl        = {https://www.bibsonomy.org/bibtex/2ab7a2ccf0adb549f076a1d1a7bf69a76/jaeschke},
  description   = {[2107.00156v1] A Study of the Quality of Wikidata},
  eprint        = {2107.00156},
  interhash     = {04c55e52b803b8f5eb88f7bda5c17e7a},
  intrahash     = {ab7a2ccf0adb549f076a1d1a7bf69a76},
  keywords      = {data dataset plk quality research wikidata wikipedia},
  timestamp     = {2021-09-28T15:11:16.000+0200},
  title         = {A Study of the Quality of {Wikidata}},
  url           = {http://arxiv.org/abs/2107.00156},
  year          = {2021},
}