As decision-making increasingly relies on machine learning (ML) and (big) data, the issue of fairness in data-driven artificial intelligence systems is receiving increasing attention from both research and industry. A large variety of fairness-aware ML solutions have been proposed which involve fairness-related interventions in the data, learning algorithms, and/or model outputs. However, a vital part of proposing new approaches is evaluating them empirically on benchmark datasets that represent realistic and diverse settings. Therefore, in this paper, we overview real-world datasets used for fairness-aware ML. We focus on tabular data as the most common data representation for fairness-aware ML. We start our analysis by identifying relationships between the different attributes, particularly with respect to protected attributes and class attribute, using a Bayesian network. For a deeper understanding of bias in the datasets, we investigate interesting relationships using exploratory analysis.
%0 Journal Article
%1 Le_Quy_2022
%A Le Quy, Tai
%A Roy, Arjun
%A Iosifidis, Vasileios
%A Zhang, Wenbin
%A Ntoutsi, Eirini
%D 2022
%E Bertino, Elisa
%E Pedrycz, Witold
%I Wiley
%J WIREs Data Mining and Knowledge Discovery
%K l3s myown
%N 3
%R 10.1002/widm.1452
%T A survey on datasets for fairness-aware machine learning
%U https://doi.org/10.1002/widm.1452
%V 12
%X As decision-making increasingly relies on machine learning (ML) and (big) data, the issue of fairness in data-driven artificial intelligence systems is receiving increasing attention from both research and industry. A large variety of fairness-aware ML solutions have been proposed which involve fairness-related interventions in the data, learning algorithms, and/or model outputs. However, a vital part of proposing new approaches is evaluating them empirically on benchmark datasets that represent realistic and diverse settings. Therefore, in this paper, we overview real-world datasets used for fairness-aware ML. We focus on tabular data as the most common data representation for fairness-aware ML. We start our analysis by identifying relationships between the different attributes, particularly with respect to protected attributes and class attribute, using a Bayesian network. For a deeper understanding of bias in the datasets, we investigate interesting relationships using exploratory analysis.
@article{Le_Quy_2022,
  author    = {Le Quy, Tai and Roy, Arjun and Iosifidis, Vasileios and Zhang, Wenbin and Ntoutsi, Eirini},
  title     = {A survey on datasets for fairness-aware machine learning},
  journal   = {WIREs Data Mining and Knowledge Discovery},
  year      = {2022},
  month     = mar,
  volume    = {12},
  number    = {3},
  editor    = {Bertino, Elisa and Pedrycz, Witold},
  publisher = {Wiley},
  issn      = {1942-4795},
  doi       = {10.1002/widm.1452},
  abstract  = {As decision-making increasingly relies on machine learning (ML) and (big) data, the issue of fairness in data-driven artificial intelligence systems is receiving increasing attention from both research and industry. A large variety of fairness-aware ML solutions have been proposed which involve fairness-related interventions in the data, learning algorithms, and/or model outputs. However, a vital part of proposing new approaches is evaluating them empirically on benchmark datasets that represent realistic and diverse settings. Therefore, in this paper, we overview real-world datasets used for fairness-aware ML. We focus on tabular data as the most common data representation for fairness-aware ML. We start our analysis by identifying relationships between the different attributes, particularly with respect to protected attributes and class attribute, using a Bayesian network. For a deeper understanding of bias in the datasets, we investigate interesting relationships using exploratory analysis.},
  keywords  = {l3s myown},
  added-at  = {2022-06-17T05:25:24.000+0200},
  timestamp = {2022-06-17T05:29:40.000+0200},
  biburl    = {https://www.bibsonomy.org/bibtex/2e710483ebdd786841653fcd972bafadd/quytai3985},
  interhash = {1ac135d489af5df35f4e09d3543b8d3a},
  intrahash = {e710483ebdd786841653fcd972bafadd},
}