As machine learning black boxes are increasingly being deployed in domains
such as healthcare and criminal justice, there is growing emphasis on building
tools and techniques for explaining these black boxes in an interpretable
manner. Such explanations are being leveraged by domain experts to diagnose
systematic errors and underlying biases of black boxes. In this paper, we
demonstrate that post hoc explanation techniques that rely on input
perturbations, such as LIME and SHAP, are not reliable. Specifically, we
propose a novel scaffolding technique that effectively hides the biases of any
given classifier by allowing an adversarial entity to craft an arbitrary
desired explanation. Our approach can be used to scaffold any biased classifier
in such a way that its predictions on the input data distribution still remain
biased, but the post hoc explanations of the scaffolded classifier look
innocuous. Using extensive evaluation with multiple real-world datasets
(including COMPAS), we demonstrate how extremely biased (racist) classifiers
crafted by our framework can easily fool popular explanation techniques such as
LIME and SHAP into generating innocuous explanations which do not reflect the
underlying biases.
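
The abstract describes the scaffolding attack only at a high level. Below is a minimal sketch of the underlying idea, not the authors' reference implementation: perturbation-based explainers such as LIME and SHAP query the model on noisy copies of the input that tend to fall off the data manifold, so an adversary can train an in-distribution detector and route real inputs to a biased model while routing perturbed queries to an innocuous one. The feature indices, the Gaussian perturbation scheme, and all class and function names here are illustrative assumptions; only numpy and scikit-learn are assumed.

```python
# Sketch of an adversarial scaffold (illustrative, not the paper's code).
import numpy as np
from sklearn.ensemble import RandomForestClassifier

SENSITIVE = 0   # hypothetical column the biased model keys on (e.g., race)
INNOCUOUS = 1   # hypothetical uncorrelated column the innocuous model keys on

def biased_model(X):
    """Discriminatory rule the adversary actually wants to deploy."""
    return (X[:, SENSITIVE] > 0).astype(int)

def innocuous_model(X):
    """Harmless-looking rule shown to the explainer's perturbed queries."""
    return (X[:, INNOCUOUS] > 0).astype(int)

class AdversarialScaffold:
    """Wraps the two models behind an in-distribution detector."""

    def __init__(self, X_real, n_perturb=5, scale=0.3, seed=0):
        rng = np.random.default_rng(seed)
        # Approximate LIME/SHAP-style queries as noisy copies of real rows,
        # which tend to fall off the data manifold, and train a detector
        # to separate them from the real data.
        X_fake = np.vstack([
            X_real + rng.normal(scale=scale, size=X_real.shape)
            for _ in range(n_perturb)
        ])
        X_det = np.vstack([X_real, X_fake])
        y_det = np.concatenate([np.ones(len(X_real)), np.zeros(len(X_fake))])
        self.detector = RandomForestClassifier(
            n_estimators=100, random_state=seed).fit(X_det, y_det)

    def predict(self, X):
        X = np.asarray(X)
        in_dist = self.detector.predict(X).astype(bool)
        out = innocuous_model(X)                 # what perturbation queries see
        out[in_dist] = biased_model(X[in_dist])  # what real inputs get
        return out
```

Pointing LIME or SHAP at `AdversarialScaffold.predict` would then attribute predictions largely to the innocuous feature, even though every in-distribution input is still scored by the biased rule, which is the behavior the paper demonstrates on datasets such as COMPAS.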
Description
[1911.02508] How can we fool LIME and SHAP? Adversarial Attacks on Post hoc Explanation Methods
%0 Generic
%1 slack2019adversarial
%A Slack, Dylan
%A Hilgard, Sophie
%A Jia, Emily
%A Singh, Sameer
%A Lakkaraju, Himabindu
%D 2019
%K interpretation lime machinelearning
%T How can we fool LIME and SHAP? Adversarial Attacks on Post hoc Explanation Methods
%U http://arxiv.org/abs/1911.02508
@misc{slack2019adversarial,
added-at = {2019-11-08T17:26:32.000+0100},
author = {Slack, Dylan and Hilgard, Sophie and Jia, Emily and Singh, Sameer and Lakkaraju, Himabindu},
biburl = {https://www.bibsonomy.org/bibtex/2b776eb4d168a9d0a8b53b14aed23963e/cpankow},
description = {[1911.02508] How can we fool LIME and SHAP? Adversarial Attacks on Post hoc Explanation Methods},
interhash = {af73025c1f4d02d5388c466d5a1d99f0},
intrahash = {b776eb4d168a9d0a8b53b14aed23963e},
keywords = {interpretation lime machinelearning},
note = {cite arxiv:1911.02508},
timestamp = {2019-11-08T17:26:32.000+0100},
title = {How can we fool LIME and SHAP? Adversarial Attacks on Post hoc Explanation Methods},
url = {http://arxiv.org/abs/1911.02508},
year = 2019
}