Deep neural networks have been demonstrated to be vulnerable to backdoor
attacks. Specifically, by injecting a small number of maliciously constructed
inputs into the training set, an adversary is able to plant a backdoor into the
trained model. This backdoor can then be activated during inference by a
backdoor trigger to fully control the model's behavior. While such attacks are
very effective, they crucially rely on the adversary injecting arbitrary inputs
that are---often blatantly---mislabeled. Such samples would raise suspicion
upon human inspection, potentially revealing the attack. Thus, for backdoor
attacks to remain undetected, it is crucial that they maintain
label-consistency---the condition that injected inputs are consistent with
their labels. In this work, we leverage adversarial perturbations and
generative models to execute efficient, yet label-consistent, backdoor attacks.
Our approach is based on injecting inputs that appear plausible, yet are hard
to classify, hence causing the model to rely on the (easier-to-learn) backdoor
trigger.
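
The recipe described in the abstract can be illustrated concretely. The following Python/PyTorch snippet is a minimal sketch, not the authors' exact implementation: it assumes a pretrained surrogate classifier (model), images scaled to [0, 1], and a hypothetical solid-patch trigger. Images that already belong to the target class are made hard to classify with an untargeted PGD perturbation and then stamped with the trigger, while their correct labels are kept, so the poisoned samples remain label-consistent.

import torch
import torch.nn.functional as F

def pgd_perturb(model, x, y, eps=8/255, alpha=2/255, steps=10):
    # Untargeted PGD: raise the surrogate model's loss on x while staying
    # inside an L-infinity ball of radius eps (assumed pixel range [0, 1]).
    x_adv = x.clone().detach()
    for _ in range(steps):
        x_adv.requires_grad_(True)
        loss = F.cross_entropy(model(x_adv), y)
        grad, = torch.autograd.grad(loss, x_adv)
        with torch.no_grad():
            x_adv = x_adv + alpha * grad.sign()       # ascend the loss
            x_adv = x + (x_adv - x).clamp(-eps, eps)  # project back into the ball
            x_adv = x_adv.clamp(0.0, 1.0)
    return x_adv.detach()

def stamp_trigger(x, patch_size=3, value=1.0):
    # Hypothetical trigger: a small solid patch in the bottom-right corner.
    x = x.clone()
    x[..., -patch_size:, -patch_size:] = value
    return x

def poison_target_class(model, images, labels, target_class):
    # Label-consistent poisoning: only images that already belong to the
    # target class are perturbed and stamped; their labels stay unchanged.
    mask = labels == target_class
    poisoned = images.clone()
    if mask.any():
        hard = pgd_perturb(model, images[mask], labels[mask])
        poisoned[mask] = stamp_trigger(hard)
    return poisoned, labels

As the abstract notes, the weakening step can instead use a generative model (for example, interpolation in a GAN's latent space); either way, the natural class signal is suppressed so that the trigger becomes the easiest feature for the trained model to associate with the target class.
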
@article{turner2019labelconsistent,
author = {Turner, Alexander and Tsipras, Dimitris and Madry, Aleksander},
keywords = {adversarial},
journal = {arXiv preprint arXiv:1912.02771},
title = {Label-Consistent Backdoor Attacks},
url = {http://arxiv.org/abs/1912.02771},
year = 2019
}