Deep neural networks can now perform many tasks that were once thought to be
feasible only for humans. Unfortunately, while they reach impressive
performance under standard settings, such networks are known to be susceptible
to adversarial attacks: slight but carefully constructed perturbations of the
input that drastically degrade a network's performance and reduce its
trustworthiness. Here we propose to improve network robustness to input
perturbations via an adversarial training procedure which we call Adversarial
Feature Desensitization (AFD). We augment normal supervised training with an
adversarial game between the embedding network and an additional adversarial
decoder that is trained to discriminate between clean and perturbed inputs
from their high-level embeddings. Our theoretical and empirical results
demonstrate the effectiveness of this approach in learning robust features on
the MNIST, CIFAR10, and CIFAR100 datasets, substantially improving the state
of the art in robust classification against previously observed adversarial
attacks. More importantly, we demonstrate that AFD generalizes better than
previous methods: the learned features remain robust against a broad range of
perturbations, including perturbations not seen during training. These results
indicate that reducing feature sensitivity using adversarial training is a
promising approach for mitigating the problem of adversarial attacks in deep
neural networks.
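To make the described training game concrete, below is a minimal sketch of one
AFD-style update step. It is an illustration under stated assumptions, not the
authors' released implementation: the toy architectures, the Adam optimizers,
all hyperparameters, and the choice of L-inf PGD as the training-time attack
are assumptions made here; only the overall structure (embedding network, task
classifier, and an adversarial decoder that tries to tell clean from perturbed
embeddings while the embedding network tries to fool it) follows the abstract.

    # Sketch of one AFD-style training step, assuming PyTorch. Architectures,
    # optimizers, hyperparameters, and the PGD attack are illustrative
    # assumptions, not the authors' code.
    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Embedding network f: image -> feature vector (toy CNN for 28x28 inputs).
    feature_net = nn.Sequential(
        nn.Conv2d(1, 32, 3, padding=1), nn.ReLU(),
        nn.Conv2d(32, 64, 3, stride=2, padding=1), nn.ReLU(),
        nn.AdaptiveAvgPool2d(1), nn.Flatten(),
    ).to(device)
    classifier = nn.Linear(64, 10).to(device)  # task head c: z -> class logits
    adv_decoder = nn.Sequential(               # decoder d: z -> clean/perturbed logit
        nn.Linear(64, 64), nn.ReLU(), nn.Linear(64, 1),
    ).to(device)

    opt_f = torch.optim.Adam(
        list(feature_net.parameters()) + list(classifier.parameters()), lr=1e-3)
    opt_d = torch.optim.Adam(adv_decoder.parameters(), lr=1e-3)

    def pgd_attack(x, y, eps=0.3, alpha=0.05, steps=10):
        """L-inf PGD on the task loss (one common choice of training attack)."""
        x_adv = (x + torch.empty_like(x).uniform_(-eps, eps)).clamp(0, 1)
        for _ in range(steps):
            x_adv = x_adv.detach().requires_grad_(True)
            loss = F.cross_entropy(classifier(feature_net(x_adv)), y)
            grad, = torch.autograd.grad(loss, x_adv)
            x_adv = x_adv + alpha * grad.sign()
            x_adv = x + (x_adv - x).clamp(-eps, eps)  # project into the eps-ball
            x_adv = x_adv.clamp(0, 1)
        return x_adv.detach()

    def afd_step(x, y):
        x, y = x.to(device), y.to(device)
        x_adv = pgd_attack(x, y)
        clean_lbl = torch.zeros(len(x), 1, device=device)
        adv_lbl = torch.ones(len(x), 1, device=device)

        # (1) Decoder step: discriminate clean vs. perturbed embeddings.
        with torch.no_grad():
            z_clean, z_adv = feature_net(x), feature_net(x_adv)
        d_loss = F.binary_cross_entropy_with_logits(
            adv_decoder(torch.cat([z_clean, z_adv])),
            torch.cat([clean_lbl, adv_lbl]))
        opt_d.zero_grad(); d_loss.backward(); opt_d.step()

        # (2) Embedding step: minimize the task loss on perturbed inputs while
        # fooling the decoder, pushing clean and perturbed embeddings together.
        z_adv = feature_net(x_adv)
        task_loss = F.cross_entropy(classifier(z_adv), y)
        fool_loss = F.binary_cross_entropy_with_logits(adv_decoder(z_adv), clean_lbl)
        opt_f.zero_grad(); (task_loss + fool_loss).backward(); opt_f.step()
        return task_loss.item(), d_loss.item()

    # Smoke test on random data shaped like MNIST.
    x = torch.rand(8, 1, 28, 28); y = torch.randint(0, 10, (8,))
    print(afd_step(x, y))

The alternation mirrors a GAN: the decoder learns to separate clean from
perturbed embeddings, while the embedding network is rewarded for making them
indistinguishable, which is what desensitizes the features.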
@article{bashivan2020adversarial,
author = {Bashivan, Pouya and Richards, Blake and Rish, Irina},
keywords = {adversarial deep-learning},
note = {arXiv:2006.04621; submitted to NeurIPS 2020},
title = {Adversarial Feature Desensitization},
url = {http://arxiv.org/abs/2006.04621},
year = 2020
}