├── detect
│   ├── __init__.py
│   ├── attacks.py
│   └── util.py
├── .gitignore
├── data
│   └── README.md
├── requirements.txt
├── scripts
│   ├── train_model.py
│   ├── craft_adv_samples.py
│   └── detect_adv_samples.py
└── README.md
/detect/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | This is an empty folder where models and adversarial samples will be saved. 2 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | scipy 3 | tqdm 4 | sklearn 5 | matplotlib 6 | tensorflow >= 1.4 7 | Keras >= 2.1 8 | cleverhans >= 2.0 9 | -------------------------------------------------------------------------------- /scripts/train_model.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, absolute_import, print_function 2 | 3 | import argparse 4 | 5 | from detect.util import get_data, get_model 6 | 7 | 8 | def main(args): 9 | assert args.dataset in ['mnist', 'cifar', 'svhn'], \ 10 | "dataset parameter must be either 'mnist', 'cifar' or 'svhn'" 11 | print('Data set: %s' % args.dataset) 12 | X_train, Y_train, X_test, Y_test = get_data(args.dataset) 13 | model = get_model(args.dataset) 14 | model.compile( 15 | loss='categorical_crossentropy', 16 | optimizer='adadelta', 17 | metrics=['accuracy'] 18 | ) 19 | model.fit( 20 | X_train, Y_train, 21 | epochs=args.epochs, 22 | batch_size=args.batch_size, 23 | shuffle=True, 24 | verbose=1, 25 | validation_data=(X_test, Y_test) 26 | ) 27 | model.save('../data/model_%s.h5' % args.dataset) 28 | 29 | 30 | if __name__ == "__main__": 31 | parser = argparse.ArgumentParser() 32 | parser.add_argument( 33 | '-d', '--dataset', 34 | help="Dataset to use; either 'mnist', 'cifar' or 'svhn'", 35 | required=True, type=str 36 | ) 37 | parser.add_argument( 38 | '-e', '--epochs', 39 | help="The number of epochs to train for.", 40 | required=False, type=int 41 | ) 42 | parser.add_argument( 43 | '-b', '--batch_size', 44 | help="The batch size to use for training.", 45 | required=False, type=int 46 | ) 47 | parser.set_defaults(epochs=20) 48 | parser.set_defaults(batch_size=128) 49 | args = parser.parse_args() 50 | main(args) 51 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Detecting Adversarial Samples from Artifacts 2 | This repository contains the code for the paper [Detecting 3 | Adversarial Samples from Artifacts](https://arxiv.org/abs/1703.00410) 4 | (Feinman et al., 2017). 5 | 6 | ## Requirements & Setup 7 | This code repository requires Keras >= 2.1 and TensorFlow >= 1.4. Keras must be 8 | configured to use the TensorFlow backend. A full list of requirements can be found 9 | in `requirements.txt`.
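Once the repository has been cloned (see the command below), the listed dependencies can typically be installed with pip; the exact invocation may vary with your Python environment:

    pip install -r requirements.txt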
To install, run the following command to clone the 10 | repository into a folder of your choice: 11 | 12 | git clone https://github.com/rfeinman/detecting-adversarial-samples.git 13 | 14 | On UNIX machines, after cloning this repository, it is 15 | recommended that you add the path to the repository to your `PYTHONPATH` 16 | environment variable to enable imports from any folder: 17 | 18 | export PYTHONPATH="/path/to/detecting-adversarial-samples:$PYTHONPATH" 19 | 20 | 21 | ## Code Structure 22 | The source code is located in the detect/ subfolder, and scripts that users will 23 | run to perform various steps are located in the scripts/ subfolder. An empty 24 | subfolder, data/, is included for storing trained models and adversarial sample 25 | arrays. Instructions for running the code are below. 26 | 27 | ## Running the Code 28 | All of the scripts for running the various parts of the code are located 29 | in the scripts/ subfolder. 30 | 31 | ### 1. Train a new model 32 | To train a new model for a particular data set, simply run 33 | 34 | python train_model.py -d=<dataset> -e=<epochs> 35 | 36 | where `<dataset>` is one of 'mnist', 'cifar' or 'svhn', and `<epochs>` 37 | is an integer indicating the number of epochs to train for. We recommend using 38 | 10 epochs for MNIST, and 60 for each of CIFAR and SVHN. For example, to train 39 | the MNIST model for 10 epochs, we would run 40 | 41 | python train_model.py -d=mnist -e=10 42 | 43 | The model will be trained and saved into the data/ subfolder and named 44 | `model_<dataset>.h5`. An optional batch size parameter is also available, 45 | specified with `-b=<batch_size>`. The default training batch size is 128. 46 | 47 | ### 2. Craft adversarial samples 48 | To craft adversarial samples for a particular data set, you must first 49 | train the model for that data set (details above). Then, simply run 50 | 51 | python craft_adv_samples.py -d=<dataset> -a=<attack> 52 | 53 | where `<dataset>` is the same as above and `<attack>` is one of 'fgsm', 54 | 'jsma', 'bim-a', 'bim-b' or 'all', indicating which method to use to craft 55 | adversarial samples. For example, to craft adversarial samples for the 56 | MNIST model using FGSM, we would run 57 | 58 | python craft_adv_samples.py -d=mnist -a=fgsm 59 | 60 | If 'all' is chosen, all types of adversarial samples will be 61 | generated. Arrays holding the adversarial samples are stored in the data/ 62 | subfolder and named `Adv_<dataset>_<attack>.npy`. An optional batch size 63 | parameter for evaluating adversarial samples is again provided 64 | (`-b=<batch_size>`). The default is 256. 65 | 66 | ### 3. Detect adversarial samples 67 | To run the detection script, you must first train the model and craft 68 | adversarial samples for each data set you would like to use (details above). 69 | Then, simply run 70 | 71 | python detect_adv_samples.py -d=<dataset> -a=<attack> 72 | 73 | where `<dataset>` and `<attack>` are the same as described above. An optional 74 | batch size parameter is again provided (`-b=<batch_size>`). For all of the 75 | adversarial samples provided, an equal number of noisy samples will be generated 76 | and included alongside the original samples as part of the 'negative' class 77 | for the detector. The perturbation size of these noisy samples is determined 78 | based on the average L2 perturbation size of the adversarial samples. Then, 79 | the Bayesian uncertainty and kernel density features will be computed for each 80 | of the normal, noisy and adversarial samples. A logistic regression model is 81 | trained on these features and the detector is built.
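For intuition, this last step boils down to a two-feature logistic regression: every sample contributes one (z-scored) kernel density value and one (z-scored) uncertainty value, and the detector is fit on those pairs. Below is a minimal, self-contained sketch of that final step with synthetic feature values standing in for the real ones computed by `detect_adv_samples.py` (all variable names here are illustrative, not part of the scripts):

    import numpy as np
    from sklearn.linear_model import LogisticRegressionCV
    from sklearn.preprocessing import scale

    rng = np.random.RandomState(0)
    # Stand-ins for per-sample KDE log-densities and MC-dropout variances;
    # negatives = normal + noisy samples, positives = adversarial samples.
    dens_neg, unc_neg = rng.normal(0.5, 1.0, 2000), rng.normal(0.0, 1.0, 2000)
    dens_pos, unc_pos = rng.normal(-0.5, 1.0, 1000), rng.normal(1.0, 1.0, 1000)

    # z-score each feature and stack into an (n_samples, 2) design matrix
    densities = scale(np.concatenate((dens_neg, dens_pos)))
    uncerts = scale(np.concatenate((unc_neg, unc_pos)))
    values = np.stack((densities, uncerts), axis=1)
    labels = np.concatenate((np.zeros(len(dens_neg)), np.ones(len(dens_pos))))

    # The detector is a cross-validated logistic regression on these two features
    detector = LogisticRegressionCV(n_jobs=-1).fit(values, labels)
    probs = detector.predict_proba(values)[:, 1]  # P(sample is adversarial)

In the repository itself this step is handled by the `normalize` and `train_lr` helpers in `detect/util.py`.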
82 | 83 | ## MNIST Demonstration 84 | Here, a simple demonstration is provided of the commands issued to run the full 85 | experiment with MNIST, using the FGSM attack. The following commands are used 86 | to run all 3 steps: 87 | 88 | 1. python train_model.py -d=mnist -e=10 89 | 2. python craft_adv_samples.py -d=mnist -a=fgsm 90 | 3. python detect_adv_samples.py -d=mnist -a=fgsm 91 | -------------------------------------------------------------------------------- /scripts/craft_adv_samples.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, absolute_import, print_function 2 | 3 | import os 4 | import argparse 5 | import numpy as np 6 | import tensorflow as tf 7 | import keras.backend as K 8 | from keras.models import load_model 9 | 10 | from detect.util import get_data 11 | from detect.attacks import (fast_gradient_sign_method, basic_iterative_method, 12 | saliency_map_method) 13 | 14 | # FGSM & BIM attack parameters that were chosen 15 | ATTACK_PARAMS = { 16 | 'mnist': {'eps': 0.300, 'eps_iter': 0.010}, 17 | 'cifar': {'eps': 0.050, 'eps_iter': 0.005}, 18 | 'svhn': {'eps': 0.130, 'eps_iter': 0.010} 19 | } 20 | 21 | 22 | def craft_one_type(sess, model, X, Y, dataset, attack, batch_size): 23 | """ 24 | TODO 25 | :param sess: 26 | :param model: 27 | :param X: 28 | :param Y: 29 | :param dataset: 30 | :param attack: 31 | :param batch_size: 32 | :return: 33 | """ 34 | if attack == 'fgsm': 35 | # FGSM attack 36 | print('Crafting fgsm adversarial samples...') 37 | X_adv = fast_gradient_sign_method( 38 | sess, model, X, Y, eps=ATTACK_PARAMS[dataset]['eps'], clip_min=0., 39 | clip_max=1., batch_size=batch_size 40 | ) 41 | elif attack in ['bim-a', 'bim-b']: 42 | # BIM attack 43 | print('Crafting %s adversarial samples...' % attack) 44 | its, results = basic_iterative_method( 45 | sess, model, X, Y, eps=ATTACK_PARAMS[dataset]['eps'], 46 | eps_iter=ATTACK_PARAMS[dataset]['eps_iter'], clip_min=0., 47 | clip_max=1., batch_size=batch_size 48 | ) 49 | if attack == 'bim-a': 50 | # BIM-A 51 | # For each sample, select the time step where that sample first 52 | # became misclassified 53 | X_adv = np.asarray([results[its[i], i] for i in range(len(Y))]) 54 | else: 55 | # BIM-B 56 | # For each sample, select the very last time step 57 | X_adv = results[-1] 58 | elif attack == 'jsma': 59 | # JSMA attack 60 | print('Crafting jsma adversarial samples. This may take a while...') 61 | X_adv = saliency_map_method( 62 | sess, model, X, Y, theta=1, gamma=0.1, clip_min=0., clip_max=1. 63 | ) 64 | else: 65 | # TODO: CW attack 66 | raise NotImplementedError('CW attack not yet implemented.') 67 | _, acc = model.evaluate(X_adv, Y, batch_size=batch_size, 68 | verbose=0) 69 | print("Model accuracy on the adversarial test set: %0.2f%%" % (100 * acc)) 70 | np.save('../data/Adv_%s_%s.npy' % (args.dataset, args.attack), X_adv) 71 | 72 | 73 | def main(args): 74 | assert args.dataset in ['mnist', 'cifar', 'svhn'], \ 75 | "Dataset parameter must be either 'mnist', 'cifar' or 'svhn'" 76 | assert args.attack in ['fgsm', 'bim-a', 'bim-b', 'jsma', 'cw', 'all'], \ 77 | "Attack parameter must be either 'fgsm', 'bim-a', 'bim-b', " \ 78 | "'jsma' or 'cw'" 79 | assert os.path.isfile('../data/model_%s.h5' % args.dataset), \ 80 | 'model file not found... must first train model using train_model.py.' 81 | print('Dataset: %s. 
Attack: %s' % (args.dataset, args.attack)) 82 | # Create TF session, set it as Keras backend 83 | sess = tf.Session() 84 | K.set_session(sess) 85 | K.set_learning_phase(0) 86 | model = load_model('../data/model_%s.h5' % args.dataset) 87 | _, _, X_test, Y_test = get_data(args.dataset) 88 | _, acc = model.evaluate(X_test, Y_test, batch_size=args.batch_size, 89 | verbose=0) 90 | print("Accuracy on the test set: %0.2f%%" % (100*acc)) 91 | if args.attack == 'all': 92 | # Cycle through all attacks 93 | for attack in ['fgsm', 'bim-a', 'bim-b', 'jsma', 'cw']: 94 | craft_one_type(sess, model, X_test, Y_test, args.dataset, attack, 95 | args.batch_size) 96 | else: 97 | # Craft one specific attack type 98 | craft_one_type(sess, model, X_test, Y_test, args.dataset, args.attack, 99 | args.batch_size) 100 | print('Adversarial samples crafted and saved to data/ subfolder.') 101 | sess.close() 102 | 103 | 104 | if __name__ == "__main__": 105 | parser = argparse.ArgumentParser() 106 | parser.add_argument( 107 | '-d', '--dataset', 108 | help="Dataset to use; either 'mnist', 'cifar' or 'svhn'", 109 | required=True, type=str 110 | ) 111 | parser.add_argument( 112 | '-a', '--attack', 113 | help="Attack to use; either 'fgsm', 'bim-a', 'bim-b', 'jsma', 'cw' " 114 | "or 'all'", 115 | required=True, type=str 116 | ) 117 | parser.add_argument( 118 | '-b', '--batch_size', 119 | help="The batch size to use for training.", 120 | required=False, type=int 121 | ) 122 | parser.set_defaults(batch_size=256) 123 | args = parser.parse_args() 124 | main(args) 125 | -------------------------------------------------------------------------------- /detect/attacks.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, absolute_import, print_function 2 | 3 | from collections import defaultdict 4 | import numpy as np 5 | import tensorflow as tf 6 | from tqdm import tqdm 7 | from cleverhans.utils import other_classes 8 | from cleverhans.utils_tf import batch_eval, model_argmax 9 | from cleverhans.attacks import SaliencyMapMethod 10 | 11 | 12 | def fgsm(x, predictions, eps, clip_min=None, clip_max=None, y=None): 13 | """ 14 | Computes symbolic TF tensor for the adversarial samples. This must 15 | be evaluated with a session.run call. 16 | :param x: the input placeholder 17 | :param predictions: the model's output tensor 18 | :param eps: the epsilon (input variation parameter) 19 | :param clip_min: optional parameter that can be used to set a minimum 20 | value for components of the example returned 21 | :param clip_max: optional parameter that can be used to set a maximum 22 | value for components of the example returned 23 | :param y: the output placeholder. Use None (the default) to avoid the 24 | label leaking effect. 
25 | :return: a tensor for the adversarial example 26 | """ 27 | 28 | # Compute loss 29 | if y is None: 30 | # In this case, use model predictions as ground truth 31 | y = tf.to_float( 32 | tf.equal(predictions, 33 | tf.reduce_max(predictions, 1, keep_dims=True))) 34 | y = y / tf.reduce_sum(y, 1, keep_dims=True) 35 | logits, = predictions.op.inputs 36 | loss = tf.reduce_mean( 37 | tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y) 38 | ) 39 | 40 | # Define gradient of loss wrt input 41 | grad, = tf.gradients(loss, x) 42 | 43 | # Take sign of gradient 44 | signed_grad = tf.sign(grad) 45 | 46 | # Multiply by constant epsilon 47 | scaled_signed_grad = eps * signed_grad 48 | 49 | # Add perturbation to original example to obtain adversarial example 50 | adv_x = tf.stop_gradient(x + scaled_signed_grad) 51 | 52 | # If clipping is needed, reset all values outside of [clip_min, clip_max] 53 | if (clip_min is not None) and (clip_max is not None): 54 | adv_x = tf.clip_by_value(adv_x, clip_min, clip_max) 55 | 56 | return adv_x 57 | 58 | def fast_gradient_sign_method(sess, model, X, Y, eps, clip_min=None, 59 | clip_max=None, batch_size=256): 60 | """ 61 | TODO 62 | :param sess: 63 | :param model: 64 | :param X: 65 | :param Y: 66 | :param eps: 67 | :param clip_min: 68 | :param clip_max: 69 | :param batch_size: 70 | :return: 71 | """ 72 | # Define TF placeholders for the input and output 73 | x = tf.placeholder(tf.float32, shape=(None,) + X.shape[1:]) 74 | y = tf.placeholder(tf.float32, shape=(None,) + Y.shape[1:]) 75 | adv_x = fgsm( 76 | x, model(x), eps=eps, 77 | clip_min=clip_min, 78 | clip_max=clip_max, y=y 79 | ) 80 | X_adv, = batch_eval( 81 | sess, [x, y], [adv_x], 82 | [X, Y], args={'batch_size': batch_size} 83 | ) 84 | 85 | return X_adv 86 | 87 | def basic_iterative_method(sess, model, X, Y, eps, eps_iter, nb_iter=50, 88 | clip_min=None, clip_max=None, batch_size=256): 89 | """ 90 | TODO 91 | :param sess: 92 | :param model: 93 | :param X: 94 | :param Y: 95 | :param eps: 96 | :param eps_iter: 97 | :param nb_iter: 98 | :param clip_min: 99 | :param clip_max: 100 | :param batch_size: 101 | :return: 102 | """ 103 | # Define TF placeholders for the input and output 104 | x = tf.placeholder(tf.float32, shape=(None,)+X.shape[1:]) 105 | y = tf.placeholder(tf.float32, shape=(None,)+Y.shape[1:]) 106 | # results will hold the adversarial inputs at each iteration of BIM; 107 | # thus it will have shape (nb_iter, n_samples, n_rows, n_cols, n_channels) 108 | results = np.zeros((nb_iter, X.shape[0],) + X.shape[1:]) 109 | # Initialize adversarial samples as the original samples, set upper and 110 | # lower bounds 111 | X_adv = X 112 | X_min = X_adv - eps 113 | X_max = X_adv + eps 114 | print('Running BIM iterations...') 115 | # "its" is a dictionary that keeps track of the iteration at which each 116 | # sample becomes misclassified. The default value will be (nb_iter-1), the 117 | # very last iteration. 
118 | def f(val): 119 | return lambda: val 120 | its = defaultdict(f(nb_iter-1)) 121 | # Out keeps track of which samples have already been misclassified 122 | out = set() 123 | for i in tqdm(range(nb_iter)): 124 | adv_x = fgsm( 125 | x, model(x), eps=eps_iter, 126 | clip_min=clip_min, clip_max=clip_max, y=y 127 | ) 128 | X_adv, = batch_eval( 129 | sess, [x, y], [adv_x], 130 | [X_adv, Y], args={'batch_size': batch_size} 131 | ) 132 | X_adv = np.maximum(np.minimum(X_adv, X_max), X_min) 133 | results[i] = X_adv 134 | # check misclassifieds 135 | predictions = model.predict_classes(X_adv, batch_size=512, verbose=0) 136 | misclassifieds = np.where(predictions != Y.argmax(axis=1))[0] 137 | for elt in misclassifieds: 138 | if elt not in out: 139 | its[elt] = i 140 | out.add(elt) 141 | 142 | return its, results 143 | 144 | def saliency_map_method(sess, model, X, Y, theta, gamma, clip_min=None, 145 | clip_max=None): 146 | """ 147 | 148 | :param sess: 149 | :param model: 150 | :param X: 151 | :param Y: 152 | :param theta: 153 | :param gamma: 154 | :param clip_min: 155 | :param clip_max: 156 | :return: 157 | """ 158 | nb_classes = Y.shape[1] 159 | X_adv = np.zeros_like(X) 160 | # Instantiate a SaliencyMapMethod attack object 161 | jsma = SaliencyMapMethod(model, back='tf', sess=sess) 162 | jsma_params = {'theta': theta, 'gamma': gamma, 163 | 'clip_min': clip_min, 'clip_max': clip_max, 164 | 'y_target': None} 165 | for i in tqdm(range(len(X))): 166 | # Get the sample 167 | sample = X[i:(i+1)] 168 | # First, record the current class of the sample 169 | current_class = int(np.argmax(Y[i])) 170 | # Randomly choose a target class 171 | target_class = np.random.choice(other_classes(nb_classes, 172 | current_class)) 173 | # This call runs the Jacobian-based saliency map approach 174 | one_hot_target = np.zeros((1, nb_classes), dtype=np.float32) 175 | one_hot_target[0, target_class] = 1 176 | jsma_params['y_target'] = one_hot_target 177 | X_adv[i] = jsma.generate_np(sample, **jsma_params) 178 | 179 | return X_adv -------------------------------------------------------------------------------- /scripts/detect_adv_samples.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, absolute_import, print_function 2 | 3 | import os 4 | import argparse 5 | import warnings 6 | import numpy as np 7 | from sklearn.neighbors import KernelDensity 8 | from keras.models import load_model 9 | 10 | from detect.util import (get_data, get_noisy_samples, get_mc_predictions, 11 | get_deep_representations, score_samples, normalize, 12 | train_lr, compute_roc) 13 | 14 | # Optimal KDE bandwidths that were determined from CV tuning 15 | BANDWIDTHS = {'mnist': 1.20, 'cifar': 0.26, 'svhn': 1.00} 16 | 17 | 18 | def main(args): 19 | assert args.dataset in ['mnist', 'cifar', 'svhn'], \ 20 | "Dataset parameter must be either 'mnist', 'cifar' or 'svhn'" 21 | assert args.attack in ['fgsm', 'bim-a', 'bim-b', 'jsma', 'cw', 'all'], \ 22 | "Attack parameter must be either 'fgsm', 'bim-a', 'bim-b', " \ 23 | "'jsma' or 'cw'" 24 | assert os.path.isfile('../data/model_%s.h5' % args.dataset), \ 25 | 'model file not found... must first train model using train_model.py.' 26 | assert os.path.isfile('../data/Adv_%s_%s.npy' % 27 | (args.dataset, args.attack)), \ 28 | 'adversarial sample file not found... 
must first craft adversarial ' \ 29 | 'samples using craft_adv_samples.py' 30 | print('Loading the data and model...') 31 | # Load the model 32 | model = load_model('../data/model_%s.h5' % args.dataset) 33 | # Load the dataset 34 | X_train, Y_train, X_test, Y_test = get_data(args.dataset) 35 | # Check attack type, select adversarial and noisy samples accordingly 36 | print('Loading noisy and adversarial samples...') 37 | if args.attack == 'all': 38 | # TODO: implement 'all' option 39 | #X_test_adv = ... 40 | #X_test_noisy = ... 41 | raise NotImplementedError("'All' types detector not yet implemented.") 42 | else: 43 | # Load adversarial samples 44 | X_test_adv = np.load('../data/Adv_%s_%s.npy' % (args.dataset, 45 | args.attack)) 46 | # Craft an equal number of noisy samples 47 | X_test_noisy = get_noisy_samples(X_test, X_test_adv, args.dataset, 48 | args.attack) 49 | # Check model accuracies on each sample type 50 | for s_type, dataset in zip(['normal', 'noisy', 'adversarial'], 51 | [X_test, X_test_noisy, X_test_adv]): 52 | _, acc = model.evaluate(dataset, Y_test, batch_size=args.batch_size, 53 | verbose=0) 54 | print("Model accuracy on the %s test set: %0.2f%%" % 55 | (s_type, 100 * acc)) 56 | # Compute and display average perturbation sizes 57 | if not s_type == 'normal': 58 | l2_diff = np.linalg.norm( 59 | dataset.reshape((len(X_test), -1)) - 60 | X_test.reshape((len(X_test), -1)), 61 | axis=1 62 | ).mean() 63 | print("Average L-2 perturbation size of the %s test set: %0.2f" % 64 | (s_type, l2_diff)) 65 | # Refine the normal, noisy and adversarial sets to only include samples for 66 | # which the original version was correctly classified by the model 67 | preds_test = model.predict_classes(X_test, verbose=0, 68 | batch_size=args.batch_size) 69 | inds_correct = np.where(preds_test == Y_test.argmax(axis=1))[0] 70 | X_test = X_test[inds_correct] 71 | X_test_noisy = X_test_noisy[inds_correct] 72 | X_test_adv = X_test_adv[inds_correct] 73 | 74 | ## Get Bayesian uncertainty scores 75 | print('Getting Monte Carlo dropout variance predictions...') 76 | uncerts_normal = get_mc_predictions(model, X_test, 77 | batch_size=args.batch_size) \ 78 | .var(axis=0).mean(axis=1) 79 | uncerts_noisy = get_mc_predictions(model, X_test_noisy, 80 | batch_size=args.batch_size) \ 81 | .var(axis=0).mean(axis=1) 82 | uncerts_adv = get_mc_predictions(model, X_test_adv, 83 | batch_size=args.batch_size) \ 84 | .var(axis=0).mean(axis=1) 85 | 86 | ## Get KDE scores 87 | # Get deep feature representations 88 | print('Getting deep feature representations...') 89 | X_train_features = get_deep_representations(model, X_train, 90 | batch_size=args.batch_size) 91 | X_test_normal_features = get_deep_representations(model, X_test, 92 | batch_size=args.batch_size) 93 | X_test_noisy_features = get_deep_representations(model, X_test_noisy, 94 | batch_size=args.batch_size) 95 | X_test_adv_features = get_deep_representations(model, X_test_adv, 96 | batch_size=args.batch_size) 97 | # Train one KDE per class 98 | print('Training KDEs...') 99 | class_inds = {} 100 | for i in range(Y_train.shape[1]): 101 | class_inds[i] = np.where(Y_train.argmax(axis=1) == i)[0] 102 | kdes = {} 103 | warnings.warn("Using pre-set kernel bandwidths that were determined " 104 | "optimal for the specific CNN models of the paper. 
If you've " 105 | "changed your model, you'll need to re-optimize the " 106 | "bandwidth.") 107 | for i in range(Y_train.shape[1]): 108 | kdes[i] = KernelDensity(kernel='gaussian', 109 | bandwidth=BANDWIDTHS[args.dataset]) \ 110 | .fit(X_train_features[class_inds[i]]) 111 | # Get model predictions 112 | print('Computing model predictions...') 113 | preds_test_normal = model.predict_classes(X_test, verbose=0, 114 | batch_size=args.batch_size) 115 | preds_test_noisy = model.predict_classes(X_test_noisy, verbose=0, 116 | batch_size=args.batch_size) 117 | preds_test_adv = model.predict_classes(X_test_adv, verbose=0, 118 | batch_size=args.batch_size) 119 | # Get density estimates 120 | print('computing densities...') 121 | densities_normal = score_samples( 122 | kdes, 123 | X_test_normal_features, 124 | preds_test_normal 125 | ) 126 | densities_noisy = score_samples( 127 | kdes, 128 | X_test_noisy_features, 129 | preds_test_noisy 130 | ) 131 | densities_adv = score_samples( 132 | kdes, 133 | X_test_adv_features, 134 | preds_test_adv 135 | ) 136 | 137 | ## Z-score the uncertainty and density values 138 | uncerts_normal_z, uncerts_adv_z, uncerts_noisy_z = normalize( 139 | uncerts_normal, 140 | uncerts_adv, 141 | uncerts_noisy 142 | ) 143 | densities_normal_z, densities_adv_z, densities_noisy_z = normalize( 144 | densities_normal, 145 | densities_adv, 146 | densities_noisy 147 | ) 148 | 149 | ## Build detector 150 | values, labels, lr = train_lr( 151 | densities_pos=densities_adv_z, 152 | densities_neg=np.concatenate((densities_normal_z, densities_noisy_z)), 153 | uncerts_pos=uncerts_adv_z, 154 | uncerts_neg=np.concatenate((uncerts_normal_z, uncerts_noisy_z)) 155 | ) 156 | 157 | ## Evaluate detector 158 | # Compute logistic regression model predictions 159 | probs = lr.predict_proba(values)[:, 1] 160 | # Compute AUC 161 | n_samples = len(X_test) 162 | # The first 2/3 of 'probs' is the negative class (normal and noisy samples), 163 | # and the last 1/3 is the positive class (adversarial samples). 
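    # (Note: these probabilities are computed on the same samples the logistic
    # regression was fit on, so the AUC reported below is an in-sample figure;
    # LogisticRegressionCV's internal CV only tunes the regularization strength.)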
164 | _, _, auc_score = compute_roc( 165 | probs_neg=probs[:2 * n_samples], 166 | probs_pos=probs[2 * n_samples:] 167 | ) 168 | print('Detector ROC-AUC score: %0.4f' % auc_score) 169 | 170 | 171 | if __name__ == "__main__": 172 | parser = argparse.ArgumentParser() 173 | parser.add_argument( 174 | '-d', '--dataset', 175 | help="Dataset to use; either 'mnist', 'cifar' or 'svhn'", 176 | required=True, type=str 177 | ) 178 | parser.add_argument( 179 | '-a', '--attack', 180 | help="Attack to use; either 'fgsm', 'bim-a', 'bim-b', 'jsma' 'cw' " 181 | "or 'all'", 182 | required=True, type=str 183 | ) 184 | parser.add_argument( 185 | '-b', '--batch_size', 186 | help="The batch size to use for training.", 187 | required=False, type=int 188 | ) 189 | parser.set_defaults(batch_size=256) 190 | args = parser.parse_args() 191 | main(args) 192 | -------------------------------------------------------------------------------- /detect/util.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, absolute_import, print_function 2 | 3 | import os 4 | import multiprocessing as mp 5 | from subprocess import call 6 | import warnings 7 | import numpy as np 8 | import scipy.io as sio 9 | from tqdm import tqdm 10 | import matplotlib.pyplot as plt 11 | from sklearn.metrics import roc_curve, auc 12 | from sklearn.linear_model import LogisticRegressionCV 13 | from sklearn.preprocessing import scale 14 | import keras.backend as K 15 | from keras.datasets import mnist, cifar10 16 | from keras.utils import np_utils 17 | from keras.models import Sequential 18 | from keras.layers import Dense, Dropout, Activation, Flatten 19 | from keras.layers import Conv2D, MaxPooling2D 20 | from keras.regularizers import l2 21 | 22 | # Gaussian noise scale sizes that were determined so that the average 23 | # L-2 perturbation size is equal to that of the adversarial samples 24 | STDEVS = { 25 | 'mnist': {'fgsm': 0.310, 'bim-a': 0.128, 'bim-b': 0.265}, 26 | 'cifar': {'fgsm': 0.050, 'bim-a': 0.009, 'bim-b': 0.039}, 27 | 'svhn': {'fgsm': 0.132, 'bim-a': 0.015, 'bim-b': 0.122} 28 | } 29 | # Set random seed 30 | np.random.seed(0) 31 | 32 | 33 | def get_data(dataset='mnist'): 34 | """ 35 | TODO 36 | :param dataset: 37 | :return: 38 | """ 39 | assert dataset in ['mnist', 'cifar', 'svhn'], \ 40 | "dataset parameter must be either 'mnist' 'cifar' or 'svhn'" 41 | if dataset == 'mnist': 42 | # the data, shuffled and split between train and test sets 43 | (X_train, y_train), (X_test, y_test) = mnist.load_data() 44 | # reshape to (n_samples, 28, 28, 1) 45 | X_train = X_train.reshape(-1, 28, 28, 1) 46 | X_test = X_test.reshape(-1, 28, 28, 1) 47 | elif dataset == 'cifar': 48 | # the data, shuffled and split between train and test sets 49 | (X_train, y_train), (X_test, y_test) = cifar10.load_data() 50 | else: 51 | if not os.path.isfile("../data/svhn_train.mat"): 52 | print('Downloading SVHN train set...') 53 | call( 54 | "curl -o ../data/svhn_train.mat " 55 | "http://ufldl.stanford.edu/housenumbers/train_32x32.mat", 56 | shell=True 57 | ) 58 | if not os.path.isfile("../data/svhn_test.mat"): 59 | print('Downloading SVHN test set...') 60 | call( 61 | "curl -o ../data/svhn_test.mat " 62 | "http://ufldl.stanford.edu/housenumbers/test_32x32.mat", 63 | shell=True 64 | ) 65 | train = sio.loadmat('../data/svhn_train.mat') 66 | test = sio.loadmat('../data/svhn_test.mat') 67 | X_train = np.transpose(train['X'], axes=[3, 0, 1, 2]) 68 | X_test = np.transpose(test['X'], axes=[3, 0, 1, 2]) 69 | # 
reshape (n_samples, 1) to (n_samples,) and change 1-index 70 | # to 0-index 71 | y_train = np.reshape(train['y'], (-1,)) - 1 72 | y_test = np.reshape(test['y'], (-1,)) - 1 73 | 74 | # cast pixels to floats, normalize to [0, 1] range 75 | X_train = X_train.astype('float32') 76 | X_test = X_test.astype('float32') 77 | X_train /= 255 78 | X_test /= 255 79 | 80 | # one-hot-encode the labels 81 | Y_train = np_utils.to_categorical(y_train, 10) 82 | Y_test = np_utils.to_categorical(y_test, 10) 83 | 84 | print(X_train.shape) 85 | print(Y_train.shape) 86 | print(X_test.shape) 87 | print(Y_test.shape) 88 | 89 | return X_train, Y_train, X_test, Y_test 90 | 91 | 92 | def get_model(dataset='mnist'): 93 | """ 94 | Takes in a parameter indicating which model type to use ('mnist', 95 | 'cifar' or 'svhn') and returns the appropriate Keras model. 96 | :param dataset: A string indicating which dataset we are building 97 | a model for. 98 | :return: The model; a Keras 'Sequential' instance. 99 | """ 100 | assert dataset in ['mnist', 'cifar', 'svhn'], \ 101 | "dataset parameter must be either 'mnist' 'cifar' or 'svhn'" 102 | if dataset == 'mnist': 103 | # MNIST model 104 | layers = [ 105 | Conv2D(64, (3, 3), padding='valid', input_shape=(28, 28, 1)), 106 | Activation('relu'), 107 | Conv2D(64, (3, 3)), 108 | Activation('relu'), 109 | MaxPooling2D(pool_size=(2, 2)), 110 | Dropout(0.5), 111 | Flatten(), 112 | Dense(128), 113 | Activation('relu'), 114 | Dropout(0.5), 115 | Dense(10), 116 | Activation('softmax') 117 | ] 118 | elif dataset == 'cifar': 119 | # CIFAR-10 model 120 | layers = [ 121 | Conv2D(32, (3, 3), padding='same', input_shape=(32, 32, 3)), 122 | Activation('relu'), 123 | Conv2D(32, (3, 3), padding='same'), 124 | Activation('relu'), 125 | MaxPooling2D(pool_size=(2, 2)), 126 | Conv2D(64, (3, 3), padding='same'), 127 | Activation('relu'), 128 | Conv2D(64, (3, 3), padding='same'), 129 | Activation('relu'), 130 | MaxPooling2D(pool_size=(2, 2)), 131 | Conv2D(128, (3, 3), padding='same'), 132 | Activation('relu'), 133 | Conv2D(128, (3, 3), padding='same'), 134 | Activation('relu'), 135 | MaxPooling2D(pool_size=(2, 2)), 136 | Flatten(), 137 | Dropout(0.5), 138 | Dense(1024, kernel_regularizer=l2(0.01), bias_regularizer=l2(0.01)), 139 | Activation('relu'), 140 | Dropout(0.5), 141 | Dense(512, kernel_regularizer=l2(0.01), bias_regularizer=l2(0.01)), 142 | Activation('relu'), 143 | Dropout(0.5), 144 | Dense(10), 145 | Activation('softmax') 146 | ] 147 | else: 148 | # SVHN model 149 | layers = [ 150 | Conv2D(64, (3, 3), padding='valid', input_shape=(32, 32, 3)), 151 | Activation('relu'), 152 | Conv2D(64, (3, 3)), 153 | Activation('relu'), 154 | MaxPooling2D(pool_size=(2, 2)), 155 | Dropout(0.5), 156 | Flatten(), 157 | Dense(512), 158 | Activation('relu'), 159 | Dropout(0.5), 160 | Dense(128), 161 | Activation('relu'), 162 | Dropout(0.5), 163 | Dense(10), 164 | Activation('softmax') 165 | ] 166 | 167 | model = Sequential() 168 | for layer in layers: 169 | model.add(layer) 170 | 171 | return model 172 | 173 | 174 | def flip(x, nb_diff): 175 | """ 176 | Helper function for get_noisy_samples 177 | :param x: 178 | :param nb_diff: 179 | :return: 180 | """ 181 | original_shape = x.shape 182 | x = np.copy(np.reshape(x, (-1,))) 183 | candidate_inds = np.where(x < 0.99)[0] 184 | assert candidate_inds.shape[0] >= nb_diff 185 | inds = np.random.choice(candidate_inds, nb_diff) 186 | x[inds] = 1. 
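    # Note: np.random.choice samples with replacement by default, so some of the
    # nb_diff draws can repeat and slightly fewer than nb_diff distinct pixels may
    # be flipped; pass replace=False to flip exactly nb_diff pixels.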
187 | 188 | return np.reshape(x, original_shape) 189 | 190 | 191 | def get_noisy_samples(X_test, X_test_adv, dataset, attack): 192 | """ 193 | TODO 194 | :param X_test: 195 | :param X_test_adv: 196 | :param dataset: 197 | :param attack: 198 | :return: 199 | """ 200 | if attack in ['jsma', 'cw']: 201 | X_test_noisy = np.zeros_like(X_test) 202 | for i in range(len(X_test)): 203 | # Count the number of pixels that are different 204 | nb_diff = len(np.where(X_test[i] != X_test_adv[i])[0]) 205 | # Randomly flip an equal number of pixels (flip means move to max 206 | # value of 1) 207 | X_test_noisy[i] = flip(X_test[i], nb_diff) 208 | else: 209 | warnings.warn("Using pre-set Gaussian scale sizes to craft noisy " 210 | "samples. If you've altered the eps/eps-iter parameters " 211 | "of the attacks used, you'll need to update these. In " 212 | "the future, scale sizes will be inferred automatically " 213 | "from the adversarial samples.") 214 | # Add Gaussian noise to the samples 215 | X_test_noisy = np.minimum( 216 | np.maximum( 217 | X_test + np.random.normal(loc=0, scale=STDEVS[dataset][attack], 218 | size=X_test.shape), 219 | 0 220 | ), 221 | 1 222 | ) 223 | 224 | return X_test_noisy 225 | 226 | 227 | def get_mc_predictions(model, X, nb_iter=50, batch_size=256): 228 | """ 229 | TODO 230 | :param model: 231 | :param X: 232 | :param nb_iter: 233 | :param batch_size: 234 | :return: 235 | """ 236 | output_dim = model.layers[-1].output.shape[-1].value 237 | get_output = K.function( 238 | [model.layers[0].input, K.learning_phase()], 239 | [model.layers[-1].output] 240 | ) 241 | 242 | def predict(): 243 | n_batches = int(np.ceil(X.shape[0] / float(batch_size))) 244 | output = np.zeros(shape=(len(X), output_dim)) 245 | for i in range(n_batches): 246 | output[i * batch_size:(i + 1) * batch_size] = \ 247 | get_output([X[i * batch_size:(i + 1) * batch_size], 1])[0] 248 | return output 249 | 250 | preds_mc = [] 251 | for i in tqdm(range(nb_iter)): 252 | preds_mc.append(predict()) 253 | 254 | return np.asarray(preds_mc) 255 | 256 | 257 | def get_deep_representations(model, X, batch_size=256): 258 | """ 259 | TODO 260 | :param model: 261 | :param X: 262 | :param batch_size: 263 | :return: 264 | """ 265 | # last hidden layer is always at index -4 266 | output_dim = model.layers[-4].output.shape[-1].value 267 | get_encoding = K.function( 268 | [model.layers[0].input, K.learning_phase()], 269 | [model.layers[-4].output] 270 | ) 271 | 272 | n_batches = int(np.ceil(X.shape[0] / float(batch_size))) 273 | output = np.zeros(shape=(len(X), output_dim)) 274 | for i in range(n_batches): 275 | output[i * batch_size:(i + 1) * batch_size] = \ 276 | get_encoding([X[i * batch_size:(i + 1) * batch_size], 0])[0] 277 | 278 | return output 279 | 280 | 281 | def score_point(tup): 282 | """ 283 | TODO 284 | :param tup: 285 | :return: 286 | """ 287 | x, kde = tup 288 | 289 | return kde.score_samples(np.reshape(x, (1, -1)))[0] 290 | 291 | 292 | def score_samples(kdes, samples, preds, n_jobs=None): 293 | """ 294 | TODO 295 | :param kdes: 296 | :param samples: 297 | :param preds: 298 | :param n_jobs: 299 | :return: 300 | """ 301 | if n_jobs is not None: 302 | p = mp.Pool(n_jobs) 303 | else: 304 | p = mp.Pool() 305 | results = np.asarray( 306 | p.map( 307 | score_point, 308 | [(x, kdes[i]) for x, i in zip(samples, preds)] 309 | ) 310 | ) 311 | p.close() 312 | p.join() 313 | 314 | return results 315 | 316 | 317 | def normalize(normal, adv, noisy): 318 | """ 319 | TODO 320 | :param normal: 321 | :param adv: 322 | :param noisy: 323 | 
:return: 324 | """ 325 | n_samples = len(normal) 326 | total = scale(np.concatenate((normal, adv, noisy))) 327 | 328 | return total[:n_samples], total[n_samples:2*n_samples], total[2*n_samples:] 329 | 330 | 331 | def train_lr(densities_pos, densities_neg, uncerts_pos, uncerts_neg): 332 | """ 333 | TODO 334 | :param densities_pos: 335 | :param densities_neg: 336 | :param uncerts_pos: 337 | :param uncerts_neg: 338 | :return: 339 | """ 340 | values_neg = np.concatenate( 341 | (densities_neg.reshape((1, -1)), 342 | uncerts_neg.reshape((1, -1))), 343 | axis=0).transpose([1, 0]) 344 | values_pos = np.concatenate( 345 | (densities_pos.reshape((1, -1)), 346 | uncerts_pos.reshape((1, -1))), 347 | axis=0).transpose([1, 0]) 348 | 349 | values = np.concatenate((values_neg, values_pos)) 350 | labels = np.concatenate( 351 | (np.zeros_like(densities_neg), np.ones_like(densities_pos))) 352 | 353 | lr = LogisticRegressionCV(n_jobs=-1).fit(values, labels) 354 | 355 | return values, labels, lr 356 | 357 | 358 | def compute_roc(probs_neg, probs_pos, plot=False): 359 | """ 360 | TODO 361 | :param probs_neg: 362 | :param probs_pos: 363 | :param plot: 364 | :return: 365 | """ 366 | probs = np.concatenate((probs_neg, probs_pos)) 367 | labels = np.concatenate((np.zeros_like(probs_neg), np.ones_like(probs_pos))) 368 | fpr, tpr, _ = roc_curve(labels, probs) 369 | auc_score = auc(fpr, tpr) 370 | if plot: 371 | plt.figure(figsize=(7, 6)) 372 | plt.plot(fpr, tpr, color='blue', 373 | label='ROC (AUC = %0.4f)' % auc_score) 374 | plt.legend(loc='lower right') 375 | plt.title("ROC Curve") 376 | plt.xlabel("FPR") 377 | plt.ylabel("TPR") 378 | plt.show() 379 | 380 | return fpr, tpr, auc_score 381 | --------------------------------------------------------------------------------
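The warning inside `get_noisy_samples` above notes that the Gaussian scale sizes in `STDEVS` should eventually be inferred automatically from the adversarial samples rather than hard-coded. A rough sketch of how that inference could look, relying on the fact that i.i.d. Gaussian noise with standard deviation sigma in d dimensions has an expected L2 norm of roughly sigma * sqrt(d) (this helper is illustrative only and is not part of the repository):

    import numpy as np

    def infer_noise_stdev(X_test, X_test_adv):
        """Choose a Gaussian stdev whose expected L2 norm matches the average
        adversarial perturbation size (hypothetical helper, not in the repo)."""
        diffs = (X_test_adv - X_test).reshape(len(X_test), -1)
        avg_l2 = np.linalg.norm(diffs, axis=1).mean()  # mean L2 perturbation
        d = diffs.shape[1]                             # pixels per sample
        return avg_l2 / np.sqrt(d)                     # E||N(0, s^2 I_d)||_2 ~ s*sqrt(d)

Because `get_noisy_samples` clips the noisy images back to [0, 1], the realized perturbation of the noisy set will be slightly smaller than this estimate, so the match is approximate.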