├── detect
│   ├── __init__.py
│   ├── attacks.py
│   └── util.py
├── .gitignore
├── data
│   └── README.md
├── requirements.txt
├── scripts
│   ├── train_model.py
│   ├── craft_adv_samples.py
│   └── detect_adv_samples.py
└── README.md
/detect/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | This is an empty folder where models and adversarial samples will be saved. 2 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | scipy 3 | tqdm 4 | sklearn 5 | matplotlib 6 | tensorflow >= 1.4 7 | Keras >= 2.1 8 | cleverhans >= 2.0 9 | -------------------------------------------------------------------------------- /scripts/train_model.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, absolute_import, print_function 2 | 3 | import argparse 4 | 5 | from detect.util import get_data, get_model 6 | 7 | 8 | def main(args): 9 | assert args.dataset in ['mnist', 'cifar', 'svhn'], \ 10 | "dataset parameter must be either 'mnist', 'cifar' or 'svhn'" 11 | print('Data set: %s' % args.dataset) 12 | X_train, Y_train, X_test, Y_test = get_data(args.dataset) 13 | model = get_model(args.dataset) 14 | model.compile( 15 | loss='categorical_crossentropy', 16 | optimizer='adadelta', 17 | metrics=['accuracy'] 18 | ) 19 | model.fit( 20 | X_train, Y_train, 21 | epochs=args.epochs, 22 | batch_size=args.batch_size, 23 | shuffle=True, 24 | verbose=1, 25 | validation_data=(X_test, Y_test) 26 | ) 27 | model.save('../data/model_%s.h5' % args.dataset) 28 | 29 | 30 | if __name__ == "__main__": 31 | parser = argparse.ArgumentParser() 32 | parser.add_argument( 33 | '-d', '--dataset', 34 | help="Dataset to use; either 'mnist', 'cifar' or 'svhn'", 35 | required=True, type=str 36 | ) 37 | parser.add_argument( 38 | '-e', '--epochs', 39 | help="The number of epochs to train for.", 40 | required=False, type=int 41 | ) 42 | parser.add_argument( 43 | '-b', '--batch_size', 44 | help="The batch size to use for training.", 45 | required=False, type=int 46 | ) 47 | parser.set_defaults(epochs=20) 48 | parser.set_defaults(batch_size=128) 49 | args = parser.parse_args() 50 | main(args) 51 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Detecting Adversarial Samples from Artifacts 2 | This repository contains the code for the paper [Detecting 3 | Adversarial Samples from Artifacts](https://arxiv.org/abs/1703.00410) 4 | (Feinman et al., 2017). 5 | 6 | ## Requirements & Setup 7 | This code repository requires Keras >= 2.1 and TensorFlow >= 1.4. Keras must be 8 | configured to use the TensorFlow backend. A full list of requirements can be found 9 | in `requirements.txt`.
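Once the repository has been cloned (see the command below), the listed dependencies can typically be installed with pip; the exact invocation may vary with your Python environment:

    pip install -r requirements.txt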
To install, run the following command to clone the 10 | repository into a folder of your choice: 11 | 12 | git clone https://github.com/rfeinman/detecting-adversarial-samples.git 13 | 14 | On UNIX machines, after cloning this repository, it is 15 | recommended that you add the path to the repository to your `PYTHONPATH` 16 | environment variable to enable imports from any folder: 17 | 18 | export PYTHONPATH="/path/to/detecting-adversarial-samples:$PYTHONPATH" 19 | 20 | 21 | ## Code Structure 22 | The source code is located in the detect/ subfolder, and scripts that users will 23 | run to perform various steps are located in the scripts/ subfolder. An empty 24 | subfolder, data/, is included for storing trained models and adversarial sample 25 | arrays. Instructions for running the code are below. 26 | 27 | ## Running the Code 28 | All of the scripts for running the various parts of the code are located 29 | in the scripts/ subfolder. 30 | 31 | ### 1. Train a new model 32 | To train a new model for a particular data set, simply run 33 | 34 | python train_model.py -d=<dataset> -e=<epochs> 35 | 36 | where `<dataset>` is one of 'mnist', 'cifar' or 'svhn', and `<epochs>` 37 | is an integer indicating the number of epochs to train for. We recommend using 38 | 10 epochs for MNIST, and 60 for each of CIFAR and SVHN. For example, to train 39 | the MNIST model for 10 epochs, we would run 40 | 41 | python train_model.py -d=mnist -e=10 42 | 43 | The model will be trained and saved into the data/ subfolder and named 44 | `model_<dataset>.h5`. An optional batch size parameter is also available, 45 | specified with `-b=<batch_size>`. The default training batch size is 128. 46 | 47 | ### 2. Craft adversarial samples 48 | To craft adversarial samples for a particular data set, you must first 49 | train the model for that data set (details above). Then, simply run 50 | 51 | python craft_adv_samples.py -d=<dataset> -a=<attack> 52 | 53 | where `<dataset>` is the same as above and `<attack>` is one of 'fgsm', 54 | 'jsma', 'bim-a', 'bim-b' or 'all', indicating which method to use to craft 55 | adversarial samples. For example, to craft adversarial samples for the 56 | MNIST model using FGSM, we would run 57 | 58 | python craft_adv_samples.py -d=mnist -a=fgsm 59 | 60 | If 'all' is chosen, all types of adversarial samples will be 61 | generated. Arrays holding the adversarial samples are stored in the data/ 62 | subfolder and named `Adv_<dataset>_<attack>.npy`. An optional batch size 63 | parameter for evaluating adversarial samples is again provided 64 | (`-b=<batch_size>`). The default is 256. 65 | 66 | ### 3. Detect adversarial samples 67 | To run the detection script, you must first train the model and craft 68 | adversarial samples for each data set you would like to use (details above). 69 | Then, simply run 70 | 71 | python detect_adv_samples.py -d=<dataset> -a=<attack> 72 | 73 | where `<dataset>` and `<attack>` are the same as described above. An optional 74 | batch size parameter is again provided (`-b=<batch_size>`). For all of the 75 | adversarial samples provided, an equal number of noisy samples will be generated 76 | and included alongside the original samples as part of the 'negative' class 77 | for the detector. The perturbation size of these noisy samples is determined 78 | based on the average L2 perturbation size of the adversarial samples. Then, 79 | the Bayesian uncertainty and kernel density features will be computed for each 80 | of the normal, noisy and adversarial samples. A logistic regression model is 81 | trained on these features and the detector is built.
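For intuition, this last step boils down to a two-feature logistic regression: every sample contributes one (z-scored) kernel density value and one (z-scored) uncertainty value, and the detector is fit on those pairs. Below is a minimal, self-contained sketch of that final step with synthetic feature values standing in for the real ones computed by `detect_adv_samples.py` (all variable names here are illustrative, not part of the scripts):

    import numpy as np
    from sklearn.linear_model import LogisticRegressionCV
    from sklearn.preprocessing import scale

    rng = np.random.RandomState(0)
    # Stand-ins for per-sample KDE log-densities and MC-dropout variances;
    # negatives = normal + noisy samples, positives = adversarial samples.
    dens_neg, unc_neg = rng.normal(0.5, 1.0, 2000), rng.normal(0.0, 1.0, 2000)
    dens_pos, unc_pos = rng.normal(-0.5, 1.0, 1000), rng.normal(1.0, 1.0, 1000)

    # z-score each feature and stack into an (n_samples, 2) design matrix
    densities = scale(np.concatenate((dens_neg, dens_pos)))
    uncerts = scale(np.concatenate((unc_neg, unc_pos)))
    values = np.stack((densities, uncerts), axis=1)
    labels = np.concatenate((np.zeros(len(dens_neg)), np.ones(len(dens_pos))))

    # The detector is a cross-validated logistic regression on these two features
    detector = LogisticRegressionCV(n_jobs=-1).fit(values, labels)
    probs = detector.predict_proba(values)[:, 1]  # P(sample is adversarial)

In the repository itself this step is handled by the `normalize` and `train_lr` helpers in `detect/util.py`.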
82 | 83 | ## MNIST Demonstration 84 | Here, a simple demonstration is provided of the commands issued to run the full 85 | experiment with MNIST, using the FGSM attack. The following commands are used 86 | to run all 3 steps: 87 | 88 | 1. python train_model.py -d=mnist -e=10 89 | 2. python craft_adv_samples.py -d=mnist -a=fgsm 90 | 3. python detect_adv_samples.py -d=mnist -a=fgsm 91 | -------------------------------------------------------------------------------- /scripts/craft_adv_samples.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, absolute_import, print_function 2 | 3 | import os 4 | import argparse 5 | import numpy as np 6 | import tensorflow as tf 7 | import keras.backend as K 8 | from keras.models import load_model 9 | 10 | from detect.util import get_data 11 | from detect.attacks import (fast_gradient_sign_method, basic_iterative_method, 12 | saliency_map_method) 13 | 14 | # FGSM & BIM attack parameters that were chosen 15 | ATTACK_PARAMS = { 16 | 'mnist': {'eps': 0.300, 'eps_iter': 0.010}, 17 | 'cifar': {'eps': 0.050, 'eps_iter': 0.005}, 18 | 'svhn': {'eps': 0.130, 'eps_iter': 0.010} 19 | } 20 | 21 | 22 | def craft_one_type(sess, model, X, Y, dataset, attack, batch_size): 23 | """ 24 | TODO 25 | :param sess: 26 | :param model: 27 | :param X: 28 | :param Y: 29 | :param dataset: 30 | :param attack: 31 | :param batch_size: 32 | :return: 33 | """ 34 | if attack == 'fgsm': 35 | # FGSM attack 36 | print('Crafting fgsm adversarial samples...') 37 | X_adv = fast_gradient_sign_method( 38 | sess, model, X, Y, eps=ATTACK_PARAMS[dataset]['eps'], clip_min=0., 39 | clip_max=1., batch_size=batch_size 40 | ) 41 | elif attack in ['bim-a', 'bim-b']: 42 | # BIM attack 43 | print('Crafting %s adversarial samples...' % attack) 44 | its, results = basic_iterative_method( 45 | sess, model, X, Y, eps=ATTACK_PARAMS[dataset]['eps'], 46 | eps_iter=ATTACK_PARAMS[dataset]['eps_iter'], clip_min=0., 47 | clip_max=1., batch_size=batch_size 48 | ) 49 | if attack == 'bim-a': 50 | # BIM-A 51 | # For each sample, select the time step where that sample first 52 | # became misclassified 53 | X_adv = np.asarray([results[its[i], i] for i in range(len(Y))]) 54 | else: 55 | # BIM-B 56 | # For each sample, select the very last time step 57 | X_adv = results[-1] 58 | elif attack == 'jsma': 59 | # JSMA attack 60 | print('Crafting jsma adversarial samples. This may take a while...') 61 | X_adv = saliency_map_method( 62 | sess, model, X, Y, theta=1, gamma=0.1, clip_min=0., clip_max=1. 63 | ) 64 | else: 65 | # TODO: CW attack 66 | raise NotImplementedError('CW attack not yet implemented.') 67 | _, acc = model.evaluate(X_adv, Y, batch_size=batch_size, 68 | verbose=0) 69 | print("Model accuracy on the adversarial test set: %0.2f%%" % (100 * acc)) 70 | np.save('../data/Adv_%s_%s.npy' % (args.dataset, args.attack), X_adv) 71 | 72 | 73 | def main(args): 74 | assert args.dataset in ['mnist', 'cifar', 'svhn'], \ 75 | "Dataset parameter must be either 'mnist', 'cifar' or 'svhn'" 76 | assert args.attack in ['fgsm', 'bim-a', 'bim-b', 'jsma', 'cw', 'all'], \ 77 | "Attack parameter must be either 'fgsm', 'bim-a', 'bim-b', " \ 78 | "'jsma' or 'cw'" 79 | assert os.path.isfile('../data/model_%s.h5' % args.dataset), \ 80 | 'model file not found... must first train model using train_model.py.' 81 | print('Dataset: %s. 
Attack: %s' % (args.dataset, args.attack)) 82 | # Create TF session, set it as Keras backend 83 | sess = tf.Session() 84 | K.set_session(sess) 85 | K.set_learning_phase(0) 86 | model = load_model('../data/model_%s.h5' % args.dataset) 87 | _, _, X_test, Y_test = get_data(args.dataset) 88 | _, acc = model.evaluate(X_test, Y_test, batch_size=args.batch_size, 89 | verbose=0) 90 | print("Accuracy on the test set: %0.2f%%" % (100*acc)) 91 | if args.attack == 'all': 92 | # Cycle through all attacks 93 | for attack in ['fgsm', 'bim-a', 'bim-b', 'jsma', 'cw']: 94 | craft_one_type(sess, model, X_test, Y_test, args.dataset, attack, 95 | args.batch_size) 96 | else: 97 | # Craft one specific attack type 98 | craft_one_type(sess, model, X_test, Y_test, args.dataset, args.attack, 99 | args.batch_size) 100 | print('Adversarial samples crafted and saved to data/ subfolder.') 101 | sess.close() 102 | 103 | 104 | if __name__ == "__main__": 105 | parser = argparse.ArgumentParser() 106 | parser.add_argument( 107 | '-d', '--dataset', 108 | help="Dataset to use; either 'mnist', 'cifar' or 'svhn'", 109 | required=True, type=str 110 | ) 111 | parser.add_argument( 112 | '-a', '--attack', 113 | help="Attack to use; either 'fgsm', 'bim-a', 'bim-b', 'jsma', 'cw' " 114 | "or 'all'", 115 | required=True, type=str 116 | ) 117 | parser.add_argument( 118 | '-b', '--batch_size', 119 | help="The batch size to use for training.", 120 | required=False, type=int 121 | ) 122 | parser.set_defaults(batch_size=256) 123 | args = parser.parse_args() 124 | main(args) 125 | -------------------------------------------------------------------------------- /detect/attacks.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, absolute_import, print_function 2 | 3 | from collections import defaultdict 4 | import numpy as np 5 | import tensorflow as tf 6 | from tqdm import tqdm 7 | from cleverhans.utils import other_classes 8 | from cleverhans.utils_tf import batch_eval, model_argmax 9 | from cleverhans.attacks import SaliencyMapMethod 10 | 11 | 12 | def fgsm(x, predictions, eps, clip_min=None, clip_max=None, y=None): 13 | """ 14 | Computes symbolic TF tensor for the adversarial samples. This must 15 | be evaluated with a session.run call. 16 | :param x: the input placeholder 17 | :param predictions: the model's output tensor 18 | :param eps: the epsilon (input variation parameter) 19 | :param clip_min: optional parameter that can be used to set a minimum 20 | value for components of the example returned 21 | :param clip_max: optional parameter that can be used to set a maximum 22 | value for components of the example returned 23 | :param y: the output placeholder. Use None (the default) to avoid the 24 | label leaking effect. 
25 | :return: a tensor for the adversarial example 26 | """ 27 | 28 | # Compute loss 29 | if y is None: 30 | # In this case, use model predictions as ground truth 31 | y = tf.to_float( 32 | tf.equal(predictions, 33 | tf.reduce_max(predictions, 1, keep_dims=True))) 34 | y = y / tf.reduce_sum(y, 1, keep_dims=True) 35 | logits, = predictions.op.inputs 36 | loss = tf.reduce_mean( 37 | tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y) 38 | ) 39 | 40 | # Define gradient of loss wrt input 41 | grad, = tf.gradients(loss, x) 42 | 43 | # Take sign of gradient 44 | signed_grad = tf.sign(grad) 45 | 46 | # Multiply by constant epsilon 47 | scaled_signed_grad = eps * signed_grad 48 | 49 | # Add perturbation to original example to obtain adversarial example 50 | adv_x = tf.stop_gradient(x + scaled_signed_grad) 51 | 52 | # If clipping is needed, reset all values outside of [clip_min, clip_max] 53 | if (clip_min is not None) and (clip_max is not None): 54 | adv_x = tf.clip_by_value(adv_x, clip_min, clip_max) 55 | 56 | return adv_x 57 | 58 | def fast_gradient_sign_method(sess, model, X, Y, eps, clip_min=None, 59 | clip_max=None, batch_size=256): 60 | """ 61 | TODO 62 | :param sess: 63 | :param model: 64 | :param X: 65 | :param Y: 66 | :param eps: 67 | :param clip_min: 68 | :param clip_max: 69 | :param batch_size: 70 | :return: 71 | """ 72 | # Define TF placeholders for the input and output 73 | x = tf.placeholder(tf.float32, shape=(None,) + X.shape[1:]) 74 | y = tf.placeholder(tf.float32, shape=(None,) + Y.shape[1:]) 75 | adv_x = fgsm( 76 | x, model(x), eps=eps, 77 | clip_min=clip_min, 78 | clip_max=clip_max, y=y 79 | ) 80 | X_adv, = batch_eval( 81 | sess, [x, y], [adv_x], 82 | [X, Y], args={'batch_size': batch_size} 83 | ) 84 | 85 | return X_adv 86 | 87 | def basic_iterative_method(sess, model, X, Y, eps, eps_iter, nb_iter=50, 88 | clip_min=None, clip_max=None, batch_size=256): 89 | """ 90 | TODO 91 | :param sess: 92 | :param model: 93 | :param X: 94 | :param Y: 95 | :param eps: 96 | :param eps_iter: 97 | :param nb_iter: 98 | :param clip_min: 99 | :param clip_max: 100 | :param batch_size: 101 | :return: 102 | """ 103 | # Define TF placeholders for the input and output 104 | x = tf.placeholder(tf.float32, shape=(None,)+X.shape[1:]) 105 | y = tf.placeholder(tf.float32, shape=(None,)+Y.shape[1:]) 106 | # results will hold the adversarial inputs at each iteration of BIM; 107 | # thus it will have shape (nb_iter, n_samples, n_rows, n_cols, n_channels) 108 | results = np.zeros((nb_iter, X.shape[0],) + X.shape[1:]) 109 | # Initialize adversarial samples as the original samples, set upper and 110 | # lower bounds 111 | X_adv = X 112 | X_min = X_adv - eps 113 | X_max = X_adv + eps 114 | print('Running BIM iterations...') 115 | # "its" is a dictionary that keeps track of the iteration at which each 116 | # sample becomes misclassified. The default value will be (nb_iter-1), the 117 | # very last iteration. 
118 | def f(val): 119 | return lambda: val 120 | its = defaultdict(f(nb_iter-1)) 121 | # Out keeps track of which samples have already been misclassified 122 | out = set() 123 | for i in tqdm(range(nb_iter)): 124 | adv_x = fgsm( 125 | x, model(x), eps=eps_iter, 126 | clip_min=clip_min, clip_max=clip_max, y=y 127 | ) 128 | X_adv, = batch_eval( 129 | sess, [x, y], [adv_x], 130 | [X_adv, Y], args={'batch_size': batch_size} 131 | ) 132 | X_adv = np.maximum(np.minimum(X_adv, X_max), X_min) 133 | results[i] = X_adv 134 | # check misclassifieds 135 | predictions = model.predict_classes(X_adv, batch_size=512, verbose=0) 136 | misclassifieds = np.where(predictions != Y.argmax(axis=1))[0] 137 | for elt in misclassifieds: 138 | if elt not in out: 139 | its[elt] = i 140 | out.add(elt) 141 | 142 | return its, results 143 | 144 | def saliency_map_method(sess, model, X, Y, theta, gamma, clip_min=None, 145 | clip_max=None): 146 | """ 147 | 148 | :param sess: 149 | :param model: 150 | :param X: 151 | :param Y: 152 | :param theta: 153 | :param gamma: 154 | :param clip_min: 155 | :param clip_max: 156 | :return: 157 | """ 158 | nb_classes = Y.shape[1] 159 | X_adv = np.zeros_like(X) 160 | # Instantiate a SaliencyMapMethod attack object 161 | jsma = SaliencyMapMethod(model, back='tf', sess=sess) 162 | jsma_params = {'theta': theta, 'gamma': gamma, 163 | 'clip_min': clip_min, 'clip_max': clip_max, 164 | 'y_target': None} 165 | for i in tqdm(range(len(X))): 166 | # Get the sample 167 | sample = X[i:(i+1)] 168 | # First, record the current class of the sample 169 | current_class = int(np.argmax(Y[i])) 170 | # Randomly choose a target class 171 | target_class = np.random.choice(other_classes(nb_classes, 172 | current_class)) 173 | # This call runs the Jacobian-based saliency map approach 174 | one_hot_target = np.zeros((1, nb_classes), dtype=np.float32) 175 | one_hot_target[0, target_class] = 1 176 | jsma_params['y_target'] = one_hot_target 177 | X_adv[i] = jsma.generate_np(sample, **jsma_params) 178 | 179 | return X_adv -------------------------------------------------------------------------------- /scripts/detect_adv_samples.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, absolute_import, print_function 2 | 3 | import os 4 | import argparse 5 | import warnings 6 | import numpy as np 7 | from sklearn.neighbors import KernelDensity 8 | from keras.models import load_model 9 | 10 | from detect.util import (get_data, get_noisy_samples, get_mc_predictions, 11 | get_deep_representations, score_samples, normalize, 12 | train_lr, compute_roc) 13 | 14 | # Optimal KDE bandwidths that were determined from CV tuning 15 | BANDWIDTHS = {'mnist': 1.20, 'cifar': 0.26, 'svhn': 1.00} 16 | 17 | 18 | def main(args): 19 | assert args.dataset in ['mnist', 'cifar', 'svhn'], \ 20 | "Dataset parameter must be either 'mnist', 'cifar' or 'svhn'" 21 | assert args.attack in ['fgsm', 'bim-a', 'bim-b', 'jsma', 'cw', 'all'], \ 22 | "Attack parameter must be either 'fgsm', 'bim-a', 'bim-b', " \ 23 | "'jsma' or 'cw'" 24 | assert os.path.isfile('../data/model_%s.h5' % args.dataset), \ 25 | 'model file not found... must first train model using train_model.py.' 26 | assert os.path.isfile('../data/Adv_%s_%s.npy' % 27 | (args.dataset, args.attack)), \ 28 | 'adversarial sample file not found... 
must first craft adversarial ' \ 29 | 'samples using craft_adv_samples.py' 30 | print('Loading the data and model...') 31 | # Load the model 32 | model = load_model('../data/model_%s.h5' % args.dataset) 33 | # Load the dataset 34 | X_train, Y_train, X_test, Y_test = get_data(args.dataset) 35 | # Check attack type, select adversarial and noisy samples accordingly 36 | print('Loading noisy and adversarial samples...') 37 | if args.attack == 'all': 38 | # TODO: implement 'all' option 39 | #X_test_adv = ... 40 | #X_test_noisy = ... 41 | raise NotImplementedError("'All' types detector not yet implemented.") 42 | else: 43 | # Load adversarial samples 44 | X_test_adv = np.load('../data/Adv_%s_%s.npy' % (args.dataset, 45 | args.attack)) 46 | # Craft an equal number of noisy samples 47 | X_test_noisy = get_noisy_samples(X_test, X_test_adv, args.dataset, 48 | args.attack) 49 | # Check model accuracies on each sample type 50 | for s_type, dataset in zip(['normal', 'noisy', 'adversarial'], 51 | [X_test, X_test_noisy, X_test_adv]): 52 | _, acc = model.evaluate(dataset, Y_test, batch_size=args.batch_size, 53 | verbose=0) 54 | print("Model accuracy on the %s test set: %0.2f%%" % 55 | (s_type, 100 * acc)) 56 | # Compute and display average perturbation sizes 57 | if not s_type == 'normal': 58 | l2_diff = np.linalg.norm( 59 | dataset.reshape((len(X_test), -1)) - 60 | X_test.reshape((len(X_test), -1)), 61 | axis=1 62 | ).mean() 63 | print("Average L-2 perturbation size of the %s test set: %0.2f" % 64 | (s_type, l2_diff)) 65 | # Refine the normal, noisy and adversarial sets to only include samples for 66 | # which the original version was correctly classified by the model 67 | preds_test = model.predict_classes(X_test, verbose=0, 68 | batch_size=args.batch_size) 69 | inds_correct = np.where(preds_test == Y_test.argmax(axis=1))[0] 70 | X_test = X_test[inds_correct] 71 | X_test_noisy = X_test_noisy[inds_correct] 72 | X_test_adv = X_test_adv[inds_correct] 73 | 74 | ## Get Bayesian uncertainty scores 75 | print('Getting Monte Carlo dropout variance predictions...') 76 | uncerts_normal = get_mc_predictions(model, X_test, 77 | batch_size=args.batch_size) \ 78 | .var(axis=0).mean(axis=1) 79 | uncerts_noisy = get_mc_predictions(model, X_test_noisy, 80 | batch_size=args.batch_size) \ 81 | .var(axis=0).mean(axis=1) 82 | uncerts_adv = get_mc_predictions(model, X_test_adv, 83 | batch_size=args.batch_size) \ 84 | .var(axis=0).mean(axis=1) 85 | 86 | ## Get KDE scores 87 | # Get deep feature representations 88 | print('Getting deep feature representations...') 89 | X_train_features = get_deep_representations(model, X_train, 90 | batch_size=args.batch_size) 91 | X_test_normal_features = get_deep_representations(model, X_test, 92 | batch_size=args.batch_size) 93 | X_test_noisy_features = get_deep_representations(model, X_test_noisy, 94 | batch_size=args.batch_size) 95 | X_test_adv_features = get_deep_representations(model, X_test_adv, 96 | batch_size=args.batch_size) 97 | # Train one KDE per class 98 | print('Training KDEs...') 99 | class_inds = {} 100 | for i in range(Y_train.shape[1]): 101 | class_inds[i] = np.where(Y_train.argmax(axis=1) == i)[0] 102 | kdes = {} 103 | warnings.warn("Using pre-set kernel bandwidths that were determined " 104 | "optimal for the specific CNN models of the paper. 
If you've " 105 | "changed your model, you'll need to re-optimize the " 106 | "bandwidth.") 107 | for i in range(Y_train.shape[1]): 108 | kdes[i] = KernelDensity(kernel='gaussian', 109 | bandwidth=BANDWIDTHS[args.dataset]) \ 110 | .fit(X_train_features[class_inds[i]]) 111 | # Get model predictions 112 | print('Computing model predictions...') 113 | preds_test_normal = model.predict_classes(X_test, verbose=0, 114 | batch_size=args.batch_size) 115 | preds_test_noisy = model.predict_classes(X_test_noisy, verbose=0, 116 | batch_size=args.batch_size) 117 | preds_test_adv = model.predict_classes(X_test_adv, verbose=0, 118 | batch_size=args.batch_size) 119 | # Get density estimates 120 | print('computing densities...') 121 | densities_normal = score_samples( 122 | kdes, 123 | X_test_normal_features, 124 | preds_test_normal 125 | ) 126 | densities_noisy = score_samples( 127 | kdes, 128 | X_test_noisy_features, 129 | preds_test_noisy 130 | ) 131 | densities_adv = score_samples( 132 | kdes, 133 | X_test_adv_features, 134 | preds_test_adv 135 | ) 136 | 137 | ## Z-score the uncertainty and density values 138 | uncerts_normal_z, uncerts_adv_z, uncerts_noisy_z = normalize( 139 | uncerts_normal, 140 | uncerts_adv, 141 | uncerts_noisy 142 | ) 143 | densities_normal_z, densities_adv_z, densities_noisy_z = normalize( 144 | densities_normal, 145 | densities_adv, 146 | densities_noisy 147 | ) 148 | 149 | ## Build detector 150 | values, labels, lr = train_lr( 151 | densities_pos=densities_adv_z, 152 | densities_neg=np.concatenate((densities_normal_z, densities_noisy_z)), 153 | uncerts_pos=uncerts_adv_z, 154 | uncerts_neg=np.concatenate((uncerts_normal_z, uncerts_noisy_z)) 155 | ) 156 | 157 | ## Evaluate detector 158 | # Compute logistic regression model predictions 159 | probs = lr.predict_proba(values)[:, 1] 160 | # Compute AUC 161 | n_samples = len(X_test) 162 | # The first 2/3 of 'probs' is the negative class (normal and noisy samples), 163 | # and the last 1/3 is the positive class (adversarial samples). 
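    # (Note: these probabilities are computed on the same samples the logistic
    # regression was fit on, so the AUC reported below is an in-sample figure;
    # LogisticRegressionCV's internal CV only tunes the regularization strength.)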
164 | _, _, auc_score = compute_roc( 165 | probs_neg=probs[:2 * n_samples], 166 | probs_pos=probs[2 * n_samples:] 167 | ) 168 | print('Detector ROC-AUC score: %0.4f' % auc_score) 169 | 170 | 171 | if __name__ == "__main__": 172 | parser = argparse.ArgumentParser() 173 | parser.add_argument( 174 | '-d', '--dataset', 175 | help="Dataset to use; either 'mnist', 'cifar' or 'svhn'", 176 | required=True, type=str 177 | ) 178 | parser.add_argument( 179 | '-a', '--attack', 180 | help="Attack to use; either 'fgsm', 'bim-a', 'bim-b', 'jsma' 'cw' " 181 | "or 'all'", 182 | required=True, type=str 183 | ) 184 | parser.add_argument( 185 | '-b', '--batch_size', 186 | help="The batch size to use for training.", 187 | required=False, type=int 188 | ) 189 | parser.set_defaults(batch_size=256) 190 | args = parser.parse_args() 191 | main(args) 192 | -------------------------------------------------------------------------------- /detect/util.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, absolute_import, print_function 2 | 3 | import os 4 | import multiprocessing as mp 5 | from subprocess import call 6 | import warnings 7 | import numpy as np 8 | import scipy.io as sio 9 | from tqdm import tqdm 10 | import matplotlib.pyplot as plt 11 | from sklearn.metrics import roc_curve, auc 12 | from sklearn.linear_model import LogisticRegressionCV 13 | from sklearn.preprocessing import scale 14 | import keras.backend as K 15 | from keras.datasets import mnist, cifar10 16 | from keras.utils import np_utils 17 | from keras.models import Sequential 18 | from keras.layers import Dense, Dropout, Activation, Flatten 19 | from keras.layers import Conv2D, MaxPooling2D 20 | from keras.regularizers import l2 21 | 22 | # Gaussian noise scale sizes that were determined so that the average 23 | # L-2 perturbation size is equal to that of the adversarial samples 24 | STDEVS = { 25 | 'mnist': {'fgsm': 0.310, 'bim-a': 0.128, 'bim-b': 0.265}, 26 | 'cifar': {'fgsm': 0.050, 'bim-a': 0.009, 'bim-b': 0.039}, 27 | 'svhn': {'fgsm': 0.132, 'bim-a': 0.015, 'bim-b': 0.122} 28 | } 29 | # Set random seed 30 | np.random.seed(0) 31 | 32 | 33 | def get_data(dataset='mnist'): 34 | """ 35 | TODO 36 | :param dataset: 37 | :return: 38 | """ 39 | assert dataset in ['mnist', 'cifar', 'svhn'], \ 40 | "dataset parameter must be either 'mnist' 'cifar' or 'svhn'" 41 | if dataset == 'mnist': 42 | # the data, shuffled and split between train and test sets 43 | (X_train, y_train), (X_test, y_test) = mnist.load_data() 44 | # reshape to (n_samples, 28, 28, 1) 45 | X_train = X_train.reshape(-1, 28, 28, 1) 46 | X_test = X_test.reshape(-1, 28, 28, 1) 47 | elif dataset == 'cifar': 48 | # the data, shuffled and split between train and test sets 49 | (X_train, y_train), (X_test, y_test) = cifar10.load_data() 50 | else: 51 | if not os.path.isfile("../data/svhn_train.mat"): 52 | print('Downloading SVHN train set...') 53 | call( 54 | "curl -o ../data/svhn_train.mat " 55 | "http://ufldl.stanford.edu/housenumbers/train_32x32.mat", 56 | shell=True 57 | ) 58 | if not os.path.isfile("../data/svhn_test.mat"): 59 | print('Downloading SVHN test set...') 60 | call( 61 | "curl -o ../data/svhn_test.mat " 62 | "http://ufldl.stanford.edu/housenumbers/test_32x32.mat", 63 | shell=True 64 | ) 65 | train = sio.loadmat('../data/svhn_train.mat') 66 | test = sio.loadmat('../data/svhn_test.mat') 67 | X_train = np.transpose(train['X'], axes=[3, 0, 1, 2]) 68 | X_test = np.transpose(test['X'], axes=[3, 0, 1, 2]) 69 | # 
reshape (n_samples, 1) to (n_samples,) and change 1-index 70 | # to 0-index 71 | y_train = np.reshape(train['y'], (-1,)) - 1 72 | y_test = np.reshape(test['y'], (-1,)) - 1 73 | 74 | # cast pixels to floats, normalize to [0, 1] range 75 | X_train = X_train.astype('float32') 76 | X_test = X_test.astype('float32') 77 | X_train /= 255 78 | X_test /= 255 79 | 80 | # one-hot-encode the labels 81 | Y_train = np_utils.to_categorical(y_train, 10) 82 | Y_test = np_utils.to_categorical(y_test, 10) 83 | 84 | print(X_train.shape) 85 | print(Y_train.shape) 86 | print(X_test.shape) 87 | print(Y_test.shape) 88 | 89 | return X_train, Y_train, X_test, Y_test 90 | 91 | 92 | def get_model(dataset='mnist'): 93 | """ 94 | Takes in a parameter indicating which model type to use ('mnist', 95 | 'cifar' or 'svhn') and returns the appropriate Keras model. 96 | :param dataset: A string indicating which dataset we are building 97 | a model for. 98 | :return: The model; a Keras 'Sequential' instance. 99 | """ 100 | assert dataset in ['mnist', 'cifar', 'svhn'], \ 101 | "dataset parameter must be either 'mnist' 'cifar' or 'svhn'" 102 | if dataset == 'mnist': 103 | # MNIST model 104 | layers = [ 105 | Conv2D(64, (3, 3), padding='valid', input_shape=(28, 28, 1)), 106 | Activation('relu'), 107 | Conv2D(64, (3, 3)), 108 | Activation('relu'), 109 | MaxPooling2D(pool_size=(2, 2)), 110 | Dropout(0.5), 111 | Flatten(), 112 | Dense(128), 113 | Activation('relu'), 114 | Dropout(0.5), 115 | Dense(10), 116 | Activation('softmax') 117 | ] 118 | elif dataset == 'cifar': 119 | # CIFAR-10 model 120 | layers = [ 121 | Conv2D(32, (3, 3), padding='same', input_shape=(32, 32, 3)), 122 | Activation('relu'), 123 | Conv2D(32, (3, 3), padding='same'), 124 | Activation('relu'), 125 | MaxPooling2D(pool_size=(2, 2)), 126 | Conv2D(64, (3, 3), padding='same'), 127 | Activation('relu'), 128 | Conv2D(64, (3, 3), padding='same'), 129 | Activation('relu'), 130 | MaxPooling2D(pool_size=(2, 2)), 131 | Conv2D(128, (3, 3), padding='same'), 132 | Activation('relu'), 133 | Conv2D(128, (3, 3), padding='same'), 134 | Activation('relu'), 135 | MaxPooling2D(pool_size=(2, 2)), 136 | Flatten(), 137 | Dropout(0.5), 138 | Dense(1024, kernel_regularizer=l2(0.01), bias_regularizer=l2(0.01)), 139 | Activation('relu'), 140 | Dropout(0.5), 141 | Dense(512, kernel_regularizer=l2(0.01), bias_regularizer=l2(0.01)), 142 | Activation('relu'), 143 | Dropout(0.5), 144 | Dense(10), 145 | Activation('softmax') 146 | ] 147 | else: 148 | # SVHN model 149 | layers = [ 150 | Conv2D(64, (3, 3), padding='valid', input_shape=(32, 32, 3)), 151 | Activation('relu'), 152 | Conv2D(64, (3, 3)), 153 | Activation('relu'), 154 | MaxPooling2D(pool_size=(2, 2)), 155 | Dropout(0.5), 156 | Flatten(), 157 | Dense(512), 158 | Activation('relu'), 159 | Dropout(0.5), 160 | Dense(128), 161 | Activation('relu'), 162 | Dropout(0.5), 163 | Dense(10), 164 | Activation('softmax') 165 | ] 166 | 167 | model = Sequential() 168 | for layer in layers: 169 | model.add(layer) 170 | 171 | return model 172 | 173 | 174 | def flip(x, nb_diff): 175 | """ 176 | Helper function for get_noisy_samples 177 | :param x: 178 | :param nb_diff: 179 | :return: 180 | """ 181 | original_shape = x.shape 182 | x = np.copy(np.reshape(x, (-1,))) 183 | candidate_inds = np.where(x < 0.99)[0] 184 | assert candidate_inds.shape[0] >= nb_diff 185 | inds = np.random.choice(candidate_inds, nb_diff) 186 | x[inds] = 1. 
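    # Note: np.random.choice samples with replacement by default, so some of the
    # nb_diff draws can repeat and slightly fewer than nb_diff distinct pixels may
    # be flipped; pass replace=False to flip exactly nb_diff pixels.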
187 | 188 | return np.reshape(x, original_shape) 189 | 190 | 191 | def get_noisy_samples(X_test, X_test_adv, dataset, attack): 192 | """ 193 | TODO 194 | :param X_test: 195 | :param X_test_adv: 196 | :param dataset: 197 | :param attack: 198 | :return: 199 | """ 200 | if attack in ['jsma', 'cw']: 201 | X_test_noisy = np.zeros_like(X_test) 202 | for i in range(len(X_test)): 203 | # Count the number of pixels that are different 204 | nb_diff = len(np.where(X_test[i] != X_test_adv[i])[0]) 205 | # Randomly flip an equal number of pixels (flip means move to max 206 | # value of 1) 207 | X_test_noisy[i] = flip(X_test[i], nb_diff) 208 | else: 209 | warnings.warn("Using pre-set Gaussian scale sizes to craft noisy " 210 | "samples. If you've altered the eps/eps-iter parameters " 211 | "of the attacks used, you'll need to update these. In " 212 | "the future, scale sizes will be inferred automatically " 213 | "from the adversarial samples.") 214 | # Add Gaussian noise to the samples 215 | X_test_noisy = np.minimum( 216 | np.maximum( 217 | X_test + np.random.normal(loc=0, scale=STDEVS[dataset][attack], 218 | size=X_test.shape), 219 | 0 220 | ), 221 | 1 222 | ) 223 | 224 | return X_test_noisy 225 | 226 | 227 | def get_mc_predictions(model, X, nb_iter=50, batch_size=256): 228 | """ 229 | TODO 230 | :param model: 231 | :param X: 232 | :param nb_iter: 233 | :param batch_size: 234 | :return: 235 | """ 236 | output_dim = model.layers[-1].output.shape[-1].value 237 | get_output = K.function( 238 | [model.layers[0].input, K.learning_phase()], 239 | [model.layers[-1].output] 240 | ) 241 | 242 | def predict(): 243 | n_batches = int(np.ceil(X.shape[0] / float(batch_size))) 244 | output = np.zeros(shape=(len(X), output_dim)) 245 | for i in range(n_batches): 246 | output[i * batch_size:(i + 1) * batch_size] = \ 247 | get_output([X[i * batch_size:(i + 1) * batch_size], 1])[0] 248 | return output 249 | 250 | preds_mc = [] 251 | for i in tqdm(range(nb_iter)): 252 | preds_mc.append(predict()) 253 | 254 | return np.asarray(preds_mc) 255 | 256 | 257 | def get_deep_representations(model, X, batch_size=256): 258 | """ 259 | TODO 260 | :param model: 261 | :param X: 262 | :param batch_size: 263 | :return: 264 | """ 265 | # last hidden layer is always at index -4 266 | output_dim = model.layers[-4].output.shape[-1].value 267 | get_encoding = K.function( 268 | [model.layers[0].input, K.learning_phase()], 269 | [model.layers[-4].output] 270 | ) 271 | 272 | n_batches = int(np.ceil(X.shape[0] / float(batch_size))) 273 | output = np.zeros(shape=(len(X), output_dim)) 274 | for i in range(n_batches): 275 | output[i * batch_size:(i + 1) * batch_size] = \ 276 | get_encoding([X[i * batch_size:(i + 1) * batch_size], 0])[0] 277 | 278 | return output 279 | 280 | 281 | def score_point(tup): 282 | """ 283 | TODO 284 | :param tup: 285 | :return: 286 | """ 287 | x, kde = tup 288 | 289 | return kde.score_samples(np.reshape(x, (1, -1)))[0] 290 | 291 | 292 | def score_samples(kdes, samples, preds, n_jobs=None): 293 | """ 294 | TODO 295 | :param kdes: 296 | :param samples: 297 | :param preds: 298 | :param n_jobs: 299 | :return: 300 | """ 301 | if n_jobs is not None: 302 | p = mp.Pool(n_jobs) 303 | else: 304 | p = mp.Pool() 305 | results = np.asarray( 306 | p.map( 307 | score_point, 308 | [(x, kdes[i]) for x, i in zip(samples, preds)] 309 | ) 310 | ) 311 | p.close() 312 | p.join() 313 | 314 | return results 315 | 316 | 317 | def normalize(normal, adv, noisy): 318 | """ 319 | TODO 320 | :param normal: 321 | :param adv: 322 | :param noisy: 323 | 
:return: 324 | """ 325 | n_samples = len(normal) 326 | total = scale(np.concatenate((normal, adv, noisy))) 327 | 328 | return total[:n_samples], total[n_samples:2*n_samples], total[2*n_samples:] 329 | 330 | 331 | def train_lr(densities_pos, densities_neg, uncerts_pos, uncerts_neg): 332 | """ 333 | TODO 334 | :param densities_pos: 335 | :param densities_neg: 336 | :param uncerts_pos: 337 | :param uncerts_neg: 338 | :return: 339 | """ 340 | values_neg = np.concatenate( 341 | (densities_neg.reshape((1, -1)), 342 | uncerts_neg.reshape((1, -1))), 343 | axis=0).transpose([1, 0]) 344 | values_pos = np.concatenate( 345 | (densities_pos.reshape((1, -1)), 346 | uncerts_pos.reshape((1, -1))), 347 | axis=0).transpose([1, 0]) 348 | 349 | values = np.concatenate((values_neg, values_pos)) 350 | labels = np.concatenate( 351 | (np.zeros_like(densities_neg), np.ones_like(densities_pos))) 352 | 353 | lr = LogisticRegressionCV(n_jobs=-1).fit(values, labels) 354 | 355 | return values, labels, lr 356 | 357 | 358 | def compute_roc(probs_neg, probs_pos, plot=False): 359 | """ 360 | TODO 361 | :param probs_neg: 362 | :param probs_pos: 363 | :param plot: 364 | :return: 365 | """ 366 | probs = np.concatenate((probs_neg, probs_pos)) 367 | labels = np.concatenate((np.zeros_like(probs_neg), np.ones_like(probs_pos))) 368 | fpr, tpr, _ = roc_curve(labels, probs) 369 | auc_score = auc(fpr, tpr) 370 | if plot: 371 | plt.figure(figsize=(7, 6)) 372 | plt.plot(fpr, tpr, color='blue', 373 | label='ROC (AUC = %0.4f)' % auc_score) 374 | plt.legend(loc='lower right') 375 | plt.title("ROC Curve") 376 | plt.xlabel("FPR") 377 | plt.ylabel("TPR") 378 | plt.show() 379 | 380 | return fpr, tpr, auc_score 381 | --------------------------------------------------------------------------------
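The warning inside `get_noisy_samples` above notes that the Gaussian scale sizes in `STDEVS` should eventually be inferred automatically from the adversarial samples rather than hard-coded. A rough sketch of how that inference could look, relying on the fact that i.i.d. Gaussian noise with standard deviation sigma in d dimensions has an expected L2 norm of roughly sigma * sqrt(d) (this helper is illustrative only and is not part of the repository):

    import numpy as np

    def infer_noise_stdev(X_test, X_test_adv):
        """Choose a Gaussian stdev whose expected L2 norm matches the average
        adversarial perturbation size (hypothetical helper, not in the repo)."""
        diffs = (X_test_adv - X_test).reshape(len(X_test), -1)
        avg_l2 = np.linalg.norm(diffs, axis=1).mean()  # mean L2 perturbation
        d = diffs.shape[1]                             # pixels per sample
        return avg_l2 / np.sqrt(d)                     # E||N(0, s^2 I_d)||_2 ~ s*sqrt(d)

Because `get_noisy_samples` clips the noisy images back to [0, 1], the realized perturbation of the noisy set will be slightly smaller than this estimate, so the match is approximate.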