├── LICENSE ├── act_values.py ├── delta_defense.py ├── README.md ├── .gitignore ├── multiclass_deltas.py ├── utils.py ├── sensitivity_training.py └── optimal_impostor.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Anshuman Suri 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /act_values.py: -------------------------------------------------------------------------------- 1 | import torch as ch 2 | import utils 3 | import numpy as np 4 | from tqdm import tqdm 5 | 6 | 7 | if __name__ == "__main__": 8 | import sys 9 | 10 | model_arch = sys.argv[1] 11 | model_type = sys.argv[2] 12 | prefix = sys.argv[3] 13 | dataset = sys.argv[4] 14 | 15 | if dataset == 'cifar10': 16 | dx = utils.CIFAR10() 17 | elif dataset == 'imagenet': 18 | dx = utils.ImageNet1000() 19 | else: 20 | raise ValueError("Dataset not supported") 21 | 22 | ds = dx.get_dataset() 23 | model = dx.get_model(model_type, model_arch) 24 | 25 | batch_size = 128 26 | all_reps = [] 27 | train_loader = None 28 | if dataset == 'cifar10': 29 | train_loader, val_loader = ds.make_loaders(batch_size=batch_size, workers=8) 30 | else: 31 | _, val_loader = ds.make_loaders(batch_size=batch_size, workers=8, only_val=True) 32 | 33 | def get_reps(data_loader): 34 | for (im, label) in tqdm(data_loader): 35 | with ch.no_grad(): 36 | (_, rep), _ = model(im.cuda(), with_latent=True) 37 | all_reps.append(rep.cpu()) 38 | 39 | if train_loader: 40 | get_reps(train_loader) 41 | get_reps(val_loader) 42 | 43 | all_reps = ch.cat(all_reps) 44 | ch_mean = ch.mean(all_reps, dim=0) 45 | ch_std = ch.std(all_reps, dim=0) 46 | 47 | # Dump mean, std vectors for later use: 48 | np_mean = ch_mean.cpu().numpy() 49 | np_std = ch_std.cpu().numpy() 50 | np.save(prefix + "feature_mean", np_mean) 51 | np.save(prefix + "feature_std", np_std) 52 | -------------------------------------------------------------------------------- /delta_defense.py: -------------------------------------------------------------------------------- 1 | import torch as ch 2 | import numpy as np 3 | from robustness.model_utils import make_and_restore_model 4 | from robustness.tools.helpers import save_checkpoint 5 | import sys 6 | 7 | import utils 8 | 9 | 10 | def chuck_inf_means(senses): 11 | chucked = [] 12 | for i in range(senses.shape[0]): 13 | x = senses[i] 14 | chucked.append(np.mean(x[x != 
np.inf]))
15 |     return np.array(chucked)
16 | 
17 | 
18 | if __name__ == "__main__":
19 | 
20 |     model_arch = sys.argv[1]
21 |     model_type = sys.argv[2]
22 |     drop_mode = sys.argv[3]
23 |     num_drop = int(sys.argv[4])
24 |     model_path = sys.argv[5]
25 | 
26 |     if drop_mode not in ['random', 'most', 'least']:
27 |         raise ValueError("Method of selecting neurons to drop not supported")
28 | 
29 |     constants = utils.CIFAR10()
30 |     ds = constants.get_dataset()
31 |     model_kwargs = {
32 |         'arch': model_arch,
33 |         'dataset': ds,
34 |         'resume_path': model_type
35 |     }
36 | 
37 |     # Get delta values and feature statistics
38 |     senses = constants.get_deltas(model_type, model_arch)
39 |     (mean, std) = constants.get_stats(model_type, model_arch)
40 | 
41 |     # Load model
42 |     model, _ = make_and_restore_model(**model_kwargs)
43 |     model.eval()
44 | 
45 |     print("Dropping %d out of %d neurons" % (num_drop, senses.shape[0]))
46 | 
47 |     # Random neuron drop-out
48 |     if drop_mode == 'random':
49 |         print("Random drop-out!")
50 |         worst_n = np.random.permutation(senses.shape[0])[:num_drop]
51 |     else:
52 |         # 99.7% interval
53 |         threshold = mean + 3 * std
54 | 
55 |         # Only consider neurons with any hope of attacking (delta within some sensible range)
56 |         senses = utils.scaled_values(senses, mean, std)
57 |         senses = chuck_inf_means(senses)
58 | 
59 |         if drop_mode == 'most':
60 |             worst_n = np.argsort(np.abs(senses))[:num_drop]
61 |         else:
62 |             worst_n = np.argsort(-np.abs(senses))[:num_drop]
63 | 
64 |     # Zero out the selected neurons' columns in the final weights matrix
65 |     with ch.no_grad():
66 |         model.state_dict().get("module.model.classifier.weight")[:, worst_n] = 0
67 | 
68 |     # Save modified model
69 |     sd_info = {
70 |         'model': model.state_dict(),
71 |         'epoch': 1
72 |     }
73 |     save_checkpoint(sd_info, False, model_path)
74 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # One Neuron to Fool Them All
2 | 
3 | ## Prerequisites
4 | 
5 | - Install [this](https://github.com/iamgroot42/robustness) fork of the robustness package: `pip install -e robustness`
6 | - If you are going to run experiments for ImageNet, modify `IMAGENET_PATH` in `utils.py` accordingly
7 | - Download pretrained models and pre-computed statistics:
8 | `wget https://www.dropbox.com/s/rsxzw30fdmle2qu/data.tar.gz?dl=1 -O data.tar.gz`
9 | - Extract the files:
10 | `tar -xf data.tar.gz`
11 | 
12 | ## Pre-Computing Statistics (skip if you downloaded the file above)
13 | 
14 | ### Generating feature-wise statistics ($\mu$, $\sigma$)
15 | - Given any model and dataset, calculate the feature-wise mean and standard deviation of activations across the dataset. Statistics are computed over both training and validation data for CIFAR10, and over validation data only for ImageNet
16 | - `python act_values.py <model_arch> <model_type> <prefix_for_saving> <dataset>`
17 | 
18 | ### Generating $\Delta(i,x)$ values
19 | - Given a model (assumes non-negative features, which holds for all architectures in this codebase via ReLU) and a dataset, computes $\Delta(i,x)$ $\forall i,x$ and saves the values for later use (generating attack seeds)
20 | - `python multiclass_deltas.py `
21 | 
22 | ## Neuron-sensitivity Attack
23 | 
24 | ### Generating adversarial examples using sensitive neurons
25 | - Given a model and its corresponding feature statistics and $\Delta(i,x)$ values (computed above), find adversarial seeds within specific perturbation budgets
26 | - `python optimal_impostor.py`
27 | 
28 | ## Training for Sensitivity
29 | 
30 | ### Training using the proposed regularization term
31 | - Much faster than adversarial training
32 | - Logs $L_2$ PGD attack success rates on the validation set while training (to monitor robustness)
33 | - `python sensitivity_training.py --output_dir <path_to_save_model>`
34 | 
35 | ### Pruning neurons from a trained model, based on sensitivity
36 | - Given a trained model and dataset, use $\Delta(i,x)$ values to identify and prune weights that correspond to specific features
37 | - Can prune `random` (randomly sampled), `least` (zero out least sensitive first), or `most` (zero out most sensitive first) neurons
38 | - Prune `N` neurons (from the features layer)
39 | `python delta_defense.py <model_arch> <model_type> <random/most/least> <N> <path_to_save_model>`
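
For reference, $\Delta(i,x)$ is the smallest amount by which neuron $i$'s activation must increase to flip the model's prediction on input $x$. With a single linear classification layer on top of non-negative (post-ReLU) features, this can be computed in closed form from the layer's weights and the clean logits. The sketch below only illustrates that computation; the helper name and the VGG-style weight key are assumptions (see `utils.get_logits_layer_name`), and it is not the exact `multiclass_deltas.py` implementation:

```python
import torch as ch

def per_neuron_deltas(model, im, label, weight_key="module.model.classifier.weight"):
    # Illustrative sketch, not the repository's multiclass_deltas.py.
    # Delta(i, x): smallest increase to feature i that flips the prediction away
    # from `label` (the currently-predicted class); inf if no increase can flip it.
    with ch.no_grad():
        logits, _ = model(im.unsqueeze(0).cuda())
    logits = logits[0]
    w = model.state_dict()[weight_key]          # shape: (n_classes, n_features)
    n_classes, n_features = w.shape
    deltas = ch.full((n_features,), float('inf'))
    for i in range(n_features):
        for c in range(n_classes):
            if c == label:
                continue
            # Adding d to feature i changes (logit_c - logit_label) by (w[c, i] - w[label, i]) * d
            slope = (w[c, i] - w[label, i]).item()
            if slope > 0:
                gap = (logits[label] - logits[c]).item()
                deltas[i] = min(deltas[i].item(), gap / slope)
    return deltas
```

Per the repository's pipeline, these per-image values are written out as one comma-separated row per feature (see the `multiclass_deltas.py` fragment below) and later standardized with the saved $\mu$, $\sigma$ statistics.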
--------------------------------------------------------------------------------
/multiclass_deltas.py:
--------------------------------------------------------------------------------
67 |             best_delta = ch.argmin(ch.abs(valid_sensitivity))
68 |             best_sensitivity = valid_sensitivity[best_delta]
69 |             best_sensitivity = best_sensitivity.cpu().numpy()
70 |             sensitivities[i] = sensitivities.get(i, []) + [best_sensitivity]
71 | 
72 |     with open("%s.txt" % filename, 'w') as f:
73 |         for i in range(n_features):
74 |             floats_to_string = ",".join([str(x) for x in sensitivities[i]])
75 |             f.write(floats_to_string + "\n")
76 | 
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | import torch as ch
2 | import numpy as np
3 | from torchvision import transforms
4 | from robustness.model_utils import make_and_restore_model
5 | from robustness.datasets import GenericBinary, CIFAR, ImageNet
6 | from robustness.tools import folder
7 | from tqdm import tqdm
8 | import sys
9 | import os
10 | 
11 | 
12 | IMAGENET_PATH = ""
13 | 
14 | 
15 | class DataPaths:
16 |     def __init__(self, name, data_path, stats_path):
17 |         self.name = name
18 |         self.data_path = data_path
19 |         self.dataset = self.dataset_type(data_path)
20 |         self.models = {'nat': None, 'l1': None, 'l2': None, 'linf': None}
21 |         self.model_prefix = {}
22 |         self.stats_path = stats_path
23 | 
24 |     def get_dataset(self):
25 |         return self.dataset
26 | 
27 |     def get_model(self, m_type, arch='resnet50'):
28 |         model_path = self.models.get(m_type, None)
29 |         if not model_path:
30 |             model_path = m_type
31 |         else:
32 |             model_path = os.path.join(self.model_prefix[arch], self.models[m_type])
33 |         model_kwargs = {
34 |             'arch': arch,
35 |             'dataset': self.dataset,
36 |             'resume_path': model_path
37 |         }
38 |         model, _ = make_and_restore_model(**model_kwargs)
39 |         model.eval()
40 |         return model
41 | 
42 |     def get_stats(self, m_type, arch='resnet50'):
43 |         stats_path = os.path.join(self.stats_path, arch, m_type, "stats")
44 |         return get_stats(stats_path)
45 | 
46 |     def get_deltas(self, m_type, arch='resnet50'):
47 |         deltas_path = os.path.join(self.stats_path, arch, m_type, "deltas.txt")
48 |         return get_sensitivities(deltas_path)
49 | 
50 | 51 | class CIFAR10(DataPaths): 52 | def __init__(self): 53 | self.dataset_type = CIFAR 54 | super(CIFAR10, self).__init__('cifar10', "/tmp/cifar10", "./cifar10/stats") 55 | self.model_prefix['resnet50'] = "cifar10/models/resnet50/" 56 | self.model_prefix['densenet169'] = "cifar10/models/densenet169/" 57 | self.model_prefix['vgg19'] = "cifar10/models/vgg19/" 58 | self.models['nat'] = "cifar_nat.pt" 59 | self.models['sense'] = "cifar_sense.pt" 60 | self.models['linf'] = "cifar_linf_8.pt" 61 | self.models['l2'] = "cifar_l2_0_5.pt" 62 | 63 | 64 | class ImageNet1000(DataPaths): 65 | def __init__(self): 66 | self.dataset_type = ImageNet 67 | super(ImageNet1000, self).__init__('imagenet1000', 68 | IMAGENET_PATH, "imagenet/stats/") 69 | self.model_prefix['resnet50'] = "imagenet/models/resnet50/" 70 | self.models['nat'] = "imagenet_nat.pt" 71 | self.models['l2'] = "imagenet_l2_3_0.pt" 72 | self.models['linf'] = "imagenet_linf_4.pt" 73 | 74 | 75 | def scaled_values(val, mean, std, eps=1e-10): 76 | return (val - np.repeat(np.expand_dims(mean, 1), val.shape[1], axis=1)) / (np.expand_dims(std, 1) + eps) 77 | 78 | 79 | def load_all_data(ds): 80 | batch_size = 512 81 | _, test_loader = ds.make_loaders(batch_size=batch_size, workers=8, only_val=True, fixed_test_order=True) 82 | 83 | images, labels = [], [] 84 | for (image, label) in test_loader: 85 | images.append(image) 86 | labels.append(label) 87 | labels = ch.cat(labels).cpu() 88 | images = ch.cat(images).cpu() 89 | return (images, labels) 90 | 91 | 92 | def get_sensitivities(path): 93 | features = [] 94 | with open(path, 'r') as f: 95 | for line in tqdm(f): 96 | values = np.array([float(x) for x in line.rstrip('\n').split(',')]) 97 | features.append(values) 98 | return np.array(features) 99 | 100 | 101 | def best_target_image(mat, which=0): 102 | sum_m = [] 103 | for i in range(mat.shape[1]): 104 | mat_interest = mat[mat[:, i] != np.inf, i] 105 | sum_m.append(np.average(np.abs(mat_interest))) 106 | best = np.argsort(sum_m) 107 | return best[which] 108 | 109 | 110 | def get_statistics(diff): 111 | l1_norms = ch.sum(ch.abs(diff), dim=1) 112 | l2_norms = ch.norm(diff, dim=1) 113 | linf_norms = ch.max(ch.abs(diff), dim=1)[0] 114 | return (l1_norms, l2_norms, linf_norms) 115 | 116 | 117 | def get_stats(base_path): 118 | mean = np.load(os.path.join(base_path, "feature_mean.npy")) 119 | std = np.load(os.path.join(base_path, "feature_std.npy")) 120 | return mean, std 121 | 122 | 123 | def get_logits_layer_name(arch): 124 | if "vgg" in arch: 125 | return "module.model.classifier.weight" 126 | elif "resnet" in arch: 127 | return "module.model.fc.weight" 128 | elif "densenet" in arch: 129 | return "module.model.linear.weight" 130 | return None 131 | -------------------------------------------------------------------------------- /sensitivity_training.py: -------------------------------------------------------------------------------- 1 | import torch as ch 2 | import numpy as np 3 | from robustness.train import train_model 4 | from robustness.tools import helpers 5 | from robustness import defaults 6 | from robustness.defaults import check_and_fill_args 7 | from robustness.model_utils import make_and_restore_model 8 | from robustness.datasets import DATASETS 9 | import os 10 | from itertools import combinations 11 | import cox 12 | import utils 13 | import argparse 14 | 15 | 16 | def regularization_term(model, inp, targets, top_k, delta_1, delta_2, train_criterion, adv, attack_kwargs): 17 | (logits, features), final_inp = model(inp, target=targets, 
make_adv=adv, with_latent=True, **attack_kwargs) 18 | w = model.module.model.classifier.weight 19 | 20 | # Calculate normal classification loss 21 | loss = train_criterion(logits, targets) 22 | 23 | # First term : minimize weight values for same feature across any two different classes (nC2) 24 | diffs = [] 25 | for c in combinations(range(logits.shape[1]), 2): 26 | # Across all possible (i, j) class pairs 27 | diff = w[c, :] 28 | # Note differences in weight values for same feature, different classes 29 | topk_diff, _ = ch.topk(ch.abs(diff[0] - diff[1]), top_k) 30 | diffs.append(ch.mean(topk_diff)) 31 | first_term = ch.max(ch.stack(diffs, dim=0)) 32 | 33 | diffs_2 = [] 34 | features_norm = ch.sum(features, dim=1).unsqueeze(1) 35 | diff_2_1 = ch.stack([w[y, :] for y in targets], dim=0) 36 | # Iterate over classes 37 | for i in range(logits.shape[1]): 38 | diff_2_2 = w[i, :].unsqueeze(0) 39 | normalized_drop_term = ch.abs(features * (diff_2_1 - diff_2_2) / features_norm) 40 | use_these, _ = ch.topk(normalized_drop_term, top_k, dim=1) 41 | use_these = ch.mean(use_these, dim=1) 42 | diffs_2.append(use_these) 43 | second_term = ch.mean(ch.stack(diffs_2, dim=0), dim=0) 44 | second_term = ch.mean(second_term) 45 | 46 | return ((logits, features), final_inp, loss, delta_1 * first_term + delta_2 * second_term) 47 | 48 | 49 | if __name__ == "__main__": 50 | parser = argparse.ArgumentParser() 51 | parser.add_argument('--top_k', type=int, default=16, help='top-k (neurons) considered while calculating loss terms') 52 | parser.add_argument('--start_lr', type=float, default=1e-2, help='starting LR for optimizer') 53 | parser.add_argument('--delta_1', type=float, default=1e1, help='loss coefficient for first term') 54 | parser.add_argument('--delta_2', type=float, default=1e2, help='loss coefficient for second term') 55 | parser.add_argument('--batch_size', type=int, default=128, help='Batch Size') 56 | parser.add_argument('--output_dir', type=str, default='', help='path where model is to be saved') 57 | 58 | parsed_args = parser.parse_args() 59 | for arg in vars(parsed_args): 60 | print(arg, " : ", getattr(parsed_args, arg)) 61 | 62 | def regularizer(model, inp, targets, train_criterion, adv, attack_kwargs): 63 | return regularization_term(model, inp, targets, parsed_args.top_k, parsed_args.delta_1, 64 | parsed_args.delta_2, train_criterion, adv, attack_kwargs) 65 | 66 | if not os.path.exists(parsed_args.output_dir): 67 | raise ValueError("Please provide valid save dir for model") 68 | 69 | train_kwargs = { 70 | 'out_dir': parsed_args.output_dir, 71 | 'adv_train': 0, 72 | 'exp_name': 'sensitivity_training', 73 | 'dataset': 'cifar', 74 | 'arch': 'vgg19', 75 | 'adv_eval': True, 76 | 'batch_size': parsed_args.batch_size, 77 | # Validation-evaluation using PGD-L2 attack (to track L2 PGD perturbation robustness) 78 | 'attack_lr': (2.5 * 0.5) / 10, 79 | 'constraint': '2', 80 | 'eps': 0.5, 81 | 'attack_steps': 20, 82 | 'use_best': True, 83 | 'eps_fadein_epochs': 0, 84 | 'random_restarts': 0, 85 | 'lr': parsed_args.start_lr, 86 | 'use_adv_eval_criteria': 1, 87 | 'regularizer': regularizer, 88 | 'let_reg_handle_loss': True 89 | } 90 | 91 | ds_class = DATASETS[train_kwargs['dataset']] 92 | 93 | train_args = cox.utils.Parameters(train_kwargs) 94 | 95 | dx = utils.CIFAR10() 96 | dataset = dx.get_dataset() 97 | 98 | args = check_and_fill_args(train_args, defaults.TRAINING_ARGS, ds_class) 99 | args = check_and_fill_args(train_args, defaults.MODEL_LOADER_ARGS, ds_class) 100 | 101 | model, _ = 
make_and_restore_model(arch='vgg19', dataset=dataset) 102 | 103 | # Make the data loaders 104 | train_loader, val_loader = dataset.make_loaders(args.workers, args.batch_size, data_aug=bool(args.data_aug)) 105 | 106 | # Prefetches data to improve performance 107 | train_loader = helpers.DataPrefetcher(train_loader) 108 | val_loader = helpers.DataPrefetcher(val_loader) 109 | 110 | store = cox.store.Store(args.out_dir, args.exp_name) 111 | args_dict = args.as_dict() if isinstance(args, cox.utils.Parameters) else vars(args) 112 | schema = cox.store.schema_from_dict(args_dict) 113 | store.add_table('metadata', schema) 114 | store['metadata'].append_row(args_dict) 115 | 116 | model = train_model(args, model, (train_loader, val_loader), store=store) 117 | -------------------------------------------------------------------------------- /optimal_impostor.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch as ch 3 | from robustness.model_utils import make_and_restore_model 4 | from robustness.tools.vis_tools import show_image_row 5 | import numpy as np 6 | import sys 7 | from tqdm import tqdm 8 | from torch.autograd import Variable 9 | 10 | import utils 11 | 12 | 13 | def pgd_optimization(model, inp_og, target_rep, indices_mask, eps, random_restart_targets, iters=100, 14 | reg_weight=1e0, p='2', verbose=True, custom_best=False, fake_relu=True, random_restarts=0): 15 | # Modified inversion loss that puts emphasis on non-matching neurons to have similar activations 16 | def custom_inversion_loss(m, inp, targ): 17 | output, rep = m(inp, with_latent=True, fake_relu=fake_relu) 18 | # Normalized L2 error w.r.t. the target representation 19 | loss = ch.div(ch.norm(rep - targ, dim=1), ch.norm(targ, dim=1)) 20 | # Extra loss term (normalized) 21 | aux_loss = ch.sum(ch.abs((rep - targ) * indices_mask), dim=1) 22 | aux_loss = ch.div(aux_loss, ch.norm(targ * indices_mask, dim=1)) 23 | # Lagrangian formulation: 24 | return loss + reg_weight * aux_loss, output 25 | 26 | if custom_best: 27 | # If True, use the 'only neuron i' based 'best' evaluation 28 | if custom_best is True: 29 | def custom_loss_fn(loss, x): 30 | # Check how much beyond minimum delta the perturbation on i^th index is 31 | # Negative sign, since we want higher delta-diff to score better 32 | (_, rep), _ = model(x, with_latent=True, fake_relu=fake_relu) 33 | return - ch.sum((rep - target_rep) * indices_mask, dim=1) 34 | custom_best = custom_loss_fn 35 | # Else, expect custom_best function to be passed along 36 | else: 37 | # If nothing passed along, use simple comparison 38 | custom_best = None 39 | 40 | 41 | kwargs = { 42 | 'custom_loss': custom_inversion_loss, 43 | 'constraint': p, 44 | 'eps': eps, 45 | 'step_size': 2.5 * eps / iters, 46 | 'iterations': iters, 47 | 'targeted': True, 48 | 'do_tqdm': verbose, 49 | 'custom_best': custom_best, 50 | 'random_restarts': random_restarts, 51 | 'random_restart_targets': random_restart_targets 52 | } 53 | _, im_matched = model(inp_og, target_rep, make_adv=True, **kwargs) 54 | return im_matched 55 | 56 | 57 | def find_impostors(model, delta_values, ds, images, mean, std, 58 | verbose=True, n=4, eps=2.0, iters=200, 59 | norm='2', custom_best=False, fake_relu=True, 60 | analysis_start=0, random_restarts=0, delta_analysis=False): 61 | image_ = [] 62 | # Get target images 63 | for image in images: 64 | targ_img = image.unsqueeze(0) 65 | real = targ_img.repeat(n, 1, 1, 1) 66 | image_.append(real) 67 | real = ch.cat(image_, 0).cuda() 68 | 69 | # Get 
scaled senses 70 | scaled_delta_values = utils.scaled_values(delta_values, mean, std, eps=0) 71 | # Replace inf values with largest non-inf values 72 | delta_values[delta_values == np.inf] = delta_values[delta_values != np.inf].max() 73 | 74 | # Pick easiest-to-attack neurons per image 75 | easiest = np.argsort(scaled_delta_values, axis=0) 76 | 77 | # Get feature representation of current image 78 | with ch.no_grad(): 79 | (_, image_rep), _ = model(real, with_latent=True) 80 | 81 | # Construct delta vector and indices mask 82 | delta_vec = ch.zeros_like(image_rep) 83 | indices_mask = ch.zeros_like(image_rep) 84 | for j in range(len(images)): 85 | for i, x in enumerate(easiest[analysis_start : analysis_start + n, j]): 86 | delta_vec[i + j * n, x] = delta_values[x, j] 87 | indices_mask[i + j * n, x] = 1 88 | 89 | impostors = parallel_impostor(model, delta_vec, real, indices_mask, verbose, 90 | eps, iters, norm, custom_best, fake_relu, random_restarts) 91 | 92 | with ch.no_grad(): 93 | if delta_analysis: 94 | (pred, latent), _ = model(impostors, with_latent=True) 95 | else: 96 | pred, _ = model(impostors) 97 | latent = None 98 | label_pred = ch.argmax(pred, dim=1) 99 | 100 | clean_pred, _ = model(real) 101 | clean_pred = ch.argmax(clean_pred, dim=1) 102 | 103 | clean_preds = clean_pred.cpu().numpy() 104 | preds = label_pred.cpu().numpy() 105 | 106 | succeeded = [[] for _ in range(len(images))] 107 | if delta_analysis: 108 | delta_succeeded = [[] for _ in range(len(images))] 109 | for i in range(len(images)): 110 | for j in range(n): 111 | succeeded[i].append(preds[i * n + j] != clean_preds[i * n + j]) 112 | if delta_analysis: 113 | analysis_index = easiest[analysis_start : analysis_start + n, i][j] 114 | success_criterion = (latent[i * n + j] >= (image_rep[i * n + j] + delta_vec[i * n + j])) 115 | delta_succeeded[i].append(success_criterion[analysis_index].cpu().item()) 116 | succeeded = np.array(succeeded) 117 | if delta_analysis: 118 | delta_succeeded = np.array(delta_succeeded, 'float') 119 | image_labels = [clean_preds, preds] 120 | 121 | if not delta_analysis: 122 | delta_succeeded = None 123 | 124 | return (image_labels, succeeded, None, delta_succeeded) 125 | 126 | 127 | def parallel_impostor(model, delta_vec, im, indices_mask, verbose, eps, 128 | iters, norm, custom_best, fake_relu, random_restarts): 129 | # Get feature representation of current image 130 | with ch.no_grad(): 131 | (target_logits, image_rep), _ = model(im, with_latent=True, fake_relu=fake_relu) 132 | target_logits = ch.argmax(target_logits, dim=1) 133 | 134 | # Get target feature rep 135 | target_rep = image_rep + delta_vec 136 | 137 | # Override custom_best, use cross-entropy on model instead 138 | criterion = ch.nn.CrossEntropyLoss(reduction='none').cuda() 139 | def ce_loss(loss, x): 140 | output, _ = model(x, fake_relu=fake_relu) 141 | # We want CE loss b/w new and old to be as high as possible 142 | return -criterion(output, target_logits) 143 | # Use CE loss 144 | if custom_best: custom_best = ce_loss 145 | 146 | im_matched = pgd_optimization(model, im, target_rep, indices_mask, 147 | random_restart_targets=target_logits, eps=eps, iters=iters, verbose=verbose, 148 | p=norm, reg_weight=1e1, custom_best=custom_best, fake_relu=fake_relu, 149 | random_restarts=random_restarts) 150 | 151 | return im_matched 152 | 153 | 154 | if __name__ == "__main__": 155 | import argparse 156 | parser = argparse.ArgumentParser() 157 | parser.add_argument('--model_arch', type=str, default='vgg19', help='arch of model 
(resnet50/vgg19/densenet169)')
158 |     parser.add_argument('--model_type', type=str, default='nat', help='type of model (nat/l2/linf)')
159 |     parser.add_argument('--eps', type=float, default=0.5, help='perturbation budget (epsilon)')
160 |     parser.add_argument('--iters', type=int, default=50, help='number of iterations')
161 |     parser.add_argument('--n', type=int, default=16, help='number of neurons per image')
162 |     parser.add_argument('--bs', type=int, default=4, help='batch size while performing attack')
163 |     parser.add_argument('--custom_best', type=bool, default=True, help='look at absolute loss or perturbation for best-loss criteria')
164 |     parser.add_argument('--dataset', type=str, default='cifar10', help='dataset: one of [cifar10, imagenet]')
165 |     parser.add_argument('--norm', type=str, default='2', help='P-norm to limit budget of adversary')
166 |     parser.add_argument('--analysis', type=bool, default=False, help='report neuron-wise attack success rates?')
167 |     parser.add_argument('--delta_analysis', type=bool, default=False, help='report neuron-wise delta-achieve rates?')
168 |     parser.add_argument('--random_restarts', type=int, default=0, help='how many random restarts? (0 -> False)')
169 |     parser.add_argument('--analysis_start', type=int, default=0, help='index to start from (to capture n). used only when analysis flag is set')
170 | 
171 |     args = parser.parse_args()
172 |     for arg in vars(args):
173 |         print(arg, " : ", getattr(args, arg))
174 | 
175 |     model_arch = args.model_arch
176 |     model_type = args.model_type
177 |     batch_size = args.bs
178 |     iters = args.iters
179 |     eps = args.eps
180 |     n = args.n
181 |     norm = args.norm
182 |     custom_best = args.custom_best
183 |     fake_relu = (model_arch != 'vgg19')
184 |     analysis = args.analysis
185 |     delta_analysis = args.delta_analysis
186 |     analysis_start = args.analysis_start
187 |     random_restarts = args.random_restarts
188 | 
189 |     # Load dataset
190 |     if args.dataset == 'cifar10':
191 |         constants = utils.CIFAR10()
192 |     elif args.dataset == 'imagenet':
193 |         constants = utils.ImageNet1000()
194 |     else:
195 |         raise ValueError("Invalid Dataset Specified")
196 |     ds = constants.get_dataset()
197 | 
198 |     # Load model
199 |     model = constants.get_model(model_type, model_arch)
200 |     # Get pre-computed deltas and stats for neuron activations
201 |     senses = constants.get_deltas(model_type, model_arch)
202 |     (mean, std) = constants.get_stats(model_type, model_arch)
203 | 
204 |     _, test_loader = ds.make_loaders(batch_size=batch_size, workers=8, only_val=True, shuffle_val=False)
205 | 
206 |     index_base, avg_successes = 0, 0
207 |     attack_rates = [0, 0, 0, 0]
208 |     impostors_latents = []
209 |     all_impostors = []
210 |     neuron_wise_success = []
211 |     delta_wise_success = []
212 |     iterator = tqdm(test_loader)
213 |     for (image, _) in iterator:
214 |         picked_indices = list(range(index_base, index_base + len(image)))
215 |         (image_labels, succeeded, impostors_latent, delta_succeeded) = find_impostors(model, senses[:, picked_indices], ds,
216 |                                                             image.cpu(), mean, std, n=n, verbose=False,
217 |                                                             eps=eps, iters=iters, norm=norm,
218 |                                                             custom_best=custom_best, fake_relu=fake_relu,
219 |                                                             analysis_start=analysis_start, random_restarts=random_restarts,
220 |                                                             delta_analysis=delta_analysis)
221 | 
222 |         attack_rates[0] += np.sum(np.sum(succeeded[:, :1], axis=1) > 0)
223 |         attack_rates[1] += np.sum(np.sum(succeeded[:, :4], axis=1) > 0)
224 |         attack_rates[2] += np.sum(np.sum(succeeded[:, :8], axis=1) > 0)
225 |         num_flips = np.sum(succeeded, axis=1)
226 |         attack_rates[3] += np.sum(num_flips > 0)
227 |         avg_successes += np.sum(num_flips)
228 |         index_base += len(image)
229 |         # Keep track of attack success rate
230 |         iterator.set_description('(n=1,4,8,%d) Success rates : (%.2f, %.2f, %.2f, %.2f) | Flips/Image : %.2f/%d' \
231 |             % (n, 100 * attack_rates[0]/index_base,
232 |                100 * attack_rates[1]/index_base,
233 |                100 * attack_rates[2]/index_base,
234 |                100 * attack_rates[3]/index_base,
235 |                avg_successes / index_base, n))
236 |         # Keep track of neuron-wise attack success rate
237 |         if analysis:
238 |             neuron_wise_success.append(succeeded)
239 |         if delta_analysis:
240 |             delta_wise_success.append(delta_succeeded)
241 | 
242 |     if analysis:
243 |         neuron_wise_success = np.concatenate(neuron_wise_success, 0)
244 |         neuron_wise_success = np.mean(neuron_wise_success, 0)
245 |         for i in range(neuron_wise_success.shape[0]):
246 |             print("Neuron %d attack success rate : %f %%" % (i + analysis_start, 100 * neuron_wise_success[i]))
247 |         print()
248 | 
249 |     if delta_analysis:
250 |         delta_wise_success = np.concatenate(delta_wise_success, 0)
251 |         delta_wise_success = np.mean(delta_wise_success, 0)
252 |         for i in range(delta_wise_success.shape[0]):
253 |             print("Neuron %d achieving-delta success rate : %f %%" % (i + analysis_start, 100 * delta_wise_success[i]))
254 |         print()
255 | 
256 |     print("Attack success rate : %f %%" % (100 * attack_rates[-1]/index_base))
257 |     print("Average flips per image : %f/%d" % (avg_successes / index_base, n))
258 | 
--------------------------------------------------------------------------------
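
Taken together, the pre-computed $\Delta(i,x)$ values (`deltas.txt`) and feature statistics (`feature_mean.npy`, `feature_std.npy`) are standardized with `utils.scaled_values` before neurons are ranked by sensitivity, as in `delta_defense.py` and `optimal_impostor.py`. A minimal usage sketch; the model type and architecture below are only illustrative:

```python
import numpy as np
import utils

# Illustrative sketch of how the pre-computed statistics are consumed;
# the ('nat', 'vgg19') combination is just an example.
constants = utils.CIFAR10()
senses = constants.get_deltas('nat', 'vgg19')      # raw Delta(i, x), shape (n_features, n_images)
mean, std = constants.get_stats('nat', 'vgg19')    # feature-wise activation mean / std
scaled = utils.scaled_values(senses, mean, std)    # standardized sensitivity per (neuron, image)

# Ignore inf entries (neurons that cannot flip a given image) and average per neuron:
# a small mean |scaled delta| marks a highly sensitive neuron
# (similar in spirit to chuck_inf_means in delta_defense.py).
finite = np.where(np.isinf(scaled), np.nan, np.abs(scaled))
ranking = np.argsort(np.nanmean(finite, axis=1))
print("Most sensitive neurons:", ranking[:10])
```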