├── LICENSE ├── act_values.py ├── delta_defense.py ├── README.md ├── .gitignore ├── multiclass_deltas.py ├── utils.py ├── sensitivity_training.py └── optimal_impostor.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Anshuman Suri 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /act_values.py: -------------------------------------------------------------------------------- 1 | import torch as ch 2 | import utils 3 | import numpy as np 4 | from tqdm import tqdm 5 | 6 | 7 | if __name__ == "__main__": 8 | import sys 9 | 10 | model_arch = sys.argv[1] 11 | model_type = sys.argv[2] 12 | prefix = sys.argv[3] 13 | dataset = sys.argv[4] 14 | 15 | if dataset == 'cifar10': 16 | dx = utils.CIFAR10() 17 | elif dataset == 'imagenet': 18 | dx = utils.ImageNet1000() 19 | else: 20 | raise ValueError("Dataset not supported") 21 | 22 | ds = dx.get_dataset() 23 | model = dx.get_model(model_type, model_arch) 24 | 25 | batch_size = 128 26 | all_reps = [] 27 | train_loader = None 28 | if dataset == 'cifar10': 29 | train_loader, val_loader = ds.make_loaders(batch_size=batch_size, workers=8) 30 | else: 31 | _, val_loader = ds.make_loaders(batch_size=batch_size, workers=8, only_val=True) 32 | 33 | def get_reps(data_loader): 34 | for (im, label) in tqdm(data_loader): 35 | with ch.no_grad(): 36 | (_, rep), _ = model(im.cuda(), with_latent=True) 37 | all_reps.append(rep.cpu()) 38 | 39 | if train_loader: 40 | get_reps(train_loader) 41 | get_reps(val_loader) 42 | 43 | all_reps = ch.cat(all_reps) 44 | ch_mean = ch.mean(all_reps, dim=0) 45 | ch_std = ch.std(all_reps, dim=0) 46 | 47 | # Dump mean, std vectors for later use: 48 | np_mean = ch_mean.cpu().numpy() 49 | np_std = ch_std.cpu().numpy() 50 | np.save(prefix + "feature_mean", np_mean) 51 | np.save(prefix + "feature_std", np_std) 52 | -------------------------------------------------------------------------------- /delta_defense.py: -------------------------------------------------------------------------------- 1 | import torch as ch 2 | import numpy as np 3 | from robustness.model_utils import make_and_restore_model 4 | from robustness.tools.helpers import save_checkpoint 5 | import sys 6 | 7 | import utils 8 | 9 | 10 | def chuck_inf_means(senses): 11 | chucked = [] 12 | for i in range(senses.shape[0]): 13 | x = senses[i] 14 | chucked.append(np.mean(x[x != 
np.inf]))
15 |     return np.array(chucked)
16 | 
17 | 
18 | if __name__ == "__main__":
19 | 
20 |     model_arch = sys.argv[1]
21 |     model_type = sys.argv[2]
22 |     drop_mode = sys.argv[3]
23 |     num_drop = int(sys.argv[4])
24 |     model_path = sys.argv[5]
25 | 
26 |     if drop_mode not in ['random', 'most', 'least']:
27 |         raise ValueError("Method of selecting neurons to drop not supported")
28 | 
29 |     constants = utils.CIFAR10()
30 |     ds = constants.get_dataset()
31 |     model_kwargs = {
32 |         'arch': model_arch,
33 |         'dataset': ds,
34 |         'resume_path': model_type
35 |     }
36 | 
37 |     # Get delta values and feature statistics
38 |     senses = constants.get_deltas(model_type, model_arch)
39 |     (mean, std) = constants.get_stats(model_type, model_arch)
40 | 
41 |     # Load model
42 |     model, _ = make_and_restore_model(**model_kwargs)
43 |     model.eval()
44 | 
45 |     print("Dropping %d out of %d neurons" % (num_drop, senses.shape[0]))
46 | 
47 |     # Random neuron drop-out
48 |     if drop_mode == 'random':
49 |         print("Random drop-out!")
50 |         worst_n = np.random.permutation(senses.shape[0])[:num_drop]
51 |     else:
52 |         # 99.7% interval
53 |         threshold = mean + 3 * std
54 | 
55 |         # Only consider neurons with any hope of attacking (delta within some sensible range)
56 |         senses = utils.scaled_values(senses, mean, std)
57 |         senses = chuck_inf_means(senses)
58 | 
59 |         if drop_mode == 'most':
60 |             worst_n = np.argsort(np.abs(senses))[:num_drop]
61 |         else:
62 |             worst_n = np.argsort(-np.abs(senses))[:num_drop]
63 | 
64 |     # Zero out the selected neurons' columns in the final weights matrix
65 |     with ch.no_grad():
66 |         model.state_dict().get("module.model.classifier.weight")[:, worst_n] = 0
67 | 
68 |     # Save modified model
69 |     sd_info = {
70 |         'model': model.state_dict(),
71 |         'epoch': 1
72 |     }
73 |     save_checkpoint(sd_info, False, model_path)
74 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # One Neuron to Fool Them All
2 | 
3 | ## Prerequisites
4 | 
5 | - Install [this](https://github.com/iamgroot42/robustness) fork of the robustness package: `pip install -e robustness`
6 | - If you are going to run experiments for ImageNet, modify `IMAGENET_PATH` in `utils.py` accordingly
7 | - Download pretrained models and pre-computed statistics:
8 | `wget https://www.dropbox.com/s/rsxzw30fdmle2qu/data.tar.gz?dl=1 -O data.tar.gz`
9 | - Extract the files:
10 | `tar -xf data.tar.gz`
11 | 
12 | ## Pre-Computing Statistics (skip if you downloaded the file above)
13 | 
14 | ### Generating feature-wise statistics ($\mu$, $\sigma$)
15 | - Given any model and dataset, calculate the feature-wise mean and standard deviation of activations across the dataset. Statistics are computed over both training and validation data for CIFAR10, and over validation data only for ImageNet
16 | - `python act_values.py <model_arch> <model_type> <prefix_for_saving> <dataset>`
17 | 
18 | ### Generating $\Delta(i,x)$ values
19 | - Given a model (assumes non-negative features, which holds for all architectures in this codebase via ReLU) and a dataset, computes $\Delta(i,x)$ $\forall i,x$ and saves the values for later use (generating attack seeds)
20 | - `python multiclass_deltas.py `
21 | 
22 | ## Neuron-sensitivity Attack
23 | 
24 | ### Generating adversarial examples using sensitive neurons
25 | - Given a model and its corresponding feature statistics and $\Delta(i,x)$ values (computed above), find adversarial seeds within specific perturbation budgets
26 | - `python optimal_impostor.py`
27 | 
28 | ## Training for Sensitivity
29 | 
30 | ### Training using the proposed regularization term
31 | - Much faster than adversarial training
32 | - Logs $L_2$ PGD attack success rates on the validation set while training (to monitor robustness)
33 | - `python sensitivity_training.py --output_dir <path_to_save_model>`
34 | 
35 | ### Pruning neurons from a trained model, based on sensitivity
36 | - Given a trained model and dataset, use $\Delta(i,x)$ values to identify and prune weights that correspond to specific features
37 | - Can prune `random` (randomly sampled), `least` (zero out least sensitive first), or `most` (zero out most sensitive first) neurons
38 | - Prune `N` neurons (from the features layer)
39 | `python delta_defense.py <model_arch> <model_type> <random/most/least> <N> <path_to_save_model>`
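
For reference, $\Delta(i,x)$ is the smallest amount by which neuron $i$'s activation must increase to flip the model's prediction on input $x$. With a single linear classification layer on top of non-negative (post-ReLU) features, this can be computed in closed form from the layer's weights and the clean logits. The sketch below only illustrates that computation; the helper name and the VGG-style weight key are assumptions (see `utils.get_logits_layer_name`), and it is not the exact `multiclass_deltas.py` implementation:

```python
import torch as ch

def per_neuron_deltas(model, im, label, weight_key="module.model.classifier.weight"):
    # Illustrative sketch, not the repository's multiclass_deltas.py.
    # Delta(i, x): smallest increase to feature i that flips the prediction away
    # from `label` (the currently-predicted class); inf if no increase can flip it.
    with ch.no_grad():
        logits, _ = model(im.unsqueeze(0).cuda())
    logits = logits[0]
    w = model.state_dict()[weight_key]          # shape: (n_classes, n_features)
    n_classes, n_features = w.shape
    deltas = ch.full((n_features,), float('inf'))
    for i in range(n_features):
        for c in range(n_classes):
            if c == label:
                continue
            # Adding d to feature i changes (logit_c - logit_label) by (w[c, i] - w[label, i]) * d
            slope = (w[c, i] - w[label, i]).item()
            if slope > 0:
                gap = (logits[label] - logits[c]).item()
                deltas[i] = min(deltas[i].item(), gap / slope)
    return deltas
```

Per the repository's pipeline, these per-image values are written out as one comma-separated row per feature (see the `multiclass_deltas.py` fragment below) and later standardized with the saved $\mu$, $\sigma$ statistics.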
--------------------------------------------------------------------------------
/multiclass_deltas.py:
--------------------------------------------------------------------------------
67 |             best_delta = ch.argmin(ch.abs(valid_sensitivity))
68 |             best_sensitivity = valid_sensitivity[best_delta]
69 |             best_sensitivity = best_sensitivity.cpu().numpy()
70 |             sensitivities[i] = sensitivities.get(i, []) + [best_sensitivity]
71 | 
72 |     with open("%s.txt" % filename, 'w') as f:
73 |         for i in range(n_features):
74 |             floats_to_string = ",".join([str(x) for x in sensitivities[i]])
75 |             f.write(floats_to_string + "\n")
76 | 
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | import torch as ch
2 | import numpy as np
3 | from torchvision import transforms
4 | from robustness.model_utils import make_and_restore_model
5 | from robustness.datasets import GenericBinary, CIFAR, ImageNet
6 | from robustness.tools import folder
7 | from tqdm import tqdm
8 | import sys
9 | import os
10 | 
11 | 
12 | IMAGENET_PATH = ""
13 | 
14 | 
15 | class DataPaths:
16 |     def __init__(self, name, data_path, stats_path):
17 |         self.name = name
18 |         self.data_path = data_path
19 |         self.dataset = self.dataset_type(data_path)
20 |         self.models = {'nat': None, 'l1': None, 'l2': None, 'linf': None}
21 |         self.model_prefix = {}
22 |         self.stats_path = stats_path
23 | 
24 |     def get_dataset(self):
25 |         return self.dataset
26 | 
27 |     def get_model(self, m_type, arch='resnet50'):
28 |         model_path = self.models.get(m_type, None)
29 |         if not model_path:
30 |             model_path = m_type
31 |         else:
32 |             model_path = os.path.join(self.model_prefix[arch], self.models[m_type])
33 |         model_kwargs = {
34 |             'arch': arch,
35 |             'dataset': self.dataset,
36 |             'resume_path': model_path
37 |         }
38 |         model, _ = make_and_restore_model(**model_kwargs)
39 |         model.eval()
40 |         return model
41 | 
42 |     def get_stats(self, m_type, arch='resnet50'):
43 |         stats_path = os.path.join(self.stats_path, arch, m_type, "stats")
44 |         return get_stats(stats_path)
45 | 
46 |     def get_deltas(self, m_type, arch='resnet50'):
47 |         deltas_path = os.path.join(self.stats_path, arch, m_type, "deltas.txt")
48 |         return get_sensitivities(deltas_path)
49 | 
50 | 51 | class CIFAR10(DataPaths): 52 | def __init__(self): 53 | self.dataset_type = CIFAR 54 | super(CIFAR10, self).__init__('cifar10', "/tmp/cifar10", "./cifar10/stats") 55 | self.model_prefix['resnet50'] = "cifar10/models/resnet50/" 56 | self.model_prefix['densenet169'] = "cifar10/models/densenet169/" 57 | self.model_prefix['vgg19'] = "cifar10/models/vgg19/" 58 | self.models['nat'] = "cifar_nat.pt" 59 | self.models['sense'] = "cifar_sense.pt" 60 | self.models['linf'] = "cifar_linf_8.pt" 61 | self.models['l2'] = "cifar_l2_0_5.pt" 62 | 63 | 64 | class ImageNet1000(DataPaths): 65 | def __init__(self): 66 | self.dataset_type = ImageNet 67 | super(ImageNet1000, self).__init__('imagenet1000', 68 | IMAGENET_PATH, "imagenet/stats/") 69 | self.model_prefix['resnet50'] = "imagenet/models/resnet50/" 70 | self.models['nat'] = "imagenet_nat.pt" 71 | self.models['l2'] = "imagenet_l2_3_0.pt" 72 | self.models['linf'] = "imagenet_linf_4.pt" 73 | 74 | 75 | def scaled_values(val, mean, std, eps=1e-10): 76 | return (val - np.repeat(np.expand_dims(mean, 1), val.shape[1], axis=1)) / (np.expand_dims(std, 1) + eps) 77 | 78 | 79 | def load_all_data(ds): 80 | batch_size = 512 81 | _, test_loader = ds.make_loaders(batch_size=batch_size, workers=8, only_val=True, fixed_test_order=True) 82 | 83 | images, labels = [], [] 84 | for (image, label) in test_loader: 85 | images.append(image) 86 | labels.append(label) 87 | labels = ch.cat(labels).cpu() 88 | images = ch.cat(images).cpu() 89 | return (images, labels) 90 | 91 | 92 | def get_sensitivities(path): 93 | features = [] 94 | with open(path, 'r') as f: 95 | for line in tqdm(f): 96 | values = np.array([float(x) for x in line.rstrip('\n').split(',')]) 97 | features.append(values) 98 | return np.array(features) 99 | 100 | 101 | def best_target_image(mat, which=0): 102 | sum_m = [] 103 | for i in range(mat.shape[1]): 104 | mat_interest = mat[mat[:, i] != np.inf, i] 105 | sum_m.append(np.average(np.abs(mat_interest))) 106 | best = np.argsort(sum_m) 107 | return best[which] 108 | 109 | 110 | def get_statistics(diff): 111 | l1_norms = ch.sum(ch.abs(diff), dim=1) 112 | l2_norms = ch.norm(diff, dim=1) 113 | linf_norms = ch.max(ch.abs(diff), dim=1)[0] 114 | return (l1_norms, l2_norms, linf_norms) 115 | 116 | 117 | def get_stats(base_path): 118 | mean = np.load(os.path.join(base_path, "feature_mean.npy")) 119 | std = np.load(os.path.join(base_path, "feature_std.npy")) 120 | return mean, std 121 | 122 | 123 | def get_logits_layer_name(arch): 124 | if "vgg" in arch: 125 | return "module.model.classifier.weight" 126 | elif "resnet" in arch: 127 | return "module.model.fc.weight" 128 | elif "densenet" in arch: 129 | return "module.model.linear.weight" 130 | return None 131 | -------------------------------------------------------------------------------- /sensitivity_training.py: -------------------------------------------------------------------------------- 1 | import torch as ch 2 | import numpy as np 3 | from robustness.train import train_model 4 | from robustness.tools import helpers 5 | from robustness import defaults 6 | from robustness.defaults import check_and_fill_args 7 | from robustness.model_utils import make_and_restore_model 8 | from robustness.datasets import DATASETS 9 | import os 10 | from itertools import combinations 11 | import cox 12 | import utils 13 | import argparse 14 | 15 | 16 | def regularization_term(model, inp, targets, top_k, delta_1, delta_2, train_criterion, adv, attack_kwargs): 17 | (logits, features), final_inp = model(inp, target=targets, 
make_adv=adv, with_latent=True, **attack_kwargs) 18 | w = model.module.model.classifier.weight 19 | 20 | # Calculate normal classification loss 21 | loss = train_criterion(logits, targets) 22 | 23 | # First term : minimize weight values for same feature across any two different classes (nC2) 24 | diffs = [] 25 | for c in combinations(range(logits.shape[1]), 2): 26 | # Across all possible (i, j) class pairs 27 | diff = w[c, :] 28 | # Note differences in weight values for same feature, different classes 29 | topk_diff, _ = ch.topk(ch.abs(diff[0] - diff[1]), top_k) 30 | diffs.append(ch.mean(topk_diff)) 31 | first_term = ch.max(ch.stack(diffs, dim=0)) 32 | 33 | diffs_2 = [] 34 | features_norm = ch.sum(features, dim=1).unsqueeze(1) 35 | diff_2_1 = ch.stack([w[y, :] for y in targets], dim=0) 36 | # Iterate over classes 37 | for i in range(logits.shape[1]): 38 | diff_2_2 = w[i, :].unsqueeze(0) 39 | normalized_drop_term = ch.abs(features * (diff_2_1 - diff_2_2) / features_norm) 40 | use_these, _ = ch.topk(normalized_drop_term, top_k, dim=1) 41 | use_these = ch.mean(use_these, dim=1) 42 | diffs_2.append(use_these) 43 | second_term = ch.mean(ch.stack(diffs_2, dim=0), dim=0) 44 | second_term = ch.mean(second_term) 45 | 46 | return ((logits, features), final_inp, loss, delta_1 * first_term + delta_2 * second_term) 47 | 48 | 49 | if __name__ == "__main__": 50 | parser = argparse.ArgumentParser() 51 | parser.add_argument('--top_k', type=int, default=16, help='top-k (neurons) considered while calculating loss terms') 52 | parser.add_argument('--start_lr', type=float, default=1e-2, help='starting LR for optimizer') 53 | parser.add_argument('--delta_1', type=float, default=1e1, help='loss coefficient for first term') 54 | parser.add_argument('--delta_2', type=float, default=1e2, help='loss coefficient for second term') 55 | parser.add_argument('--batch_size', type=int, default=128, help='Batch Size') 56 | parser.add_argument('--output_dir', type=str, default='', help='path where model is to be saved') 57 | 58 | parsed_args = parser.parse_args() 59 | for arg in vars(parsed_args): 60 | print(arg, " : ", getattr(parsed_args, arg)) 61 | 62 | def regularizer(model, inp, targets, train_criterion, adv, attack_kwargs): 63 | return regularization_term(model, inp, targets, parsed_args.top_k, parsed_args.delta_1, 64 | parsed_args.delta_2, train_criterion, adv, attack_kwargs) 65 | 66 | if not os.path.exists(parsed_args.output_dir): 67 | raise ValueError("Please provide valid save dir for model") 68 | 69 | train_kwargs = { 70 | 'out_dir': parsed_args.output_dir, 71 | 'adv_train': 0, 72 | 'exp_name': 'sensitivity_training', 73 | 'dataset': 'cifar', 74 | 'arch': 'vgg19', 75 | 'adv_eval': True, 76 | 'batch_size': parsed_args.batch_size, 77 | # Validation-evaluation using PGD-L2 attack (to track L2 PGD perturbation robustness) 78 | 'attack_lr': (2.5 * 0.5) / 10, 79 | 'constraint': '2', 80 | 'eps': 0.5, 81 | 'attack_steps': 20, 82 | 'use_best': True, 83 | 'eps_fadein_epochs': 0, 84 | 'random_restarts': 0, 85 | 'lr': parsed_args.start_lr, 86 | 'use_adv_eval_criteria': 1, 87 | 'regularizer': regularizer, 88 | 'let_reg_handle_loss': True 89 | } 90 | 91 | ds_class = DATASETS[train_kwargs['dataset']] 92 | 93 | train_args = cox.utils.Parameters(train_kwargs) 94 | 95 | dx = utils.CIFAR10() 96 | dataset = dx.get_dataset() 97 | 98 | args = check_and_fill_args(train_args, defaults.TRAINING_ARGS, ds_class) 99 | args = check_and_fill_args(train_args, defaults.MODEL_LOADER_ARGS, ds_class) 100 | 101 | model, _ = 
make_and_restore_model(arch='vgg19', dataset=dataset) 102 | 103 | # Make the data loaders 104 | train_loader, val_loader = dataset.make_loaders(args.workers, args.batch_size, data_aug=bool(args.data_aug)) 105 | 106 | # Prefetches data to improve performance 107 | train_loader = helpers.DataPrefetcher(train_loader) 108 | val_loader = helpers.DataPrefetcher(val_loader) 109 | 110 | store = cox.store.Store(args.out_dir, args.exp_name) 111 | args_dict = args.as_dict() if isinstance(args, cox.utils.Parameters) else vars(args) 112 | schema = cox.store.schema_from_dict(args_dict) 113 | store.add_table('metadata', schema) 114 | store['metadata'].append_row(args_dict) 115 | 116 | model = train_model(args, model, (train_loader, val_loader), store=store) 117 | -------------------------------------------------------------------------------- /optimal_impostor.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch as ch 3 | from robustness.model_utils import make_and_restore_model 4 | from robustness.tools.vis_tools import show_image_row 5 | import numpy as np 6 | import sys 7 | from tqdm import tqdm 8 | from torch.autograd import Variable 9 | 10 | import utils 11 | 12 | 13 | def pgd_optimization(model, inp_og, target_rep, indices_mask, eps, random_restart_targets, iters=100, 14 | reg_weight=1e0, p='2', verbose=True, custom_best=False, fake_relu=True, random_restarts=0): 15 | # Modified inversion loss that puts emphasis on non-matching neurons to have similar activations 16 | def custom_inversion_loss(m, inp, targ): 17 | output, rep = m(inp, with_latent=True, fake_relu=fake_relu) 18 | # Normalized L2 error w.r.t. the target representation 19 | loss = ch.div(ch.norm(rep - targ, dim=1), ch.norm(targ, dim=1)) 20 | # Extra loss term (normalized) 21 | aux_loss = ch.sum(ch.abs((rep - targ) * indices_mask), dim=1) 22 | aux_loss = ch.div(aux_loss, ch.norm(targ * indices_mask, dim=1)) 23 | # Lagrangian formulation: 24 | return loss + reg_weight * aux_loss, output 25 | 26 | if custom_best: 27 | # If True, use the 'only neuron i' based 'best' evaluation 28 | if custom_best is True: 29 | def custom_loss_fn(loss, x): 30 | # Check how much beyond minimum delta the perturbation on i^th index is 31 | # Negative sign, since we want higher delta-diff to score better 32 | (_, rep), _ = model(x, with_latent=True, fake_relu=fake_relu) 33 | return - ch.sum((rep - target_rep) * indices_mask, dim=1) 34 | custom_best = custom_loss_fn 35 | # Else, expect custom_best function to be passed along 36 | else: 37 | # If nothing passed along, use simple comparison 38 | custom_best = None 39 | 40 | 41 | kwargs = { 42 | 'custom_loss': custom_inversion_loss, 43 | 'constraint': p, 44 | 'eps': eps, 45 | 'step_size': 2.5 * eps / iters, 46 | 'iterations': iters, 47 | 'targeted': True, 48 | 'do_tqdm': verbose, 49 | 'custom_best': custom_best, 50 | 'random_restarts': random_restarts, 51 | 'random_restart_targets': random_restart_targets 52 | } 53 | _, im_matched = model(inp_og, target_rep, make_adv=True, **kwargs) 54 | return im_matched 55 | 56 | 57 | def find_impostors(model, delta_values, ds, images, mean, std, 58 | verbose=True, n=4, eps=2.0, iters=200, 59 | norm='2', custom_best=False, fake_relu=True, 60 | analysis_start=0, random_restarts=0, delta_analysis=False): 61 | image_ = [] 62 | # Get target images 63 | for image in images: 64 | targ_img = image.unsqueeze(0) 65 | real = targ_img.repeat(n, 1, 1, 1) 66 | image_.append(real) 67 | real = ch.cat(image_, 0).cuda() 68 | 69 | # Get 
scaled senses 70 | scaled_delta_values = utils.scaled_values(delta_values, mean, std, eps=0) 71 | # Replace inf values with largest non-inf values 72 | delta_values[delta_values == np.inf] = delta_values[delta_values != np.inf].max() 73 | 74 | # Pick easiest-to-attack neurons per image 75 | easiest = np.argsort(scaled_delta_values, axis=0) 76 | 77 | # Get feature representation of current image 78 | with ch.no_grad(): 79 | (_, image_rep), _ = model(real, with_latent=True) 80 | 81 | # Construct delta vector and indices mask 82 | delta_vec = ch.zeros_like(image_rep) 83 | indices_mask = ch.zeros_like(image_rep) 84 | for j in range(len(images)): 85 | for i, x in enumerate(easiest[analysis_start : analysis_start + n, j]): 86 | delta_vec[i + j * n, x] = delta_values[x, j] 87 | indices_mask[i + j * n, x] = 1 88 | 89 | impostors = parallel_impostor(model, delta_vec, real, indices_mask, verbose, 90 | eps, iters, norm, custom_best, fake_relu, random_restarts) 91 | 92 | with ch.no_grad(): 93 | if delta_analysis: 94 | (pred, latent), _ = model(impostors, with_latent=True) 95 | else: 96 | pred, _ = model(impostors) 97 | latent = None 98 | label_pred = ch.argmax(pred, dim=1) 99 | 100 | clean_pred, _ = model(real) 101 | clean_pred = ch.argmax(clean_pred, dim=1) 102 | 103 | clean_preds = clean_pred.cpu().numpy() 104 | preds = label_pred.cpu().numpy() 105 | 106 | succeeded = [[] for _ in range(len(images))] 107 | if delta_analysis: 108 | delta_succeeded = [[] for _ in range(len(images))] 109 | for i in range(len(images)): 110 | for j in range(n): 111 | succeeded[i].append(preds[i * n + j] != clean_preds[i * n + j]) 112 | if delta_analysis: 113 | analysis_index = easiest[analysis_start : analysis_start + n, i][j] 114 | success_criterion = (latent[i * n + j] >= (image_rep[i * n + j] + delta_vec[i * n + j])) 115 | delta_succeeded[i].append(success_criterion[analysis_index].cpu().item()) 116 | succeeded = np.array(succeeded) 117 | if delta_analysis: 118 | delta_succeeded = np.array(delta_succeeded, 'float') 119 | image_labels = [clean_preds, preds] 120 | 121 | if not delta_analysis: 122 | delta_succeeded = None 123 | 124 | return (image_labels, succeeded, None, delta_succeeded) 125 | 126 | 127 | def parallel_impostor(model, delta_vec, im, indices_mask, verbose, eps, 128 | iters, norm, custom_best, fake_relu, random_restarts): 129 | # Get feature representation of current image 130 | with ch.no_grad(): 131 | (target_logits, image_rep), _ = model(im, with_latent=True, fake_relu=fake_relu) 132 | target_logits = ch.argmax(target_logits, dim=1) 133 | 134 | # Get target feature rep 135 | target_rep = image_rep + delta_vec 136 | 137 | # Override custom_best, use cross-entropy on model instead 138 | criterion = ch.nn.CrossEntropyLoss(reduction='none').cuda() 139 | def ce_loss(loss, x): 140 | output, _ = model(x, fake_relu=fake_relu) 141 | # We want CE loss b/w new and old to be as high as possible 142 | return -criterion(output, target_logits) 143 | # Use CE loss 144 | if custom_best: custom_best = ce_loss 145 | 146 | im_matched = pgd_optimization(model, im, target_rep, indices_mask, 147 | random_restart_targets=target_logits, eps=eps, iters=iters, verbose=verbose, 148 | p=norm, reg_weight=1e1, custom_best=custom_best, fake_relu=fake_relu, 149 | random_restarts=random_restarts) 150 | 151 | return im_matched 152 | 153 | 154 | if __name__ == "__main__": 155 | import argparse 156 | parser = argparse.ArgumentParser() 157 | parser.add_argument('--model_arch', type=str, default='vgg19', help='arch of model 
(resnet50/vgg19/densenet169)')
158 |     parser.add_argument('--model_type', type=str, default='nat', help='type of model (nat/l2/linf)')
159 |     parser.add_argument('--eps', type=float, default=0.5, help='perturbation budget (epsilon)')
160 |     parser.add_argument('--iters', type=int, default=50, help='number of iterations')
161 |     parser.add_argument('--n', type=int, default=16, help='number of neurons per image')
162 |     parser.add_argument('--bs', type=int, default=4, help='batch size while performing attack')
163 |     parser.add_argument('--custom_best', type=bool, default=True, help='look at absolute loss or perturbation for best-loss criteria')
164 |     parser.add_argument('--dataset', type=str, default='cifar10', help='dataset: one of [cifar10, imagenet]')
165 |     parser.add_argument('--norm', type=str, default='2', help='P-norm to limit budget of adversary')
166 |     parser.add_argument('--analysis', type=bool, default=False, help='report neuron-wise attack success rates?')
167 |     parser.add_argument('--delta_analysis', type=bool, default=False, help='report neuron-wise delta-achieve rates?')
168 |     parser.add_argument('--random_restarts', type=int, default=0, help='how many random restarts? (0 -> False)')
169 |     parser.add_argument('--analysis_start', type=int, default=0, help='index to start from (to capture n). used only when analysis flag is set')
170 | 
171 |     args = parser.parse_args()
172 |     for arg in vars(args):
173 |         print(arg, " : ", getattr(args, arg))
174 | 
175 |     model_arch = args.model_arch
176 |     model_type = args.model_type
177 |     batch_size = args.bs
178 |     iters = args.iters
179 |     eps = args.eps
180 |     n = args.n
181 |     norm = args.norm
182 |     custom_best = args.custom_best
183 |     fake_relu = (model_arch != 'vgg19')
184 |     analysis = args.analysis
185 |     delta_analysis = args.delta_analysis
186 |     analysis_start = args.analysis_start
187 |     random_restarts = args.random_restarts
188 | 
189 |     # Load dataset
190 |     if args.dataset == 'cifar10':
191 |         constants = utils.CIFAR10()
192 |     elif args.dataset == 'imagenet':
193 |         constants = utils.ImageNet1000()
194 |     else:
195 |         raise ValueError("Invalid Dataset Specified")
196 |     ds = constants.get_dataset()
197 | 
198 |     # Load model
199 |     model = constants.get_model(model_type, model_arch)
200 |     # Get pre-computed deltas and stats for neuron activations
201 |     senses = constants.get_deltas(model_type, model_arch)
202 |     (mean, std) = constants.get_stats(model_type, model_arch)
203 | 
204 |     _, test_loader = ds.make_loaders(batch_size=batch_size, workers=8, only_val=True, shuffle_val=False)
205 | 
206 |     index_base, avg_successes = 0, 0
207 |     attack_rates = [0, 0, 0, 0]
208 |     impostors_latents = []
209 |     all_impostors = []
210 |     neuron_wise_success = []
211 |     delta_wise_success = []
212 |     iterator = tqdm(test_loader)
213 |     for (image, _) in iterator:
214 |         picked_indices = list(range(index_base, index_base + len(image)))
215 |         (image_labels, succeeded, impostors_latent, delta_succeeded) = find_impostors(model, senses[:, picked_indices], ds,
216 |                                                             image.cpu(), mean, std, n=n, verbose=False,
217 |                                                             eps=eps, iters=iters, norm=norm,
218 |                                                             custom_best=custom_best, fake_relu=fake_relu,
219 |                                                             analysis_start=analysis_start, random_restarts=random_restarts,
220 |                                                             delta_analysis=delta_analysis)
221 | 
222 |         attack_rates[0] += np.sum(np.sum(succeeded[:, :1], axis=1) > 0)
223 |         attack_rates[1] += np.sum(np.sum(succeeded[:, :4], axis=1) > 0)
224 |         attack_rates[2] += np.sum(np.sum(succeeded[:, :8], axis=1) > 0)
225 |         num_flips = np.sum(succeeded, axis=1)
226 |         attack_rates[3] += np.sum(num_flips > 0)
227 |         avg_successes += np.sum(num_flips)
228 |         index_base += len(image)
229 |         # Keep track of attack success rate
230 |         iterator.set_description('(n=1,4,8,%d) Success rates : (%.2f, %.2f, %.2f, %.2f) | Flips/Image : %.2f/%d' \
231 |             % (n, 100 * attack_rates[0]/index_base,
232 |                100 * attack_rates[1]/index_base,
233 |                100 * attack_rates[2]/index_base,
234 |                100 * attack_rates[3]/index_base,
235 |                avg_successes / index_base, n))
236 |         # Keep track of neuron-wise attack success rate
237 |         if analysis:
238 |             neuron_wise_success.append(succeeded)
239 |         if delta_analysis:
240 |             delta_wise_success.append(delta_succeeded)
241 | 
242 |     if analysis:
243 |         neuron_wise_success = np.concatenate(neuron_wise_success, 0)
244 |         neuron_wise_success = np.mean(neuron_wise_success, 0)
245 |         for i in range(neuron_wise_success.shape[0]):
246 |             print("Neuron %d attack success rate : %f %%" % (i + analysis_start, 100 * neuron_wise_success[i]))
247 |         print()
248 | 
249 |     if delta_analysis:
250 |         delta_wise_success = np.concatenate(delta_wise_success, 0)
251 |         delta_wise_success = np.mean(delta_wise_success, 0)
252 |         for i in range(delta_wise_success.shape[0]):
253 |             print("Neuron %d achieving-delta success rate : %f %%" % (i + analysis_start, 100 * delta_wise_success[i]))
254 |         print()
255 | 
256 |     print("Attack success rate : %f %%" % (100 * attack_rates[-1]/index_base))
257 |     print("Average flips per image : %f/%d" % (avg_successes / index_base, n))
258 | 
--------------------------------------------------------------------------------
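
Taken together, the pre-computed $\Delta(i,x)$ values (`deltas.txt`) and feature statistics (`feature_mean.npy`, `feature_std.npy`) are standardized with `utils.scaled_values` before neurons are ranked by sensitivity, as in `delta_defense.py` and `optimal_impostor.py`. A minimal usage sketch; the model type and architecture below are only illustrative:

```python
import numpy as np
import utils

# Illustrative sketch of how the pre-computed statistics are consumed;
# the ('nat', 'vgg19') combination is just an example.
constants = utils.CIFAR10()
senses = constants.get_deltas('nat', 'vgg19')      # raw Delta(i, x), shape (n_features, n_images)
mean, std = constants.get_stats('nat', 'vgg19')    # feature-wise activation mean / std
scaled = utils.scaled_values(senses, mean, std)    # standardized sensitivity per (neuron, image)

# Ignore inf entries (neurons that cannot flip a given image) and average per neuron:
# a small mean |scaled delta| marks a highly sensitive neuron
# (similar in spirit to chuck_inf_means in delta_defense.py).
finite = np.where(np.isinf(scaled), np.nan, np.abs(scaled))
ranking = np.argsort(np.nanmean(finite, axis=1))
print("Most sensitive neurons:", ranking[:10])
```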