├── result.png
├── use_case
│   ├── Noisy Label, Watermarking
│   │   ├── models
│   │   │   ├── __init__.py
│   │   │   └── resnet.py
│   │   ├── pytorch_fitmodule
│   │   │   ├── __init__.py
│   │   │   ├── utils.py
│   │   │   └── fit_module.py
│   │   ├── PlotRuntime.py
│   │   ├── Label.py
│   │   ├── Watermark.py
│   │   ├── PlotAccuracy.py
│   │   ├── PlotLabel.py
│   │   ├── Poisoning.py
│   │   ├── runtime.py
│   │   ├── PlotPoisoning.py
│   │   ├── shap_utils.py
│   │   └── Shapley.py
│   └── DataAcquisition
│       ├── dknn.py
│       ├── uci_knn.py
│       ├── shap_utils.py
│       ├── Shapley.py
│       └── utils.py
├── .gitignore
├── README.md
├── exact_sp.py
├── exact_sp_example.py
├── LSH_sp_example.py
├── LSH_sp.py
└── reproduction
    ├── Cifar10
    │   └── accuracy.ipynb
    ├── YFCC100M
    │   └── testlsh.ipynb
    └── ImageNet
        └── accuracy.ipynb

/result.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI-secure/KNN-PVLDB/HEAD/result.png
--------------------------------------------------------------------------------
/use_case/Noisy Label, Watermarking/models/__init__.py:
--------------------------------------------------------------------------------
1 | from .resnet import *
2 | 
--------------------------------------------------------------------------------
/use_case/Noisy Label, Watermarking/pytorch_fitmodule/__init__.py:
--------------------------------------------------------------------------------
1 | from .fit_module import FitModule
2 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Created by .ignore support plugin (hsz.mobi)
2 | ### Example user template template
3 | ### Example user template
4 | 
5 | # IntelliJ project files
6 | .idea
7 | *.iml
8 | out
9 | gen
10 | __pycache__
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Data Valuation
2 | 
3 | This repo is the official code base for the PVLDB paper "Efficient task-specific data valuation for nearest neighbor algorithms".
4 | 
5 | -----
6 | 
7 | It contains scripts to calculate the exact Shapley value (in `exact_sp.py`) and the LSH-based approximate Shapley value (in `LSH_sp.py`) for KNN classifiers.
8 | 
9 | We also provide two examples of how to calculate the exact Shapley value (in `exact_sp_example.py`) and the approximate Shapley value (in `LSH_sp_example.py`) on the Cifar-10 dataset.
10 | 
11 | In the reproduction folder, we provide our Jupyter notebook scripts for three datasets (Cifar-10, ImageNet, and YFCC100M), which record our experiment results, to help reproduce our experiments.
12 | 
13 | For example:
14 | ![result](result.png)
15 | 
16 | If you have any questions about our code, please do not hesitate to ask in the issues. Thanks!
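
As a quick sketch of how the exact routine is used (on toy random features rather than the real Cifar-10 features; `exact_sp_example.py` below is the full example), it is just two calls: rank the training points by distance to each test point, then apply the closed-form KNN-Shapley recursion from `exact_sp.py`:

```python
# Minimal sketch with toy data; exact_sp_example.py is the full Cifar-10 example.
import numpy as np
from exact_sp import get_true_KNN, compute_single_unweighted_knn_class_shapley

rng = np.random.RandomState(0)
x_trn, y_trn = rng.rand(500, 32), rng.randint(0, 2, 500)  # toy training set
x_tst, y_tst = rng.rand(20, 32), rng.randint(0, 2, 20)    # toy test points

# For each test point, sort all training points by Euclidean distance.
x_tst_knn_gt = get_true_KNN(x_trn, x_tst)
# Recursion (farthest to nearest): sp[alpha_N] = 1[y_{alpha_N} = y_tst] / N, then
# sp[alpha_i] = sp[alpha_{i+1}] + (1[y_{alpha_i} = y_tst] - 1[y_{alpha_{i+1}} = y_tst]) / K * min(K, i+1) / (i+1)
sp_gt = compute_single_unweighted_knn_class_shapley(x_trn, y_trn, x_tst_knn_gt, y_tst, K=1)
print(sp_gt.shape)  # (n_test, n_train): one Shapley value per training point per test point
```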
17 | 18 | -------------------------------------------------------------------------------- /exact_sp.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from tqdm import tqdm 3 | 4 | 5 | def get_true_KNN(x_trn, x_tst): 6 | N = x_trn.shape[0] 7 | N_tst = x_tst.shape[0] 8 | x_tst_knn_gt = np.zeros((N_tst, N)) 9 | for i_tst in tqdm(range(N_tst)): 10 | dist_gt = np.zeros(N) 11 | for i_trn in range(N): 12 | dist_gt[i_trn] = np.linalg.norm(x_trn[i_trn, :] - x_tst[i_tst, :], 2) 13 | x_tst_knn_gt[i_tst, :] = np.argsort(dist_gt) 14 | return x_tst_knn_gt.astype(int) 15 | 16 | 17 | def compute_single_unweighted_knn_class_shapley(x_trn, y_trn, x_tst_knn_gt, y_tst, K): 18 | N = x_trn.shape[0] 19 | N_tst = x_tst_knn_gt.shape[0] 20 | sp_gt = np.zeros((N_tst, N)) 21 | for j in tqdm(range(N_tst)): 22 | sp_gt[j, x_tst_knn_gt[j, -1]] = (y_trn[x_tst_knn_gt[j, -1]] == y_tst[j]) / N 23 | for i in np.arange(N - 2, -1, -1): 24 | sp_gt[j, x_tst_knn_gt[j, i]] = sp_gt[j, x_tst_knn_gt[j, i + 1]] + \ 25 | (int(y_trn[x_tst_knn_gt[j, i]] == y_tst[j]) - 26 | int(y_trn[x_tst_knn_gt[j, i + 1]] == y_tst[j])) / K * min([K, i + 1]) / ( 27 | i + 1) 28 | return sp_gt 29 | -------------------------------------------------------------------------------- /exact_sp_example.py: -------------------------------------------------------------------------------- 1 | import time 2 | import numpy as np 3 | from sklearn.utils import shuffle 4 | from exact_sp import get_true_KNN, compute_single_unweighted_knn_class_shapley 5 | 6 | data = np.load('CIFAR10_resnet50-keras_features.npz') 7 | x_trn = np.vstack((data['features_training'], data['features_testing'])) 8 | y_trn = np.hstack((data['labels_training'], data['labels_testing'])) 9 | 10 | x_trn, y_trn = shuffle(x_trn, y_trn, random_state=0) 11 | 12 | x_trn = np.reshape(x_trn, (-1, 2048)) 13 | x_tst, y_tst = x_trn[:100], y_trn[:100] 14 | x_val, y_val = x_trn[100:1100], y_trn[100:1100] 15 | x_trn, y_trn = x_trn[1100:], y_trn[1100:] 16 | 17 | # we are using 1-nn classifier 18 | K = 1 19 | 20 | start = time.time() 21 | x_tst_knn_gt = get_true_KNN(x_trn, x_tst) 22 | end1 = time.time() - start 23 | print(end1) 24 | 25 | start = time.time() 26 | x_val_knn_gt = get_true_KNN(x_trn, x_val) 27 | val_end1 = time.time() - start 28 | print(val_end1) 29 | 30 | start = time.time() 31 | sp_gt = compute_single_unweighted_knn_class_shapley(x_trn, y_trn, x_tst_knn_gt, y_tst, K) 32 | end2 = time.time() - start 33 | 34 | start = time.time() 35 | val_sp_gt = compute_single_unweighted_knn_class_shapley(x_trn, y_trn, x_val_knn_gt, y_val, K) 36 | val_end2 = time.time() - start 37 | 38 | print(end2) 39 | print(val_end2) 40 | 41 | print("time to get exact sp values for test set:", (end1 + end2) / len(x_tst)) 42 | print("time to get exact sp values for val set:", (val_end1 + val_end2) / len(x_val)) 43 | 44 | np.save('tst_exact_sp_gt', sp_gt) 45 | np.save('val_exact_sp_gt', val_sp_gt) 46 | -------------------------------------------------------------------------------- /use_case/Noisy Label, Watermarking/PlotRuntime.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import os 5 | import seaborn as sns 6 | 7 | sns.set() 8 | 9 | x = np.array([10, 100, 200, 400, 800, 1000, 5000, 10000, 20000, 50000]) 10 | # knn = np.array([0.0003832538922627767, 0.004426650206247966, 0.01631486415863037, 0.06262378295262655, 0.25503607193628947, 0.3868168075879415, 
6.302051556110382, 25.532700236638387, 102.27655944824218]) * 60 11 | # loo = np.array([0.02149387200673421, 0.5834330042203267, 2.0351594130198163, 6.966519888242086, 24.5041117866834, 37.45188350280126]) * 60 12 | # tmc = np.array([0.7461043953895569, 144.0786436120669]) * 60 13 | # g = np.array([0.5796960711479187, 3.785581676165263, 9.595915234088897, 14.533872322241466, 46.74548430840174, 57.338612226645154]) * 60 14 | 15 | 16 | knn = np.array([0.0769142468770345, 0.677141539255778, 1.653036856651306, 3.4390464584032694, 8.59050339460373, 12.708731484413146]) * 60 17 | loo = np.array([0.7347790956497192, 66.44814310471217]) * 60 18 | tmc = np.array([11.529986302057901]) * 60 19 | g = np.array([0.12539432843526205, 0.9315359711647033, 3.903498136997223, 9.672818299134573, 50.83118432760239,150.22751605113348]) * 60 20 | 21 | 22 | plt.loglog(x[0:loo.shape[0]], loo, '^-', color = 'olive', label = "Leave-One-Out") 23 | plt.loglog(x[0:tmc.shape[0]], tmc, 's-', color = 'blue', label = "TMC-Shapley") 24 | plt.loglog(x[0:knn.shape[0]], knn, 'o-', color='purple', label = 'KNN-Shapley') 25 | plt.loglog(x[0:g.shape[0]], g, 's-', color = 'orange', label = "G-Shapley") 26 | 27 | plt.xlabel('Number of training data points in log scale') 28 | plt.ylabel('Running time in log scale (s)') 29 | plt.legend(loc='lower right') 30 | plt.show() -------------------------------------------------------------------------------- /use_case/Noisy Label, Watermarking/pytorch_fitmodule/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys 3 | import torch 4 | 5 | from functools import partial 6 | from torch.utils.data import DataLoader, TensorDataset 7 | 8 | 9 | ##### Data utils ##### 10 | 11 | def get_loader(X, y=None, batch_size=1, shuffle=False): 12 | """Convert X and y Tensors to a DataLoader 13 | 14 | If y is None, use a dummy Tensor 15 | """ 16 | if y is None: 17 | y = torch.Tensor(X.size()[0]) 18 | return DataLoader(TensorDataset(X, y), batch_size, shuffle) 19 | 20 | 21 | ##### Logging ##### 22 | 23 | def add_metrics_to_log(log, metrics, y_true, y_pred, prefix=''): 24 | for metric in metrics: 25 | q = metric(y_true, y_pred) 26 | log[prefix + metric.__name__] = q 27 | return log 28 | 29 | 30 | def log_to_message(log, precision=4): 31 | fmt = "{0}: {1:." 
+ str(precision) + "f}" 32 | return " ".join(fmt.format(k, v) for k, v in log.items()) 33 | 34 | 35 | class ProgressBar(object): 36 | """Cheers @ajratner""" 37 | 38 | def __init__(self, n, length=40): 39 | # Protect against division by zero 40 | self.n = max(1, n) 41 | self.nf = float(n) 42 | self.length = length 43 | # Precalculate the i values that should trigger a write operation 44 | self.ticks = set([round(i/100.0 * n) for i in range(101)]) 45 | self.ticks.add(n-1) 46 | self.bar(0) 47 | 48 | def bar(self, i, message=""): 49 | """Assumes i ranges through [0, n-1]""" 50 | if i in self.ticks: 51 | b = int(np.ceil(((i+1) / self.nf) * self.length)) 52 | sys.stdout.write("\r[{0}{1}] {2}%\t{3}".format( 53 | "="*b, " "*(self.length-b), int(100*((i+1) / self.nf)), message 54 | )) 55 | sys.stdout.flush() 56 | 57 | def close(self, message=""): 58 | # Move the bar to 100% before closing 59 | self.bar(self.n-1) 60 | sys.stdout.write("{0}\n\n".format(message)) 61 | sys.stdout.flush() 62 | -------------------------------------------------------------------------------- /use_case/DataAcquisition/dknn.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import time 4 | import numpy as np 5 | import sklearn 6 | from utils import * 7 | from Dknn import * 8 | from plot import * 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | import torch.optim as optim 13 | from tqdm import tqdm, tqdm_notebook 14 | 15 | batch_size = 32 16 | data = MNIST(one_hot=False) 17 | device = torch.device('cuda') 18 | 19 | #cnn = CNN().to(device) 20 | #optimizer = optim.Adam(cnn.parameters()) 21 | #criterion = nn.CrossEntropyLoss() 22 | print('---1. load data---') 23 | x_train = torch.from_numpy(data.x_train).view(-1, 28, 28).unsqueeze(1).unsqueeze(1) 24 | y_train = torch.from_numpy(data.y_train).view(-1,1).long() 25 | 26 | x_test = torch.from_numpy(data.x_test).view(-1, 28, 28).unsqueeze(1).unsqueeze(1) 27 | y_test = torch.from_numpy(data.y_test).view(-1,1).long() 28 | 29 | #train(cnn, device, x_train, y_train, optimizer, criterion, 1, len(data.x_train) // 5) 30 | 31 | #accuracy, avg_loss = evaluate(cnn, device, x_train, y_train, criterion) 32 | #print(f'[Train] Accuracy: {100 * accuracy:5.2f}%, loss: {avg_loss:7.4f}') 33 | #accuracy, avg_loss = evaluate(cnn, device, x_test, y_test, criterion) 34 | #print(f'[Test] Accuracy: {100 * accuracy:5.2f}%, loss: {avg_loss:7.4f}') 35 | print('---2. build cnn model and calculate deep features---') 36 | deep_feats = [] 37 | targets = [] 38 | 39 | cnn = CNN().to(device) 40 | optimizer = optim.Adam(cnn.parameters()) 41 | criterion = nn.CrossEntropyLoss() 42 | 43 | for i, (X, y) in tqdm_notebook(enumerate(zip(x_train, y_train)), total = len(x_train)): 44 | X = X.to(device) 45 | deep_feat, y_pre = cnn(X) 46 | deep_feats.append(deep_feat.view(deep_feat.size(0), -1).cpu().detach().numpy()) 47 | targets.append(y.numpy()) 48 | deep_feats = np.concatenate(deep_feats) # deep features are not normalized 49 | targets = np.concatenate(targets) 50 | print(deep_feats[:2]) 51 | print(deep_feats.shape, targets.shape) 52 | 53 | print('---3. 
calculate knn shapley---') 54 | train_size = 1000 55 | k = 4 56 | knn_values = [[] for _ in range(k)] 57 | sx_train, sy_train = x_train[:train_size], y_train[:train_size] 58 | sx_test, sy_test = x_test[-train_size:], y_test[-train_size:] 59 | 60 | for i in range(k): 61 | print("neighbour number:", i+1) 62 | knn_values[i] = knn_shapley(i+1, deep_feats[:train_size], deep_feats[train_size:train_size*2], 63 | targets[:train_size], targets[train_size:train_size*2]) 64 | print(len(knn_values[0])) 65 | print(knn_values[0][:10]) 66 | print('---4. draw plot---') 67 | plot_knn(knn_values, sx_train, sy_train, sx_test, sy_test, deep_feats) 68 | -------------------------------------------------------------------------------- /use_case/Noisy Label, Watermarking/Label.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import tensorflow as tf 4 | import numpy as np 5 | from Shapley import ShapNN 6 | from DShap import DShap 7 | from tensorflow.examples.tutorials.mnist import input_data 8 | import pickle 9 | import argparse 10 | import copy 11 | import random 12 | 13 | parser = argparse.ArgumentParser(description = None) 14 | parser.add_argument('--num', type=int, required = True) 15 | args = parser.parse_args() 16 | 17 | x = args.num 18 | 19 | fashion = input_data.read_data_sets("fashion_data/", one_hot=True) 20 | 21 | X_data = [] 22 | y_data = [] 23 | 24 | for _x, _y in zip(fashion.train.images, np.argmax(fashion.train.labels, axis=1)): 25 | if _y == 0: 26 | X_data.append(_x) 27 | y_data.append(0) 28 | elif _y == 6: 29 | X_data.append(_x) 30 | y_data.append(1) 31 | X_data = np.array(X_data) 32 | y_data = np.array(y_data) 33 | 34 | X_test_data = X_data[x:x+x//10] 35 | y_test_data = y_data[x:x+x//10] 36 | X_data = X_data[0:x] 37 | y_data = y_data[0:x] 38 | y_data_orig = copy.deepcopy(y_data) 39 | 40 | X_benign = [] 41 | y_benign = [] 42 | 43 | X_flip = [] 44 | y_flip = [] 45 | 46 | flip = np.zeros(x) 47 | for i in range(x // 10): 48 | j = np.random.randint(0, x) 49 | while flip[j] == 1: 50 | j = np.random.randint(0, x) 51 | flip[j] = 1 52 | y_data[j] = 1 - y_data[j] 53 | X_flip.append(X_data[j]) 54 | y_flip.append(y_data[j]) 55 | for i in range(x): 56 | if flip[i] == 0: 57 | X_benign.append(X_data[i]) 58 | y_benign.append(y_data[i]) 59 | pickle.dump(flip, open('flip.pkl', 'wb')) 60 | 61 | # dshap = DShap(X=X_data, 62 | # y=y_data_orig, 63 | # X_test=X_test_data, 64 | # y_test=y_test_data, 65 | # num_test=x//10, 66 | # model_family='NN', 67 | # nodump=True) 68 | # dshap.model.fit(X_data, y_data_orig) 69 | # print("Original model training accuracy for benign data: %g" % dshap.model.score(X_benign, y_benign)) 70 | # dshap = DShap(X=X_data, 71 | # y=y_data, 72 | # X_test=X_test_data, 73 | # y_test=y_test_data, 74 | # num_test=x//10, 75 | # model_family='NN', 76 | # nodump=True) 77 | # dshap.model.fit(X_data, y_data) 78 | # print("Modified model training accuracy for benign data: %g" % dshap.model.score(X_benign, y_benign)) 79 | # print("Modified model training accuracy for flipped data: %g" % dshap.model.score(X_flip, y_flip)) 80 | 81 | dshap = DShap(X=X_data, 82 | y=y_data, 83 | X_test=X_test_data, 84 | y_test=y_test_data, 85 | num_test=x//10, 86 | model_family='NN') 87 | dshap.run(save_every=10, err = 0.5) 88 | print(y_data - dshap.y) -------------------------------------------------------------------------------- /use_case/Noisy Label, Watermarking/Watermark.py: -------------------------------------------------------------------------------- 1 | 
#!/usr/bin/python 2 | 3 | import tensorflow as tf 4 | import numpy as np 5 | from Shapley import ShapNN 6 | from DShap import DShap 7 | from PIL import Image 8 | import pickle 9 | import argparse 10 | import copy 11 | import torch 12 | import torch.nn.functional as F 13 | import matplotlib 14 | matplotlib.use('TkAgg') 15 | import matplotlib.pyplot as plt 16 | 17 | parser = argparse.ArgumentParser(description=None) 18 | parser.add_argument('--num', default=100, type=int) 19 | args = parser.parse_args() 20 | 21 | x = args.num 22 | 23 | data = pickle.load(open("./SVHN_data/data.pkl", "rb")) 24 | X_data = data["X_train"].astype('float32') 25 | y_data = data["y_train"].astype('int64') 26 | X_test_data = data["X_test"].astype('float32') 27 | y_test_data = data["y_test"].astype('int64') 28 | 29 | X_data = np.array(X_data)[0:x] 30 | y_data = y_data[0:x] 31 | X_data_orig = copy.deepcopy(X_data) 32 | y_data_orig = copy.deepcopy(y_data) 33 | X_test_data = np.array(X_test_data)[0:x//10] 34 | y_test_data = y_test_data[0:x//10] 35 | 36 | X_benign = [] 37 | y_benign = [] 38 | 39 | X_poison = [] 40 | y_poison = [] 41 | watermarked = np.zeros(x) 42 | with open('./CIFAR_data/watermarked_labels.txt','r') as f: 43 | for i, line in zip(range(100), f): 44 | j = np.random.randint(x) 45 | while watermarked[j] == 1: 46 | j = np.random.randint(x) 47 | watermarked[j] = 1 48 | img = np.asarray(Image.open("./CIFAR_data/trigger_set/%d.jpg" % (i + 1)).convert('RGB').resize((32, 32))).transpose(2, 0, 1) 49 | lbl = int(float(line.strip('\n'))) 50 | X_poison.append(img) 51 | y_poison.append(lbl) 52 | X_data[j] = img 53 | y_data[j] = lbl 54 | 55 | for i in range(x): 56 | if watermarked[i] == 0: 57 | X_benign.append(X_data[i]) 58 | y_benign.append(y_data[i]) 59 | pickle.dump(watermarked, open("watermarked.pkl", "wb")) 60 | 61 | 62 | dshap = DShap(X=X_data_orig, 63 | y=y_data_orig, 64 | X_test=X_test_data, 65 | y_test=y_test_data, 66 | num_test=x//10, 67 | model_family='ResNet', 68 | nodump=True) 69 | dshap.model.fit(X_data_orig, y_data_orig) 70 | print("Original model training accuracy for benign data: %g" % dshap.model.score(X_data_orig, y_data_orig)) 71 | dshap = DShap(X=X_data, 72 | y=y_data, 73 | X_test=X_test_data, 74 | y_test=y_test_data, 75 | num_test=x//10, 76 | model_family='ResNet', 77 | num_classes=10, 78 | nodump=True) 79 | dshap.model.fit(X_data, y_data) 80 | print("Modified model training accuracy for benign data: %g" % dshap.model.score(X_data_orig, y_data_orig)) 81 | print("Modified model training accuracy for poisoned data: %g" % dshap.model.score(X_poison, y_poison)) 82 | 83 | dshap = DShap(X=X_data, 84 | y=y_data, 85 | X_test=X_test_data, 86 | y_test=y_test_data, 87 | num_test=x//10, 88 | num_classes=10, 89 | model_family='ResNet') 90 | dshap.run(save_every=10, err = 0.5) -------------------------------------------------------------------------------- /use_case/DataAcquisition/uci_knn.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import time 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | import sklearn 7 | from Shapley import ShapNN 8 | from DShap_run import DShap 9 | from shap_utils import * 10 | from utils import * 11 | import pickle 12 | 13 | path = "./exp_data/DS_uci/" 14 | with open(path+'data.pkl', 'rb') as f: 15 | data = pickle.load(f) 16 | x_train = data["x_train"] 17 | y_train = data["y_train"] 18 | x_test = data["x_test"] 19 | y_test = data["y_test"] 20 | x_heldout = data["x_heldout"] 21 | y_heldout = 
data["y_heldout"] 22 | from models.uci import * 23 | from utils import * 24 | #data preparation 25 | batch_size = 1024 26 | epochs = 30 27 | 28 | x_train = torch.from_numpy(x_train).contiguous().view(-1, 254) 29 | y_train = torch.from_numpy(y_train).view(-1,).long() 30 | print("train_size:", x_train.shape) 31 | x_test = torch.from_numpy(x_test).contiguous().view(-1, 254) 32 | y_test = torch.from_numpy(y_test).view(-1,).long() 33 | print("test_size:", x_test.shape) 34 | x_heldout = torch.from_numpy(x_heldout).contiguous().view(-1, 254) 35 | y_heldout = torch.from_numpy(y_heldout).view(-1,).long() 36 | print("heldout_size:", x_heldout.shape) 37 | 38 | 39 | device = torch.device('cuda') 40 | uci = UCI().to(device) 41 | optimizer = optim.Adam(uci.parameters()) 42 | criterion = nn.CrossEntropyLoss() 43 | 44 | # print(y_train.shape) 45 | train(uci, device, x_train, y_train, batch_size, optimizer, criterion, epochs) 46 | accuracy, avg_loss = evaluate(uci, device, x_train, y_train, batch_size, criterion) 47 | print(f'[Train] Accuracy: {100 * accuracy:5.2f}%, loss: {avg_loss:7.4f}') 48 | accuracy, avg_loss = evaluate(uci, device, x_heldout, y_heldout, batch_size, criterion) 49 | print(f'[Test] Accuracy: {100 * accuracy:5.2f}%, loss: {avg_loss:7.4f}') 50 | 51 | 52 | 53 | deep_f = [] 54 | targets = [] 55 | x_deep = torch.cat((x_train, x_test), 0) 56 | y_deep = torch.cat((y_train, y_test), 0) 57 | for X, y in batch(x_deep, y_deep, batch_size): 58 | X = X.to(device).float() 59 | fc3, y_pre = uci(X) 60 | deep_f.append(fc3.view(fc3.size(0), -1).cpu().detach().numpy()) 61 | # targets.append(y.numpy()) 62 | 63 | deep_f = np.concatenate(deep_f) # deep features are not normalized 64 | # targets = np.concatenate(targets) 65 | print(deep_f.shape) 66 | 67 | import math 68 | kmin = 5 69 | kmax = 6 70 | kinterval = 5 71 | fc1_knn_values = [[] for _ in range(math.ceil((kmax-kmin)/kinterval))] # deep features 72 | loo_fc1_knn_values = [[] for _ in range(math.ceil((kmax-kmin)/kinterval))] # deep features 73 | 74 | for i, k in enumerate(range(kmin, kmax, kinterval)): 75 | print("neighbour number:", k) 76 | fc1_knn_values[i],*_ = old_knn_shapley(k, deep_f[:x_train.shape[0]], deep_f[x_train.shape[0]:], 77 | y_deep[:x_train.shape[0]], y_deep[x_train.shape[0]:]) 78 | loo_fc1_knn_values[i],*_ = loo_knn_shapley(k, deep_f[:x_train.shape[0]], deep_f[x_train.shape[0]:], 79 | y_deep[:x_train.shape[0]], y_deep[x_train.shape[0]:]) 80 | 81 | import pickle 82 | store_data = './exp_data/DS_uci/' 83 | f = open(store_data+'knn.pkl', 'wb') 84 | data_write = {"knn_values": fc1_knn_values, "loo_fc1_knn_values": loo_fc1_knn_values} 85 | pickle.dump(data_write, f) 86 | f.close() -------------------------------------------------------------------------------- /use_case/Noisy Label, Watermarking/PlotAccuracy.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import os 5 | import seaborn as sns 6 | 7 | x = [0, 10, 20, 30, 40, 50, 60, 70] 8 | 9 | # mnist 5000: 10 | knn = [0.9955555555555555, 0.9515555555555556, 0.9282222222222222, 0.9071111111111111, 0.8886666666666667, 0.8673333333333333, 0.8546666666666667, 0.83] 11 | koo = [0.9955555555555555, 0.9846666666666667, 0.974, 0.9591111111111111, 0.9491111111111111, 0.9335555555555556, 0.922, 0.8888888888888888] 12 | ran = [0.9955555555555555, 0.9868888888888889, 0.9762222222222222, 0.9671111111111111, 0.9531111111111111, 0.934, 0.9208888888888889, 0.8808888888888889] 13 | 14 | # knn 
= [0.972, 0.726, 0.164, 0.02, 0.016, 0.016, 0.012, 0.024] 15 | # koo = [0.972, 0.942, 0.904, 0.88, 0.826, 0.752, 0.67, 0.462] 16 | # ran = [0.972, 0.942, 0.916, 0.88, 0.842, 0.724, 0.664, 0.472] 17 | 18 | 19 | # fashion 1000: 20 | # knn = [0.9857142857142858, 0.9373626373626374, 0.8736263736263736, 0.856043956043956, 0.8340659340659341, 0.8142857142857143, 0.8208791208791208, 0.8296703296703297] 21 | # koo = [0.9857142857142858, 0.9703296703296703, 0.9483516483516483, 0.9384615384615385, 0.9131868131868132, 0.9076923076923077, 0.8769230769230769, 0.8670329670329671] 22 | # loo = [0.9857142857142858, 0.9659340659340659, 0.9538461538461539, 0.9362637362637363, 0.9098901098901099, 0.9, 0.865934065934066, 0.8505494505494505] 23 | # tmc = [0.9857142857142858, 0.945054945054945, 0.9120879120879121, 0.8604395604395605, 0.8241758241758241, 0.8120879120879121, 0.8, 0.7945054945054945] 24 | # g = [0.9857142857142858, 0.9131868131868132, 0.8263736263736263, 0.7791208791208791, 0.7571428571428571, 0.6494505494505495, 0.4868131868131868, 0.4868131868131868] 25 | # ran = [0.9857142857142858, 0.9758241758241758, 0.9582417582417583, 0.9428571428571428, 0.9197802197802197, 0.9065934065934066, 0.8901098901098901, 0.8428571428571429] 26 | 27 | # knn = [0.9888888888888889, 0.6555555555555556, 0.4, 0.16666666666666666, 0.06666666666666667, 0.1111111111111111, 0.1, 0.1] 28 | # koo = [0.9888888888888889, 0.9555555555555556, 0.9111111111111111, 0.9555555555555556, 0.9222222222222223, 0.8555555555555555, 0.8222222222222222, 0.6777777777777778] 29 | # loo = [0.9888888888888889, 0.9888888888888889, 1.0, 0.9888888888888889, 0.9666666666666667, 0.9111111111111111, 0.6777777777777778, 0.5555555555555556] 30 | # tmc = [0.9888888888888889, 0.7111111111111111, 0.3333333333333333, 0.23333333333333334, 0.13333333333333333, 0.13333333333333333, 0.14444444444444443, 0.17777777777777778] 31 | # g = [0.9888888888888889, 0.7888888888888889, 0.5777777777777777, 0.5444444444444444, 0.5444444444444444, 0.5222222222222223, 0.5666666666666667, 0.5666666666666667] 32 | # ran = [0.9888888888888889, 0.9666666666666667, 0.9666666666666667, 0.9, 0.8333333333333334, 0.8, 0.5444444444444444, 0.4888888888888889] 33 | 34 | 35 | # Pubfig 1000: 36 | # knn = [1, 0.9322222222222222, 0.8555555555555555, 0.7833333333333333, 0.7044444444444444, 0.5844444444444444, 0.49444444444444446, 0.43444444444444447] 37 | # koo = [1.0, 0.9366666666666666, 0.8922222222222222, 0.8388888888888889, 0.7822222222222223, 0.6777777777777778, 0.5966666666666667, 0.49333333333333335] 38 | # ran = [1, 0.9488888888888889, 0.8988888888888888, 0.8333333333333334, 0.7833333333333333, 0.6944444444444444, 0.5755555555555556, 0.49333333333333335] 39 | 40 | # knn = [1.0, 0.97, 0.99, 0.91, 0.82, 0.55, 0.3, 0.1] 41 | # koo = [1.0, 1.0, 0.94, 0.96, 0.97, 0.96, 0.95, 0.89] 42 | # ran = [1.0, 1.0, 0.96, 0.94, 0.91, 0.82, 0.82, 0.75] 43 | 44 | plt.plot(x, np.array(knn) * 100, 'o-', color = 'purple', label = 'KNN-Shapley') 45 | plt.plot(x, np.array(koo) * 100, 'o-', color='violet', label = 'KNN-LOO') 46 | # plt.plot(x, np.array(loo) * 100, '^-', color = 'olive', label = "Leave-One-Out") 47 | # plt.plot(x, np.array(tmc) * 100, 's-', color = 'blue', label = "TMC-Shapley") 48 | # plt.plot(x, np.array(g) * 100, 's-', color = 'orange', label = "G-Shapley") 49 | plt.plot(x, np.array(ran) * 100, '--', color='red', label = "Random") 50 | 51 | plt.xlabel('Fraction of data removed (%)') 52 | plt.ylabel('Model accuracy for benign data (%)') 53 | plt.legend(loc='lower left') 54 | plt.show() 
-------------------------------------------------------------------------------- /use_case/Noisy Label, Watermarking/PlotLabel.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import seaborn as sns 6 | 7 | sns.set() 8 | flip = pickle.load(open("flip.pkl", "rb"), encoding = "iso-8859-1") 9 | 10 | # loo_v = pickle.load(open("loo.pkl", "rb"), encoding = "iso-8859-1")["loo"] 11 | # loo_i = np.argsort(-loo_v)[::-1] 12 | # cnt = 0 13 | # f = [] 14 | # total = 0 15 | # cnt = 0 16 | # for i in range(len(loo_i)): 17 | # if flip[int(loo_i[i])] == 1: 18 | # total += 1 19 | # for i in range(len(loo_i)): 20 | # if flip[int(loo_i[i])] == 1: 21 | # cnt += 1 22 | # f.append(1.0 * cnt / total) 23 | # x = np.array(range(1, len(loo_i) + 1)) / len(loo_i) * 100 24 | # x = np.append(x[0:-1:10], x[-1]) 25 | # f = np.append(f[0:-1:10], f[-1]) 26 | # plt.plot(x, np.array(f) * 100, '^-', color = 'olive', label = "Leave-One-Out", zorder=4, alpha=0.8) 27 | 28 | # tmc_v = pickle.load(open("tmc.pkl", "rb"), encoding = "iso-8859-1") 29 | # tmc_i = np.argsort(-tmc_v)[::-1] 30 | # cnt = 0 31 | # f = [] 32 | # total = 0 33 | # cnt = 0 34 | # for i in range(len(tmc_i)): 35 | # if flip[int(tmc_i[i])] == 1: 36 | # total += 1 37 | # for i in range(len(tmc_i)): 38 | # if flip[int(tmc_i[i])] == 1: 39 | # cnt += 1 40 | # f.append(1.0 * cnt / total) 41 | # x = np.array(range(1, len(tmc_i) + 1)) / len(tmc_i) * 100 42 | # x = np.append(x[0:-1:10], x[-1]) 43 | # f = np.append(f[0:-1:10], f[-1]) 44 | # plt.plot(x, np.array(f) * 100, 's-', color = 'blue', label = "TMC-Shapley") 45 | 46 | # # Only LogisticRegression and NN model have G-Shapley metrics 47 | # g_v = pickle.load(open("g.pkl", "rb"), encoding = "iso-8859-1") 48 | # g_i = np.argsort(-g_v)[::-1] 49 | # cnt = 0 50 | # f = [] 51 | # total = 0 52 | # cnt = 0 53 | # for i in range(len(g_i)): 54 | # if flip[int(g_i[i])] == 1: 55 | # total += 1 56 | # for i in range(len(g_i)): 57 | # if flip[int(g_i[i])] == 1: 58 | # cnt += 1 59 | # f.append(1.0 * cnt / total) 60 | # x = np.array(range(1, len(g_i) + 1)) / len(g_i) * 100 61 | # x = np.append(x[0:-1:10], x[-1]) 62 | # f = np.append(f[0:-1:10], f[-1]) 63 | # plt.plot(x, np.array(f) * 100, 's-', color = 'orange', label = "G-Shapley", zorder=5) 64 | 65 | # for K in range(10, 11): 66 | # knn_v = pickle.load(open('looknn_{}.pkl'.format(K), 'rb'), encoding = "iso-8859-1") 67 | # knn_i = np.argsort(-knn_v)[::-1] 68 | # cnt = 0 69 | # f = [] 70 | # total = 0 71 | # cnt = 0 72 | # for i in range(len(knn_i)): 73 | # if flip[int(knn_i[i])] == 1: 74 | # total += 1 75 | # for i in range(len(knn_i)): 76 | # if flip[int(knn_i[i])] == 1: 77 | # cnt += 1 78 | # f.append(1.0 * cnt / total) 79 | # x = np.array(range(1, len(knn_i) + 1)) / len(knn_i) * 100 80 | # x = np.append(x[0:-1:10], x[-1]) 81 | # f = np.append(f[0:-1:10], f[-1]) 82 | # plt.plot(x, np.array(f) * 100, 'o-', color='violet', label = 'KNN-LOO-Shapley'.format(K), zorder=6, alpha=0.8) 83 | 84 | colors = ["#E6CAFF", "#DCB5FF", "#d3a4ff", "#CA8EFF", "#BE77FF", "#B15BFF", "#9F35FF", "#921AFF"] 85 | for K in range(10, 11): 86 | knn_v = pickle.load(open('knn_{}.pkl'.format(K), 'rb'), encoding = "iso-8859-1") 87 | knn_v = np.mean(knn_v, axis=1) 88 | knn_i = np.argsort(-knn_v)[::-1] 89 | cnt = 0 90 | f = [] 91 | total = 0 92 | cnt = 0 93 | for i in range(len(knn_i)): 94 | if flip[int(knn_i[i])] == 1: 95 | total += 1 96 | for i in range(len(knn_i)): 97 | if 
flip[int(knn_i[i])] == 1: 98 | cnt += 1 99 | f.append(1.0 * cnt / total) 100 | x = np.array(range(1, len(knn_i) + 1)) / len(knn_i) * 100 101 | x = np.append(x[0:-1:10], x[-1]) 102 | f = np.append(f[0:-1:10], f[-1]) 103 | plt.plot(x, np.array(f) * 100, 'o-', color='purple', label = 'KNN-Shapley'.format(K), linewidth=3) 104 | 105 | ran_v = np.random.rand(len(knn_v)) 106 | ran_i = np.argsort(-ran_v)[::-1] 107 | cnt = 0 108 | f = [] 109 | total = 0 110 | cnt = 0 111 | for i in range(len(ran_i)): 112 | if flip[int(ran_i[i])] == 1: 113 | total += 1 114 | for i in range(len(ran_i)): 115 | if flip[int(ran_i[i])] == 1: 116 | cnt += 1 117 | f.append(1.0 * cnt / total) 118 | x = np.array(range(1, len(ran_i) + 1)) / len(ran_i) * 100 119 | f = x / 100 120 | plt.plot(x, np.array(f) * 100, '--', color='red', label = "Random", zorder=7) 121 | 122 | 123 | 124 | plt.xlabel('Fraction of data inspected (%)') 125 | plt.ylabel('Fraction of incorrect labels (%)') 126 | plt.legend(loc='lower right') 127 | plt.show() -------------------------------------------------------------------------------- /use_case/Noisy Label, Watermarking/Poisoning.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import tensorflow as tf 4 | import numpy as np 5 | from Shapley import ShapNN 6 | from DShap import DShap 7 | from tensorflow.examples.tutorials.mnist import input_data 8 | from PIL import Image 9 | import pickle 10 | import argparse 11 | import os 12 | import copy 13 | from pubfig_data import PUBFIG83 14 | 15 | parser = argparse.ArgumentParser(description=None) 16 | parser.add_argument('--num', default=100, type=int) 17 | args = parser.parse_args() 18 | 19 | x = args.num 20 | 21 | # pubfig = PUBFIG83(root='./pubfig_data/pubfig83-aligned') 22 | # imgs = pubfig.imgs 23 | # X_data = [] 24 | # y_data = [] 25 | # for i in range(len(imgs)): 26 | # if imgs[i][1] >= 10: 27 | # continue 28 | # X_data.append(np.asarray(Image.open(imgs[i][0]).resize((32, 32))).astype("float32").transpose(2, 0, 1)) 29 | # y_data.append(imgs[i][1]) 30 | # X_data = np.array(X_data) 31 | # y_data = np.array(y_data) 32 | 33 | # state = np.random.get_state() 34 | # pickle.dump(state, open('state.pkl', 'wb')) 35 | # np.random.shuffle(X_data) 36 | # np.random.set_state(state) 37 | # np.random.shuffle(y_data) 38 | 39 | # X_test_data = X_data[x:x+x//10] 40 | # y_test_data = y_data[x:x+x//10] 41 | # X_data = X_data[0:x] 42 | # y_data = y_data[0:x] 43 | # X_data_orig = copy.deepcopy(X_data) 44 | # y_data_orig = copy.deepcopy(y_data) 45 | 46 | # X_benign = [] 47 | # y_benign = [] 48 | 49 | # X_poison = [] 50 | # y_poison = [] 51 | 52 | # watermarked = np.zeros(x) 53 | # filenames = os.listdir('./pubfig_data/watermarked') 54 | # filenames.sort(key=lambda x:int(x[:-4])) 55 | # with open('./pubfig_data/watermarked_labels.txt','r') as f: 56 | # for filename, line in zip(filenames, f): 57 | # num = np.random.randint(0, x) 58 | # while watermarked[num] == 1: 59 | # num = np.random.randint(0, x) 60 | # watermarked[num] = 1 61 | # img = np.asarray(Image.open("./pubfig_data/watermarked/" + filename).resize((32, 32))).astype("float32").transpose(2, 0, 1) 62 | # lbl = int(float(line.strip('\n'))) % 10 63 | # X_data[num] = img 64 | # y_data[num] = lbl 65 | # X_poison.append(img) 66 | # y_poison.append(lbl) 67 | # for i in range(x): 68 | # if watermarked[i] == 0: 69 | # X_benign.append(X_data[i]) 70 | # y_benign.append(y_data[i]) 71 | # pickle.dump(watermarked, open('watermarked.pkl', 'wb')) 72 | 73 | # dshap = 
DShap(X=X_data, 74 | # y=y_data, 75 | # X_test=X_test_data, 76 | # y_test=y_test_data, 77 | # num_test=x//10, 78 | # model_family='ResNet', 79 | # num_classes=10, 80 | # nodump=True) 81 | # dshap.run(save_every=10, err = 0.5) 82 | 83 | # pickle.dump(X_data, open("X_data.pkl", "wb")) 84 | # pickle.dump(y_data, open("y_data.pkl", "wb")) 85 | # pickle.dump(X_test_data, open("X_test_data.pkl", "wb")) 86 | # pickle.dump(y_test_data, open("y_test_data.pkl", "wb")) 87 | # pickle.dump(X_benign, open("X_benign.pkl", "wb")) 88 | # pickle.dump(y_benign, open("y_benign.pkl", "wb")) 89 | # pickle.dump(X_poison, open("X_poison.pkl", "wb")) 90 | # pickle.dump(y_poison, open("y_poison.pkl", "wb")) 91 | 92 | X_data = pickle.load(open("X_data.pkl", "rb")) 93 | y_data = pickle.load(open("y_data.pkl", "rb")) 94 | X_test_data = pickle.load(open("X_test_data.pkl", "rb")) 95 | y_test_data = pickle.load(open("y_test_data.pkl", "rb")) 96 | X_benign = pickle.load(open("X_benign.pkl", "rb")) 97 | y_benign = pickle.load(open("y_benign.pkl", "rb")) 98 | X_poison = pickle.load(open("X_poison.pkl", "rb")) 99 | y_poison = pickle.load(open("y_poison.pkl", "rb")) 100 | 101 | knn_v = pickle.load(open('looknn_10.pkl', 'rb'), encoding = "iso-8859-1") 102 | # knn_v = np.mean(knn_v, axis=1) 103 | knn_i = np.argsort(knn_v) 104 | 105 | benign_acc = [] 106 | backdoor_acc = [] 107 | 108 | for frac in range(0, 8): 109 | X_new = [] 110 | y_new = [] 111 | for i in range(len(knn_i)): 112 | if i < len(knn_i) * 0.1 * frac: 113 | continue 114 | X_new.append(X_data[knn_i[i]]) 115 | y_new.append(y_data[knn_i[i]]) 116 | dshap = DShap(X=np.array(X_new), 117 | y=np.array(y_new), 118 | X_test=X_test_data, 119 | y_test=y_test_data, 120 | num_test=x//10, 121 | model_family='ResNet', 122 | num_classes=10, 123 | nodump=True) 124 | dshap.model.fit(np.array(X_new), np.array(y_new)) 125 | bn = dshap.model.score(X_benign, y_benign) 126 | bd = dshap.model.score(X_poison, y_poison) 127 | benign_acc.append(bn) 128 | backdoor_acc.append(bd) 129 | print("Benign {}: {}".format(10*frac, bn)) 130 | print("Backdoor {}: {}".format(10*frac, bd)) 131 | 132 | print(benign_acc) 133 | print(backdoor_acc) -------------------------------------------------------------------------------- /use_case/Noisy Label, Watermarking/runtime.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import time 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | import sklearn 7 | from Shapley import ShapNN 8 | from DShap import DShap 9 | from shap_utils import * 10 | from utils import * 11 | import sklearn 12 | import pickle 13 | import torch 14 | import torch.nn as nn 15 | import torch.nn.functional as F 16 | import torch.optim as optim 17 | from tensorflow.examples.tutorials.mnist import input_data 18 | 19 | MEM_DIR = './' 20 | directory = './temp_runtime' 21 | store_data = './temp_runtime/data/' 22 | try: 23 | os.stat(directory) 24 | except: 25 | os.mkdir(directory) 26 | try: 27 | os.stat(store_data) 28 | except: 29 | os.mkdir(store_data) 30 | 31 | train_size = [10, 100, 200, 400, 800, 1000, 5000] 32 | time_knn = [] 33 | time_tmc = [] 34 | time_loo = [] 35 | time_g = [] 36 | mnist = input_data.read_data_sets("MNIST_data/", one_hot=True) 37 | 38 | 39 | # knn shapley Hyperparameters 40 | batch_size = 1024 41 | epochs = 30 42 | k = 5 43 | model = "ResNet" 44 | 45 | def load_CIFAR_batch(filename): 46 | with open(filename, 'rb') as f: 47 | datadict = pickle.load(f, encoding='latin1') 48 | X = datadict['data'] 49 | 
Y = datadict['labels'] 50 | X = X.reshape(10000, 3072).astype("float32") / 255 51 | Y = np.array(Y) 52 | return X, Y 53 | xs = [] 54 | ys = [] 55 | for b in range(1, 6): 56 | f = './CIFAR_data/data_batch_%d' % b 57 | X, Y = load_CIFAR_batch(f) 58 | xs.append(X) 59 | ys.append(Y) 60 | X_data = np.concatenate(xs) 61 | y_data = np.concatenate(ys) 62 | X_test_data, y_test_data = load_CIFAR_batch('./CIFAR_data/test_batch') 63 | X_test_data = np.array(X_test_data) 64 | y_test_data = np.array(y_test_data) 65 | 66 | 67 | for size in train_size: 68 | print('size:', size) 69 | # print('---1. calculate knn run time') 70 | num_test = size 71 | x_tr = X_data[0:size].astype("float32") 72 | y_tr = y_data[0:size].astype("int64") 73 | x_te = X_test_data[0:size].astype("float32") 74 | y_te = y_test_data[0:size].astype("int64") 75 | # start_time = time.time() 76 | # dshap = DShap(x_tr, y_tr, x_te, y_te, num_test, sources=None, model_family=model, metric='accuracy', 77 | # directory=directory, seed=0, nodump=True) 78 | # dshap.run(10, 0.5, knn_run=True, g_run=False, loo_run=False, tmc_run=False) 79 | # time_knn.append(str((time.time() - start_time)/60.0)) 80 | # # print("--- %s minutes ---" % ((time.time() - start_time)/60.0)) 81 | # # 82 | # print('knn time:', time_knn) 83 | # f = open(store_data+'knn_time.pkl', 'wb') 84 | # data = {'knn_runtime': time_knn, 'train_size': train_size} 85 | # pickle.dump(data, f) 86 | # f.close() 87 | print('---2. calculate g shapley run time') 88 | 89 | start_time = time.time() 90 | dshap = DShap(x_tr, y_tr, x_te, y_te, num_test, sources=None, model_family=model, metric='accuracy', 91 | directory=directory, seed=0, nodump=True) 92 | dshap.run(10, 0.5, knn_run=False, g_run=True, loo_run=False, tmc_run=False) 93 | time_g.append(str((time.time() - start_time)/60.0)) 94 | # print("--- %s minutes ---" % ((time.time() - start_time)/60.0)) 95 | 96 | print('time g:', time_g) 97 | f = open(store_data+'g_time.pkl', 'wb') 98 | data = {'g_runtime': time_g, 'train_size': train_size} 99 | pickle.dump(data, f) 100 | f.close() 101 | 102 | # print('---3. calculate loo run time') 103 | 104 | # start_time = time.time() 105 | # dshap = DShap(x_tr, y_tr, x_te, y_te, num_test, sources=None, model_family=model, metric='accuracy', 106 | # directory=directory, seed=0, nodump=True) 107 | # dshap.run(10, 0.5, knn_run=False, g_run=False, loo_run=True, tmc_run=False) 108 | # time_loo.append(str((time.time() - start_time)/60.0)) 109 | # # print("--- %s minutes ---" % ((time.time() - start_time)/60.0)) 110 | 111 | # print('time loo:', time_loo) 112 | # f = open(store_data+'loo_time.pkl', 'wb') 113 | # data = {'loo_runtime': time_loo, 'train_size': train_size} 114 | # pickle.dump(data, f) 115 | # f.close() 116 | 117 | # print('---4. 
calculate tmc run time') 118 | 119 | # start_time = time.time() 120 | # dshap = DShap(x_tr, y_tr, x_te, y_te, num_test, sources=None, model_family=model, metric='accuracy', 121 | # directory=directory, seed=0, nodump=True) 122 | # dshap.run(10, 0.5, knn_run=False, g_run=False, loo_run=False, tmc_run=True) 123 | # time_tmc.append(str((time.time() - start_time)/60.0)) 124 | # # print("--- %s minutes ---" % ((time.time() - start_time)/60.0)) 125 | 126 | # print('time tmc:', time_tmc) 127 | # f = open(store_data+'tmc_time.pkl', 'wb') 128 | # data = {'tmc_runtime': time_tmc, 'train_size': train_size} 129 | # pickle.dump(data, f) 130 | # f.close() -------------------------------------------------------------------------------- /LSH_sp_example.py: -------------------------------------------------------------------------------- 1 | import time 2 | from math import ceil 3 | 4 | import numpy as np 5 | from sklearn.utils import shuffle 6 | 7 | from LSH_sp import get_contrast, find_best_r_normalize, g_normalize, f_h, LSH 8 | import matplotlib.pyplot as plt 9 | 10 | data = np.load('CIFAR10_resnet50-keras_features.npz') 11 | x_trn = np.vstack((data['features_training'], data['features_testing'])) 12 | y_trn = np.hstack((data['labels_training'], data['labels_testing'])) 13 | 14 | x_trn, y_trn = shuffle(x_trn, y_trn, random_state=0) 15 | 16 | x_trn = np.reshape(x_trn, (-1, 2048)) 17 | x_tst, y_tst = x_trn[:100], y_trn[:100] 18 | x_val, y_val = x_trn[100:1100], y_trn[100:1100] 19 | x_trn, y_trn = x_trn[1100:], y_trn[1100:] 20 | 21 | # we are using 1-nn classifier 22 | K = 1 23 | eps = 0.1 24 | 25 | K_star = max(K, ceil(1 / eps)) 26 | get_contrast(x_val) 27 | dist_rand = np.load('eps0.1/dist_rand.npy') 28 | contrast = np.load('eps0.1/contrast.npy') 29 | dist_knn = np.load('eps0.1/dist_knn.npy') 30 | 31 | dist_rand = np.mean(dist_rand, axis=0) 32 | contrast = np.mean(contrast, axis=0)[K_star - 1] 33 | dist_knn = np.mean(dist_knn, axis=0)[K_star - 1] 34 | 35 | search_range = np.arange(1e-3, 10, 1e-3) 36 | r_vec_normalize = find_best_r_normalize(search_range, contrast) 37 | g_vec = g_normalize(contrast, r_vec_normalize) 38 | 39 | # plot g(C_K) vs r, we want g(C_k) to be small 40 | # search range, find r that minimize g, shape should be similar to convex 41 | g = g_normalize(contrast, search_range) 42 | plt.figure() 43 | plt.plot(search_range, g) 44 | plt.show() 45 | 46 | np.save('eps0.1/selected_param_r_' + str(K_star) + '.npy', r_vec_normalize) 47 | np.save('eps0.1/selected_param_g_' + str(K_star) + '.npy', g_vec) 48 | 49 | 50 | def equal(a, b): 51 | return int(a == b) 52 | 53 | 54 | def fine_tune_val(n_hash_table=10, alpha=0.5, file=False, val_sp_gt=None): 55 | t = r_vec_normalize 56 | n_trn = len(x_trn) 57 | n_hash_bit = int(np.ceil(np.log(n_trn) * alpha / np.log(1 / f_h(1, t)))) 58 | if file is True: 59 | print(n_hash_bit, file=open('eps0.1/log.txt', 'a')) 60 | else: 61 | print(n_hash_bit) 62 | 63 | start = time.time() 64 | lsh = LSH(n_hash_bit=n_hash_bit, n_hash_table=n_hash_table, x_trn=x_trn, y_trn=y_trn, dist_rand=dist_rand, 65 | equal=equal, t=t) 66 | runtime_build_hash = time.time() - start 67 | if file is True: 68 | print(runtime_build_hash, file=open('eps0.1/log.txt', 'a')) 69 | else: 70 | print(runtime_build_hash) 71 | 72 | start = time.time() 73 | x_val_knn_approx, nns_vec = lsh.get_approx_KNN(x_val, K_star) 74 | runtime_query = time.time() - start 75 | if file is True: 76 | print(runtime_query, file=open('eps0.1/log.txt', 'a')) 77 | else: 78 | print(runtime_query) 79 | 80 | start = time.time() 
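    # approximate Shapley values from the LSH-retrieved neighbors; training points never retrieved keep value 0 (see LSH.compute_approx_shapley in LSH_sp.py)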
81 | sp_approx = lsh.compute_approx_shapley(x_val_knn_approx, y_val, K) 82 | runtime_approx_value = time.time() - start 83 | if file is True: 84 | print('it takes %s to get appox knn value' % runtime_approx_value, file=open('eps0.1/log.txt', 'a')) 85 | else: 86 | print('it takes %s to get appox knn value' % runtime_approx_value) 87 | 88 | if val_sp_gt is not None: 89 | sp_err_inf_val = np.linalg.norm(val_sp_gt - sp_approx, ord=np.inf, axis=1) 90 | if file is True: 91 | print('max error %s' % np.percentile(sp_err_inf_val, 90), file=open('eps0.1/log.txt', 'a')) 92 | else: 93 | print('max error %s' % np.percentile(sp_err_inf_val, 90)) 94 | return lsh 95 | 96 | 97 | def fine_tune_test(lsh=None, file=False, sp_gt=None): 98 | start = time.time() 99 | x_tst_knn_approx, nns_vec = lsh.get_approx_KNN(x_tst, K_star) 100 | runtime_query = time.time() - start 101 | if file is True: 102 | print(runtime_query, file=open('eps0.1/log.txt', 'a')) 103 | else: 104 | print(runtime_query) 105 | 106 | start = time.time() 107 | sp_approx = lsh.compute_approx_shapley(x_tst_knn_approx, y_tst, K) 108 | runtime_approx_value = time.time() - start 109 | if file is True: 110 | print('it takes %s to get appox knn value' % runtime_approx_value, file=open('eps0.1/log.txt', 'a')) 111 | else: 112 | print('it takes %s to get appox knn value' % runtime_approx_value) 113 | 114 | if sp_gt is not None: 115 | sp_err_inf_val = np.linalg.norm(sp_gt - sp_approx, ord=np.inf, axis=1) 116 | if file is True: 117 | print('max error %s' % np.percentile(sp_err_inf_val, 90), file=open('eps0.1/log.txt', 'a')) 118 | else: 119 | print('max error %s' % np.percentile(sp_err_inf_val, 90)) 120 | return sp_approx, nns_vec 121 | 122 | 123 | val_sp_gt = np.load('val_exact_sp_gt.npy') 124 | tst_sp_gt = np.load('tst_exact_sp_gt.npy') 125 | lsh_82_05 = fine_tune_val(82, 0.5, val_sp_gt=val_sp_gt) 126 | sp_approx_82_05, nns_vec_82_05 = fine_tune_test(lsh=lsh_82_05, sp_gt=tst_sp_gt) 127 | 128 | np.save('eps0.1/sp_approx_05', sp_approx_82_05) 129 | np.save('eps0.1/lsh_82_05', lsh_82_05) 130 | -------------------------------------------------------------------------------- /LSH_sp.py: -------------------------------------------------------------------------------- 1 | import time 2 | import numpy as np 3 | from scipy.stats import norm 4 | from tqdm import tqdm 5 | 6 | 7 | def get_contrast(x_trn, save_dir='eps0.1/'): 8 | num_cores = 8 9 | mc_num = 5 10 | eps = 0.1 11 | n_trn = x_trn.shape[0] 12 | K = int(1 / eps) 13 | 14 | def compute_distance(i_q, query, x_trn, n_trn, K): 15 | dist_to_random = np.zeros(n_trn) 16 | for i_trn in range(n_trn): 17 | dist_to_random[i_trn] = np.linalg.norm(query - x_trn[i_trn, :], 2) 18 | dist_to_random_avg = np.mean(dist_to_random) 19 | dist_to_KNN = np.sort(dist_to_random)[:K] 20 | if i_q % 100 == 0: 21 | print(i_q) 22 | return dist_to_random_avg, dist_to_KNN 23 | 24 | def estimate_contrast(x_trn, query, K): 25 | # estimate empirical contrast 26 | n_trn = x_trn.shape[0] 27 | n_q = query.shape[0] 28 | from joblib import Parallel, delayed 29 | result = Parallel(n_jobs=num_cores)( 30 | delayed(compute_distance)(i_q, query[i_q, :], x_trn, n_trn, K) for i_q in range(n_q)) 31 | dist_to_random_avg = np.array([result[i][0] for i in range(n_q)]) 32 | dist_to_KKN = np.array([result[i][1] for i in range(n_q)]) 33 | assert dist_to_KKN.shape[0] == n_q 34 | dist_to_KNN_avg_q = np.mean(dist_to_KKN, axis=0) 35 | dist_to_random_avg_avg = np.mean(dist_to_random_avg) 36 | contrast = dist_to_random_avg_avg / dist_to_KNN_avg_q 37 | return 
dist_to_random_avg_avg, dist_to_KNN_avg_q, contrast 38 | 39 | contrast = [] 40 | dist_rand = [] 41 | dist_knn = [] 42 | for mc_i in range(mc_num): 43 | start = time.time() 44 | sample_ind_trn = np.random.choice(np.arange(n_trn), int(n_trn / 5 * 4), replace=False).astype(int) 45 | sample_ind_query = np.array( 46 | list(set(np.arange(n_trn).astype(int).tolist()) - set(sample_ind_trn.tolist()))).astype(int) 47 | dist_rand_, dist_knn_, contrast_ = estimate_contrast(x_trn[sample_ind_trn, :], x_trn[sample_ind_query, :], K) 48 | dist_rand.append(dist_rand_) 49 | dist_knn.append(dist_knn_) 50 | contrast.append(contrast_) 51 | 52 | print('monte carlo iteration%s ' % mc_i) 53 | elapsed_time = time.time() - start 54 | print('elapsed time is %s' % elapsed_time) 55 | dist_knn = np.array(dist_knn) 56 | contrast = np.array(contrast) 57 | dist_rand = np.array(dist_rand) 58 | np.save(save_dir + 'dist_rand', dist_rand) 59 | np.save(save_dir + 'dist_knn', dist_knn) 60 | np.save(save_dir + 'contrast', contrast) 61 | 62 | 63 | def f_h(x, r): 64 | y = 1 - 2 * norm.cdf(-r / x) - 2 / (np.sqrt(2 * np.pi) * r / x) * (1 - np.exp(-(r ** 2 / (2 * (x ** 2))))) 65 | return y 66 | 67 | 68 | def g_unnormalize(dist_rand, dist_knn, r): 69 | y = np.log(f_h(dist_knn, r)) / np.log(f_h(dist_rand, r)) 70 | return y 71 | 72 | 73 | def g_normalize(contrast, r): 74 | y = np.log(f_h(1 / contrast, r)) / np.log(f_h(1, r)) 75 | return y 76 | 77 | 78 | def find_best_r_normalize(search_range, contrast): 79 | y = g_normalize(contrast, search_range) 80 | min_ind = np.argmin(y) 81 | return search_range[min_ind] 82 | 83 | 84 | def find_best_r_unnormalize(search_range, dist_rand, dist_knn): 85 | y = g_unnormalize(dist_rand, dist_knn, search_range) 86 | min_ind = np.argmin(y) 87 | return search_range[min_ind] 88 | 89 | 90 | def lsh_function(t, x, w, b): 91 | # x is 1-d array 92 | h = np.floor((np.dot(w, x) + b) / t).astype(int) 93 | return h 94 | 95 | 96 | class LSH: 97 | def __init__(self, n_hash_bit, n_hash_table, x_trn, y_trn, dist_rand, equal, t=0.1): 98 | self.n_hash_bit = n_hash_bit 99 | self.n_hash_table = n_hash_table 100 | self.t = t # width of projections 101 | self.dist_rand = dist_rand 102 | self.x_trn = x_trn 103 | self.y_trn = y_trn 104 | self.N, self.dim = x_trn.shape 105 | self.equal = equal 106 | # draw w from a normal distribution (2-stable) 107 | self.w = np.random.normal(0, 1, (n_hash_table, n_hash_bit, self.dim)) 108 | # draw b from U[0,t] 109 | self.b = np.random.uniform(0, self.t, (n_hash_table, n_hash_bit)) 110 | self.x_trn_hash = [dict() for i in range(n_hash_table)] 111 | for i in tqdm(range(self.N)): 112 | hash_code_all = lsh_function(self.t, x_trn[i] / dist_rand, self.w, self.b) 113 | for l in range(n_hash_table): 114 | hash_code_trn = '.'.join(map(str, hash_code_all[l, :])) 115 | if hash_code_trn in self.x_trn_hash[l].keys(): 116 | self.x_trn_hash[l][hash_code_trn].append(i) 117 | else: 118 | self.x_trn_hash[l][hash_code_trn] = [i] 119 | 120 | def get_approx_KNN(self, x_tst, K): 121 | N_tst = x_tst.shape[0] 122 | x_tst_knn = np.ones((N_tst, K)) * (-1) 123 | nns_len = np.zeros(N_tst) 124 | for i_tst in tqdm(range(N_tst)): 125 | nns = [] 126 | for l in range(self.n_hash_table): 127 | hash_code_int = lsh_function(self.t, x_tst[i_tst] / self.dist_rand, self.w[l, :, :], self.b[l, :]) 128 | hash_code_test = '.'.join(map(str, hash_code_int)) 129 | if hash_code_test in self.x_trn_hash[l].keys(): 130 | nns += self.x_trn_hash[l][hash_code_test] 131 | nns = np.unique(nns) 132 | num_collide_elements = len(nns) 133 | if 
len(nns) > 0: 134 | dist = [np.linalg.norm(self.x_trn[i] / self.dist_rand - x_tst[i_tst] / self.dist_rand, 2) for i in nns] 135 | dist_min_ind = nns[np.argsort(dist)] 136 | if num_collide_elements < K: 137 | x_tst_knn[i_tst, :num_collide_elements] = dist_min_ind[:num_collide_elements] 138 | else: 139 | x_tst_knn[i_tst, :] = dist_min_ind[:K] 140 | # pdb.set_trace() 141 | nns_len[i_tst] = len(nns) 142 | if i_tst % 100 == 0: 143 | print('get approximate knn %s' % i_tst) 144 | return x_tst_knn.astype(int), nns_len 145 | 146 | def compute_approx_shapley(self, x_tst_knn, y_tst, K): 147 | N_tst, K_star = x_tst_knn.shape 148 | # flag_sufficient = (x_tst_knn[:,-1]>=0) 149 | sp_approx = np.zeros((N_tst, self.N)) 150 | for j in tqdm(range(N_tst)): 151 | non_nan_index = np.where(x_tst_knn[j, :] >= 0)[0] 152 | if len(non_nan_index) == 0: 153 | continue 154 | K_tot = non_nan_index[-1] 155 | if K_tot == self.N: 156 | sp_approx[j, x_tst_knn[j, self.N - 1]] = self.equal(self.y_trn[x_tst_knn[j, self.N - 1]], 157 | y_tst[j]) / self.N 158 | for i in np.arange(K_tot - 1, -1, -1): 159 | sp_approx[j, x_tst_knn[j, i]] = sp_approx[j, x_tst_knn[j, i + 1]] + ( 160 | self.equal(self.y_trn[x_tst_knn[j, i]], y_tst[j]) - self.equal( 161 | self.y_trn[x_tst_knn[j, i + 1]], y_tst[j])) / K * min([K, i + 1]) / (i + 1) 162 | 163 | return sp_approx 164 | -------------------------------------------------------------------------------- /use_case/Noisy Label, Watermarking/pytorch_fitmodule/fit_module.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from collections import OrderedDict 4 | from functools import partial 5 | from torch.autograd import Variable 6 | from torch.nn import CrossEntropyLoss, Module 7 | from torch.optim import SGD 8 | import numpy 9 | 10 | from .utils import add_metrics_to_log, get_loader, log_to_message, ProgressBar 11 | 12 | 13 | DEFAULT_LOSS = CrossEntropyLoss() 14 | DEFAULT_OPTIMIZER = partial(SGD, lr=0.001, momentum=0.9) 15 | 16 | 17 | class FitModule(Module): 18 | 19 | def eval_hessian(self, loss_grad): 20 | cnt = 0 21 | for g in loss_grad: 22 | g_vector = g.contiguous().view(-1) if cnt == 0 else torch.cat([g_vector, g.contiguous().view(-1)]) 23 | cnt = 1 24 | l = g_vector.size(0) 25 | hessian = torch.zeros(l, l) 26 | for idx in range(l): 27 | g_vector[idx].requires_grad = True 28 | grad2rd = torch.autograd.grad(g_vector[idx], self.parameters(), create_graph=True) 29 | cnt = 0 30 | for g in grad2rd: 31 | g2 = g.contiguous().view(-1) if cnt == 0 else torch.cat([g2, g.contiguous().view(-1)]) 32 | cnt = 1 33 | hessian[idx] = g2 34 | return hessian.cpu().data.numpy() 35 | 36 | def fit(self, 37 | X, 38 | y, 39 | batch_size=32, 40 | epochs=10, 41 | verbose=1, 42 | validation_split=0., 43 | validation_data=None, 44 | shuffle=True, 45 | initial_epoch=0, 46 | seed=None, 47 | loss=DEFAULT_LOSS, 48 | optimizer=DEFAULT_OPTIMIZER, 49 | metrics=None): 50 | """Trains the model similar to Keras' .fit(...) method 51 | 52 | # Arguments 53 | X: training data Tensor. 54 | y: target data Tensor.i 55 | batch_size: integer. Number of samples per gradient update. 56 | epochs: integer, the number of times to iterate 57 | over the training data arrays. 58 | verbose: 0, 1. Verbosity mode. 59 | 0 = silent, 1 = verbose. 60 | validation_split: float between 0 and 1: 61 | fraction of the training data to be used as validation data. 
62 | The model will set apart this fraction of the training data, 63 | will not train on it, and will evaluate 64 | the loss and any model metrics 65 | on this data at the end of each epoch. 66 | validation_data: (x_val, y_val) tuple on which to evaluate 67 | the loss and any model metrics 68 | at the end of each epoch. The model will not 69 | be trained on this data. 70 | shuffle: boolean, whether to shuffle the training data 71 | before each epoch. 72 | initial_epoch: epoch at which to start training 73 | (useful for resuming a previous training run) 74 | seed: random seed. 75 | optimizer: training optimizer 76 | loss: training loss 77 | metrics: list of functions with signatures `metric(y_true, y_pred)` 78 | where y_true and y_pred are both Tensors 79 | 80 | # Returns 81 | list of OrderedDicts with training metrics 82 | """ 83 | if seed and seed >= 0: 84 | torch.manual_seed(seed) 85 | # Prepare validation data 86 | if validation_data: 87 | X_val, y_val = validation_data 88 | elif validation_split and 0. < validation_split < 1.: 89 | split = int(X.size()[0] * (1. - validation_split)) 90 | X, X_val = X[:split], X[split:] 91 | y, y_val = y[:split], y[split:] 92 | else: 93 | X_val, y_val = None, None 94 | # Build DataLoaders 95 | if isinstance(X, numpy.ndarray): 96 | X = torch.from_numpy(X).float() 97 | if isinstance(y, numpy.ndarray): 98 | y = torch.from_numpy(y).float() 99 | if isinstance(X_val, numpy.ndarray): 100 | X_val = torch.from_numpy(X_val).float() 101 | if isinstance(y_val, numpy.ndarray): 102 | y_val = torch.from_numpy(y_val).float() 103 | train_data = get_loader(X, y, batch_size, shuffle) 104 | # Compile optimizer 105 | opt = optimizer(self.parameters()) 106 | # Run training loop 107 | logs = [] 108 | self.train() 109 | for t in range(initial_epoch, epochs): 110 | if verbose: 111 | print("Epoch {0} / {1}".format(t+1, epochs)) 112 | # Setup logger 113 | if verbose: 114 | pb = ProgressBar(len(train_data)) 115 | log = OrderedDict() 116 | epoch_loss = 0.0 117 | # Run batches 118 | for batch_i, batch_data in enumerate(train_data): 119 | # Get batch data 120 | X_batch = Variable(batch_data[0], requires_grad=True).float() 121 | y_batch = Variable(batch_data[1], requires_grad=True).long() 122 | # Backprop 123 | opt.zero_grad() 124 | y_batch_pred = self(X_batch).float() 125 | batch_loss = loss(y_batch_pred, y_batch) 126 | batch_loss.backward() 127 | opt.step() 128 | # Update status 129 | epoch_loss += batch_loss.item() 130 | for param in self.parameters(): 131 | param.requires_grad = True 132 | # print(y_val) 133 | # hessian = self.eval_hessian(y) 134 | # print(hessian.shape) 135 | # print(hessian) 136 | log['loss'] = float(epoch_loss) / (batch_i + 1) 137 | if verbose: 138 | pb.bar(batch_i, log_to_message(log)) 139 | # Run metrics 140 | if metrics: 141 | y_train_pred = self.predict(X, batch_size) 142 | add_metrics_to_log(log, metrics, y, y_train_pred) 143 | if X_val is not None and y_val is not None: 144 | y_val_pred = self.predict(X_val, batch_size) 145 | val_loss = loss(Variable(y_val_pred, requires_grad=True), Variable(y_val, requires_grad=True)) 146 | log['val_loss'] = val_loss.data[0] 147 | if metrics: 148 | add_metrics_to_log(log, metrics, y_val, y_val_pred, 'val_') 149 | logs.append(log) 150 | if verbose: 151 | pb.close(log_to_message(log)) 152 | return logs 153 | 154 | def predict(self, X, batch_size=32): 155 | """Generates output predictions for the input samples. 156 | 157 | Computation is done in batches. 158 | 159 | # Arguments 160 | X: input data Tensor. 
161 | batch_size: integer. 162 | 163 | # Returns 164 | prediction Tensor. 165 | """ 166 | # Build DataLoader 167 | data = get_loader(X, batch_size=batch_size) 168 | # Batch prediction 169 | self.eval() 170 | r, n = 0, X.size()[0] 171 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 172 | for batch_data in data: 173 | # Predict on batch 174 | X_batch = Variable(batch_data[0].type('torch.FloatTensor').to(device), requires_grad=True).type('torch.FloatTensor').to(device) 175 | y_batch_pred = self(X_batch).data.type('torch.FloatTensor') 176 | # Infer prediction shape 177 | if r == 0: 178 | y_pred = (torch.zeros((n,) + y_batch_pred.size()[1:])).data.type('torch.FloatTensor') 179 | # Add to prediction tensor 180 | y_pred[r : min(n, r + batch_size)] = y_batch_pred 181 | r += batch_size 182 | return y_pred 183 | -------------------------------------------------------------------------------- /use_case/Noisy Label, Watermarking/models/resnet.py: -------------------------------------------------------------------------------- 1 | '''ResNet in PyTorch. 2 | 3 | BasicBlock and Bottleneck module is from the original ResNet paper: 4 | [1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun 5 | Deep Residual Learning for Image Recognition. arXiv:1512.03385 6 | 7 | PreActBlock and PreActBottleneck module is from the later paper: 8 | [2] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun 9 | Identity Mappings in Deep Residual Networks. arXiv:1603.05027 10 | ''' 11 | import torch 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | from pytorch_fitmodule import FitModule 15 | from torch.autograd import Variable 16 | import numpy as np 17 | 18 | 19 | def conv3x3(in_planes, out_planes, stride=1): 20 | return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=False) 21 | 22 | 23 | class BasicBlock(FitModule): 24 | expansion = 1 25 | 26 | def __init__(self, in_planes, planes, stride=1): 27 | super(BasicBlock, self).__init__() 28 | self.conv1 = conv3x3(in_planes, planes, stride) 29 | self.bn1 = nn.BatchNorm2d(planes) 30 | self.conv2 = conv3x3(planes, planes) 31 | self.bn2 = nn.BatchNorm2d(planes) 32 | 33 | self.shortcut = nn.Sequential() 34 | if stride != 1 or in_planes != self.expansion * planes: 35 | self.shortcut = nn.Sequential( 36 | nn.Conv2d(in_planes, self.expansion * planes, 37 | kernel_size=1, stride=stride, bias=False), 38 | nn.BatchNorm2d(self.expansion * planes) 39 | ) 40 | 41 | def forward(self, x): 42 | out = F.relu(self.bn1(self.conv1(x))) 43 | out = self.bn2(self.conv2(out)) 44 | out += self.shortcut(x) 45 | out = F.relu(out) 46 | return out 47 | 48 | 49 | class PreActBlock(FitModule): 50 | '''Pre-activation version of the BasicBlock.''' 51 | expansion = 1 52 | 53 | def __init__(self, in_planes, planes, stride=1): 54 | super(PreActBlock, self).__init__() 55 | self.bn1 = nn.BatchNorm2d(in_planes) 56 | self.conv1 = conv3x3(in_planes, planes, stride) 57 | self.bn2 = nn.BatchNorm2d(planes) 58 | self.conv2 = conv3x3(planes, planes) 59 | 60 | if stride != 1 or in_planes != self.expansion * planes: 61 | self.shortcut = nn.Sequential( 62 | nn.Conv2d(in_planes, self.expansion * planes, 63 | kernel_size=1, stride=stride, bias=False) 64 | ) 65 | 66 | def forward(self, x): 67 | out = F.relu(self.bn1(x)) 68 | shortcut = self.shortcut(out) if hasattr(self, 'shortcut') else x 69 | out = self.conv1(out) 70 | out = self.conv2(F.relu(self.bn2(out))) 71 | out += shortcut 72 | return out 73 | 74 | 75 | class Bottleneck(FitModule): 76 | expansion = 4 77 
| 78 | def __init__(self, in_planes, planes, stride=1): 79 | super(Bottleneck, self).__init__() 80 | self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False) 81 | self.bn1 = nn.BatchNorm2d(planes) 82 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) 83 | self.bn2 = nn.BatchNorm2d(planes) 84 | self.conv3 = nn.Conv2d(planes, self.expansion * planes, kernel_size=1, bias=False) 85 | self.bn3 = nn.BatchNorm2d(self.expansion * planes) 86 | 87 | self.shortcut = nn.Sequential() 88 | if stride != 1 or in_planes != self.expansion * planes: 89 | self.shortcut = nn.Sequential( 90 | nn.Conv2d(in_planes, self.expansion * planes, 91 | kernel_size=1, stride=stride, bias=False), 92 | nn.BatchNorm2d(self.expansion * planes) 93 | ) 94 | 95 | def forward(self, x): 96 | out = F.relu(self.bn1(self.conv1(x))) 97 | out = F.relu(self.bn2(self.conv2(out))) 98 | out = self.bn3(self.conv3(out)) 99 | out += self.shortcut(x) 100 | out = F.relu(out) 101 | return out 102 | 103 | 104 | class PreActBottleneck(FitModule): 105 | '''Pre-activation version of the original Bottleneck module.''' 106 | expansion = 4 107 | 108 | def __init__(self, in_planes, planes, stride=1): 109 | super(PreActBottleneck, self).__init__() 110 | self.bn1 = nn.BatchNorm2d(in_planes) 111 | self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False) 112 | self.bn2 = nn.BatchNorm2d(planes) 113 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) 114 | self.bn3 = nn.BatchNorm2d(planes) 115 | self.conv3 = nn.Conv2d(planes, self.expansion * planes, kernel_size=1, bias=False) 116 | 117 | if stride != 1 or in_planes != self.expansion * planes: 118 | self.shortcut = nn.Sequential( 119 | nn.Conv2d(in_planes, self.expansion * planes, 120 | kernel_size=1, stride=stride, bias=False) 121 | ) 122 | 123 | def forward(self, x): 124 | out = F.relu(self.bn1(x)) 125 | shortcut = self.shortcut(out) if hasattr(self, 'shortcut') else x 126 | out = self.conv1(out) 127 | out = self.conv2(F.relu(self.bn2(out))) 128 | out = self.conv3(F.relu(self.bn3(out))) 129 | out += shortcut 130 | return out 131 | 132 | 133 | class ResNet(FitModule): 134 | def __init__(self, block, num_blocks, num_classes=10): 135 | super(ResNet, self).__init__() 136 | self.in_planes = 64 137 | 138 | self.conv1 = conv3x3(3, 64) 139 | self.bn1 = nn.BatchNorm2d(64) 140 | self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1) 141 | self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2) 142 | self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2) 143 | self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2) 144 | self.linear = nn.Linear(512 * block.expansion, num_classes) 145 | 146 | def _make_layer(self, block, planes, num_blocks, stride): 147 | strides = [stride] + [1] * (num_blocks - 1) 148 | layers = [] 149 | for stride in strides: 150 | layers.append(block(self.in_planes, planes, stride)) 151 | self.in_planes = planes * block.expansion 152 | return nn.Sequential(*layers) 153 | 154 | def freeze_hidden_layers(self): 155 | self._freeze_layer(self.conv1) 156 | self._freeze_layer(self.bn1) 157 | self._freeze_layer(self.layer1) 158 | self._freeze_layer(self.layer2) 159 | self._freeze_layer(self.layer3) 160 | self._freeze_layer(self.layer4) 161 | 162 | def unfreeze_model(self): 163 | self._freeze_layer(self.conv1, freeze=False) 164 | self._freeze_layer(self.bn1, freeze=False) 165 | self._freeze_layer(self.layer1, freeze=False) 166 | 
self._freeze_layer(self.layer2, freeze=False) 167 | self._freeze_layer(self.layer3, freeze=False) 168 | self._freeze_layer(self.layer4, freeze=False) 169 | self._freeze_layer(self.linear, freeze=False) 170 | 171 | def embed_in_n_layer(self, n): 172 | self._freeze_layer(self.conv1) 173 | self._freeze_layer(self.bn1) 174 | if n == 1: 175 | self._freeze_layer(self.layer1) 176 | elif n == 2: 177 | self._freeze_layer(self.layer2) 178 | elif n == 3: 179 | self._freeze_layer(self.layer3) 180 | elif n == 4: 181 | self._freeze_layer(self.layer4) 182 | else: 183 | self._freeze_layer(self.linear) 184 | 185 | def _freeze_layer(self, layer, freeze=True): 186 | if freeze: 187 | for p in layer.parameters(): 188 | p.requires_grad = False 189 | else: 190 | for p in layer.parameters(): 191 | p.requires_grad = True 192 | 193 | def forward(self, x): 194 | x = x.float() 195 | out = F.relu(self.bn1(self.conv1(x).float()).float()) 196 | out = self.layer1(out) 197 | out = self.layer2(out) 198 | out = self.layer3(out) 199 | out = self.layer4(out) 200 | out = F.avg_pool2d(out, 4) 201 | out = out.view(out.size(0), -1) 202 | out = self.linear(out) 203 | return out 204 | 205 | def score(self, X, y): 206 | if isinstance(X, list): 207 | X = np.array(X) 208 | if isinstance(y, list): 209 | y = np.array(y) 210 | if isinstance(X, np.ndarray): 211 | X = torch.from_numpy(X) 212 | if isinstance(y, np.ndarray): 213 | y = torch.from_numpy(y) 214 | y_pred = self.predict(X) 215 | return np.mean(y.numpy() == np.argmax(y_pred.numpy(), axis=1)) 216 | 217 | 218 | def ResNet18(num_classes=10): 219 | return ResNet(PreActBlock, [2, 2, 2, 2], num_classes) 220 | 221 | 222 | def ResNet34(): 223 | return ResNet(BasicBlock, [3, 4, 6, 3]) 224 | 225 | 226 | def ResNet50(): 227 | return ResNet(Bottleneck, [3, 4, 6, 3]) 228 | 229 | 230 | def ResNet101(): 231 | return ResNet(Bottleneck, [3, 4, 23, 3]) 232 | 233 | 234 | def ResNet152(): 235 | return ResNet(Bottleneck, [3, 8, 36, 3]) 236 | 237 | 238 | def test(): 239 | net = ResNet18() 240 | y = net(Variable(torch.randn(1, 3, 32, 32), requires_grad=True)) 241 | print(y.size()) 242 | 243 | # test() 244 | -------------------------------------------------------------------------------- /use_case/Noisy Label, Watermarking/PlotPoisoning.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import os 5 | from sklearn.decomposition import PCA 6 | from sklearn.neighbors import KernelDensity 7 | from scipy.stats import multivariate_normal 8 | from sklearn.manifold import TSNE 9 | import seaborn as sns 10 | 11 | sns.set() 12 | 13 | watermarked = pickle.load(open("watermarked.pkl", "rb"), encoding = "iso-8859-1") 14 | # tmc_v = pickle.load(open('tmc.pkl', 'rb'), encoding = "iso-8859-1") 15 | # tmc_i = np.argsort(-tmc_v)[::-1] 16 | # cnt = 0 17 | # f = [] 18 | # total = 0 19 | # cnt = 0 20 | # for i in range(len(tmc_i)): 21 | # if watermarked[int(tmc_i[i])] == 1: 22 | # total += 1 23 | # for i in range(len(tmc_i)): 24 | # if watermarked[int(tmc_i[i])] == 1: 25 | # cnt += 1 26 | # f.append(1.0 * cnt / total) 27 | # x = np.array(range(1, len(tmc_i) + 1)) / len(tmc_i) * 100 28 | # x = np.append(x[0:-1:200], x[-1]) 29 | # f = np.append(f[0:-1:200], f[-1]) 30 | # plt.plot(x, np.array(f) * 100, 's-', color = 'blue', label = "TMC-Shapley") 31 | 32 | # g_v = pickle.load(open('g.pkl', 'rb'), encoding = "iso-8859-1") 33 | # g_i = np.argsort(-g_v)[::-1] 34 | # cnt = 0 35 | # f = [] 36 | # total = 0 37 | # cnt = 
0 38 | # for i in range(len(g_i)): 39 | # if watermarked[int(g_i[i])] == 1: 40 | # total += 1 41 | # for i in range(len(g_i)): 42 | # if watermarked[int(g_i[i])] == 1: 43 | # cnt += 1 44 | # f.append(1.0 * cnt / total) 45 | # x = np.array(range(1, len(g_i) + 1)) / len(g_i) * 100 46 | # x = np.append(x[0:-1:200], x[-1]) 47 | # f = np.append(f[0:-1:200], f[-1]) 48 | # plt.plot(x, np.array(f) * 100, 's-', color = 'orange', label = "G-Shapley", zorder=5) 49 | 50 | 51 | # loo_v = pickle.load(open('loo.pkl', 'rb'), encoding = "iso-8859-1")["loo"] 52 | # loo_i = np.argsort(-loo_v)[::-1] 53 | # cnt = 0 54 | # f = [] 55 | # total = 0 56 | # cnt = 0 57 | # for i in range(len(loo_i)): 58 | # if watermarked[int(loo_i[i])] == 1: 59 | # total += 1 60 | # for i in range(len(loo_i)): 61 | # if watermarked[int(loo_i[i])] == 1: 62 | # cnt += 1 63 | # f.append(1.0 * cnt / total) 64 | # x = np.array(range(1, len(loo_i) + 1)) / len(loo_i) * 100 65 | # x = np.append(x[0:-1:200], x[-1]) 66 | # f = np.append(f[0:-1:200], f[-1]) 67 | # plt.plot(x, np.array(f) * 100, '^-', color = 'olive', label = "Leave-One-Out", zorder=4, alpha=0.8) 68 | 69 | # for K in range(10, 11): 70 | # knn_v = pickle.load(open('knn_{}.pkl'.format(K), 'rb'), encoding = "iso-8859-1") 71 | # knn1_v = pickle.load(open('knn_layer1_{}.pkl'.format(K), 'rb'), encoding = "iso-8859-1") 72 | # knn2_v = pickle.load(open('knn_layer2_{}.pkl'.format(K), 'rb'), encoding = "iso-8859-1") 73 | # knn3_v = pickle.load(open('knn_layer3_{}.pkl'.format(K), 'rb'), encoding = "iso-8859-1") 74 | # knn_v = (knn1_v + knn2_v + knn3_v + knn_v) / 4 75 | # knn_i = np.argsort(-knn_v)[::-1] 76 | # cnt = 0 77 | # f = [] 78 | # total = 0 79 | # cnt = 0 80 | # for i in range(len(knn_i)): 81 | # if watermarked[int(knn_i[i])] == 1: 82 | # total += 1 83 | # for i in range(len(knn_i)): 84 | # if watermarked[int(knn_i[i])] == 1: 85 | # cnt += 1 86 | # f.append(1.0 * cnt / total) 87 | # x = np.array(range(1, len(knn_i) + 1)) / len(knn_i) * 100 88 | # plt.plot(x, np.array(f) * 100, color = 'violet', label = 'average-KNN-Shapley (k={})'.format(K)) 89 | 90 | # for K in range(10, 11): 91 | # knn_v = pickle.load(open('looknn_{}.pkl'.format(K), 'rb'), encoding = "iso-8859-1") 92 | # knn_i = np.argsort(-knn_v)[::-1] 93 | # cnt = 0 94 | # f = [] 95 | # total = 0 96 | # cnt = 0 97 | # a1 = [] 98 | # a0 = [] 99 | # for i in range(len(knn_i)): 100 | # if watermarked[int(knn_i[i])] == 1: 101 | # total += 1 102 | # for i in range(len(knn_i)): 103 | # if watermarked[int(knn_i[i])] == 1: 104 | # cnt += 1 105 | # f.append(1.0 * cnt / total) 106 | # x = np.array(range(1, len(knn_i) + 1)) / len(knn_i) * 100 107 | # x = np.append(x[0:-1:200], x[-1]) 108 | # f = np.append(f[0:-1:200], f[-1]) 109 | # plt.plot(x, np.array(f) * 100, 'o-', color='violet', label = 'KNN-LOO-Shapley'.format(K), zorder=6, alpha=0.8) 110 | 111 | 112 | for K in range(10, 11): 113 | knn_v = pickle.load(open('knn_{}.pkl'.format(K), 'rb'), encoding = "iso-8859-1") 114 | knn_v = np.mean(knn_v, axis=1) 115 | knn_i = np.argsort(-knn_v)[::-1] 116 | cnt = 0 117 | f = [] 118 | total = 0 119 | cnt = 0 120 | for i in range(len(knn_i)): 121 | if watermarked[int(knn_i[i])] == 1: 122 | total += 1 123 | for i in range(len(knn_i)): 124 | if watermarked[int(knn_i[i])] == 1: 125 | cnt += 1 126 | f.append(1.0 * cnt / total) 127 | x = np.array(range(1, len(knn_i) + 1)) / len(knn_i) * 100 128 | x = np.append(x[0:-1:200], x[-1]) 129 | f = np.append(f[0:-1:200], f[-1]) 130 | plt.plot(x, np.array(f) * 100, 'o-', color='purple', label = 
'KNN-Shapley'.format(K), linewidth=3) 131 | # for i in range(len(knn_i)): 132 | # if watermarked[int(knn_i[i])] == 1: 133 | # print(knn_v[knn_i[i]]) 134 | # a1.append(knn_v[knn_i[i]]) 135 | # else: 136 | # a0.append(knn_v[knn_i[i]]) 137 | # plt.hist(a0, bins=30, color='blue', histtype='stepfilled', label = 'benign data') 138 | # plt.hist(a1, bins=30, color='red', histtype='stepfilled', label = 'poisoned data') 139 | # 140 | for K in range(10, 11): 141 | knn_v = pickle.load(open('knn_{}.pkl'.format(K), 'rb'), encoding = "iso-8859-1") 142 | knn_v = np.max(knn_v, axis=1) 143 | knn_i = np.argsort(-knn_v)[::-1] 144 | cnt = 0 145 | f = [] 146 | total = 0 147 | cnt = 0 148 | for i in range(len(knn_i)): 149 | if watermarked[int(knn_i[i])] == 1: 150 | total += 1 151 | for i in range(len(knn_i)): 152 | if watermarked[int(knn_i[i])] == 1: 153 | cnt += 1 154 | f.append(1.0 * cnt / total) 155 | x = np.array(range(1, len(knn_i) + 1)) / len(knn_i) * 100 156 | x = np.append(x[0:-1:200], x[-1]) 157 | f = np.append(f[0:-1:200], f[-1]) 158 | plt.plot(x, np.array(f) * 100, 'o-', color='green', label = 'max-KNN-Shapley'.format(K), linewidth=3) 159 | # # a = [] 160 | # # plt.figure(figsize=(12, 6)) 161 | # # plt.subplot(121) 162 | # # for j in range(len(knn_v)): 163 | # # if not watermarked[j]: 164 | # # a.append(np.mean(knn_v[j])) 165 | # # plt.hist(a, bins=100, range=(0, 0.002), color='blue', histtype='stepfilled', label = 'benign (mean value)') 166 | # # a = [] 167 | # # plt.subplot(122) 168 | # # for j in range(len(knn_v)): 169 | # # if watermarked[j]: 170 | # # a.append(np.mean(knn_v[j])) 171 | # # plt.hist(a, bins=100, range=(0, 0.002), color='red', histtype='stepfilled', label = 'watermarked (mean value)') 172 | 173 | # knn_v = pickle.load(open('knn_{}.pkl'.format(K), 'rb'), encoding = "iso-8859-1") 174 | # knn_v = np.sort(knn_v, axis=1) 175 | # pca = PCA(n_components=2) 176 | # pca.fit(knn_v) 177 | # knn_v_pca = pca.fit_transform(knn_v) 178 | # # the bandwidth can be tunable 179 | # kde = KernelDensity(kernel='exponential', bandwidth=0.02).fit(knn_v_pca) 180 | # score = kde.score_samples(knn_v_pca) 181 | # knn_i = np.argsort(-score)[::-1] 182 | # cnt = 0 183 | # f = [] 184 | # total = 0 185 | # cnt = 0 186 | # for i in range(len(knn_i)): 187 | # if watermarked[int(knn_i[i])] == 1: 188 | # total += 1 189 | # for i in range(len(knn_i)): 190 | # if watermarked[int(knn_i[i])] == 1: 191 | # cnt += 1 192 | # f.append(1.0 * cnt / total) 193 | # x = np.array(range(1, len(knn_i) + 1)) / len(knn_i) * 100 194 | # plt.plot(x, np.array(f) * 100, color = 'blue', label = 'KDE-KNN-Shapley (k={})'.format(K)) 195 | 196 | 197 | # tsne = TSNE(n_components=2,perplexity=50) 198 | # knn_v_tsne = tsne.fit_transform(knn_v) 199 | # knn_mean = np.mean(knn_v_tsne, axis=0) 200 | # knn_cov = np.cov(knn_v_tsne, rowvar=0) 201 | # score = multivariate_normal.pdf(knn_v_tsne, mean=knn_mean, cov=knn_cov) 202 | # knn_i = np.argsort(-score)[::-1] 203 | # cnt = 0 204 | # f = [] 205 | # total = 0 206 | # cnt = 0 207 | # for i in range(len(knn_i)): 208 | # if watermarked[int(knn_i[i])] == 1: 209 | # total += 1 210 | # for i in range(len(knn_i)): 211 | # if watermarked[int(knn_i[i])] == 1: 212 | # cnt += 1 213 | # f.append(1.0 * cnt / total) 214 | # x = np.array(range(1, len(knn_i) + 1)) / len(knn_i) * 100 215 | # plt.plot(x, np.array(f) * 100, color = 'darkblue', label = 'TSNE-KNN-Shapley (k={})'.format(K)) 216 | 217 | # pca = PCA(n_components=2) 218 | # pca.fit(knn_v) 219 | # knn_v_pca = pca.fit_transform(knn_v) 220 | # knn_mean = 
np.mean(knn_v_pca, axis=0) 221 | # knn_cov = np.cov(knn_v_pca, rowvar=0) 222 | # score = multivariate_normal.pdf(knn_v_pca, mean=knn_mean, cov=knn_cov) 223 | # knn_i = np.argsort(-score)[::-1] 224 | # cnt = 0 225 | # f = [] 226 | # total = 0 227 | # cnt = 0 228 | # for i in range(len(knn_i)): 229 | # if watermarked[int(knn_i[i])] == 1: 230 | # total += 1 231 | # for i in range(len(knn_i)): 232 | # if watermarked[int(knn_i[i])] == 1: 233 | # cnt += 1 234 | # f.append(1.0 * cnt / total) 235 | # x = np.array(range(1, len(knn_i) + 1)) / len(knn_i) * 100 236 | # plt.plot(x, np.array(f) * 100, color = 'lightblue', label = 'Gaussian-KNN-Shapley (k={})'.format(K)) 237 | 238 | ran_v = np.random.rand(len(knn_v, )) 239 | ran_i = np.argsort(-ran_v)[::-1] 240 | cnt = 0 241 | f = [] 242 | total = 0 243 | cnt = 0 244 | for i in range(len(ran_i)): 245 | if watermarked[int(ran_i[i])] == 1: 246 | total += 1 247 | for i in range(len(ran_i)): 248 | if watermarked[int(ran_i[i])] == 1: 249 | cnt += 1 250 | f.append(1.0 * cnt / total) 251 | x = np.array(range(1, len(ran_i) + 1)) / len(ran_i) * 100 252 | f = x / 100 253 | plt.plot(x, np.array(f) * 100, '--', color='red', label = "Random", zorder=7) 254 | 255 | plt.xlabel('Fraction of data inspected (%)') 256 | plt.ylabel('Fraction of backdoor images detected (%)') 257 | plt.legend(loc='lower right') 258 | plt.show() -------------------------------------------------------------------------------- /reproduction/Cifar10/accuracy.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import bz2\n", 10 | "import numpy as np\n", 11 | "from tqdm import tqdm_notebook as tqdm\n", 12 | "import gzip\n", 13 | "from heapq import heappushpop\n", 14 | "from joblib import Parallel, delayed\n", 15 | "import time" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 2, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "data = np.load('CIFAR10_resnet50-keras_features.npz')" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 3, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "x_trn = np.vstack((data['features_training'], data['features_testing']))\n", 34 | "y_trn = np.hstack((data['labels_training'], data['labels_testing']))" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 4, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "from sklearn.utils import shuffle\n", 44 | "x_trn, y_trn = shuffle(x_trn, y_trn, random_state=0)" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 5, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "x_trn = np.reshape(x_trn, (-1, 2048))\n", 54 | "x_tst, y_tst = x_trn[:100], y_trn[:100]\n", 55 | "x_val, y_val = x_trn[100:1100], y_trn[100:1100]\n", 56 | "x_trn, y_trn = x_trn[1100:], y_trn[1100:]" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 7, 62 | "metadata": {}, 63 | "outputs": [ 64 | { 65 | "data": { 66 | "text/plain": [ 67 | "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n", 68 | " metric_params=None, n_jobs=None, n_neighbors=1, p=2,\n", 69 | " weights='uniform')" 70 | ] 71 | }, 72 | "execution_count": 7, 73 | "metadata": {}, 74 | "output_type": "execute_result" 75 | } 76 | ], 77 | "source": [ 78 | "from sklearn.neighbors import KNeighborsClassifier\n", 79 | "neigh = 
KNeighborsClassifier(n_neighbors=1)\n", 80 | "neigh.fit(x_trn, y_trn)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 8, 86 | "metadata": {}, 87 | "outputs": [ 88 | { 89 | "data": { 90 | "text/plain": [ 91 | "0.852" 92 | ] 93 | }, 94 | "execution_count": 8, 95 | "metadata": {}, 96 | "output_type": "execute_result" 97 | } 98 | ], 99 | "source": [ 100 | "neigh.score(x_val, y_val)" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 9, 106 | "metadata": {}, 107 | "outputs": [ 108 | { 109 | "data": { 110 | "text/plain": [ 111 | "0.81" 112 | ] 113 | }, 114 | "execution_count": 9, 115 | "metadata": {}, 116 | "output_type": "execute_result" 117 | } 118 | ], 119 | "source": [ 120 | "neigh.score(x_tst, y_tst)" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 6, 126 | "metadata": {}, 127 | "outputs": [ 128 | { 129 | "name": "stderr", 130 | "output_type": "stream", 131 | "text": [ 132 | "/home/wbx/tensorflow/lib/python3.5/site-packages/sklearn/linear_model/logistic.py:758: ConvergenceWarning: lbfgs failed to converge. Increase the number of iterations.\n", 133 | " \"of iterations.\", ConvergenceWarning)\n" 134 | ] 135 | } 136 | ], 137 | "source": [ 138 | "from sklearn.linear_model import LogisticRegression\n", 139 | "clf = LogisticRegression(random_state=0, solver='lbfgs',\n", 140 | " multi_class='multinomial').fit(x_trn, y_trn)" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 7, 146 | "metadata": {}, 147 | "outputs": [ 148 | { 149 | "data": { 150 | "text/plain": [ 151 | "0.91" 152 | ] 153 | }, 154 | "execution_count": 7, 155 | "metadata": {}, 156 | "output_type": "execute_result" 157 | } 158 | ], 159 | "source": [ 160 | "clf.score(x_val, y_val)" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 8, 166 | "metadata": {}, 167 | "outputs": [ 168 | { 169 | "data": { 170 | "text/plain": [ 171 | "0.87" 172 | ] 173 | }, 174 | "execution_count": 8, 175 | "metadata": {}, 176 | "output_type": "execute_result" 177 | } 178 | ], 179 | "source": [ 180 | "clf.score(x_tst, y_tst)" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 9, 186 | "metadata": {}, 187 | "outputs": [ 188 | { 189 | "data": { 190 | "text/plain": [ 191 | "0.9502886247877759" 192 | ] 193 | }, 194 | "execution_count": 9, 195 | "metadata": {}, 196 | "output_type": "execute_result" 197 | } 198 | ], 199 | "source": [ 200 | "clf.score(x_trn, y_trn)" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 7, 206 | "metadata": {}, 207 | "outputs": [ 208 | { 209 | "name": "stderr", 210 | "output_type": "stream", 211 | "text": [ 212 | "Using TensorFlow backend.\n" 213 | ] 214 | }, 215 | { 216 | "name": "stdout", 217 | "output_type": "stream", 218 | "text": [ 219 | "Epoch 1/5\n", 220 | "58900/58900 [==============================] - 11s 184us/step - loss: 0.4147 - acc: 0.8590\n", 221 | "Epoch 2/5\n", 222 | "58900/58900 [==============================] - 7s 113us/step - loss: 0.2936 - acc: 0.8993\n", 223 | "Epoch 3/5\n", 224 | "58900/58900 [==============================] - 7s 112us/step - loss: 0.2668 - acc: 0.9075\n", 225 | "Epoch 4/5\n", 226 | "58900/58900 [==============================] - 7s 114us/step - loss: 0.2493 - acc: 0.9139\n", 227 | "Epoch 5/5\n", 228 | "58900/58900 [==============================] - 7s 112us/step - loss: 0.2363 - acc: 0.9176\n" 229 | ] 230 | }, 231 | { 232 | "data": { 233 | "text/plain": [ 234 | "" 235 | ] 236 | }, 237 | "execution_count": 7, 238 | 
"metadata": {}, 239 | "output_type": "execute_result" 240 | } 241 | ], 242 | "source": [ 243 | "import os\n", 244 | "os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\" # see issue #152\n", 245 | "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"5\"\n", 246 | "\n", 247 | "from keras.models import Sequential\n", 248 | "from keras.layers import Dense, Activation\n", 249 | "\n", 250 | "model = Sequential()\n", 251 | "model.add(Dense(10, input_dim=2048))\n", 252 | "model.add(Activation('softmax'))\n", 253 | "\n", 254 | "model.compile(loss='sparse_categorical_crossentropy',\n", 255 | " optimizer='sgd',\n", 256 | " metrics=['accuracy'])\n", 257 | "\n", 258 | "model.fit(x_trn, y_trn, epochs=5, batch_size=32)" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": 8, 264 | "metadata": {}, 265 | "outputs": [ 266 | { 267 | "name": "stdout", 268 | "output_type": "stream", 269 | "text": [ 270 | "\r", 271 | "100/100 [==============================] - 0s 505us/step\n" 272 | ] 273 | }, 274 | { 275 | "data": { 276 | "text/plain": [ 277 | "[0.39001956582069397, 0.8700000047683716]" 278 | ] 279 | }, 280 | "execution_count": 8, 281 | "metadata": {}, 282 | "output_type": "execute_result" 283 | } 284 | ], 285 | "source": [ 286 | "model.evaluate(x_tst, y_tst, batch_size=128)" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": 9, 292 | "metadata": {}, 293 | "outputs": [ 294 | { 295 | "name": "stdout", 296 | "output_type": "stream", 297 | "text": [ 298 | "1000/1000 [==============================] - 0s 35us/step\n" 299 | ] 300 | }, 301 | { 302 | "data": { 303 | "text/plain": [ 304 | "[0.26140422391891477, 0.911]" 305 | ] 306 | }, 307 | "execution_count": 9, 308 | "metadata": {}, 309 | "output_type": "execute_result" 310 | } 311 | ], 312 | "source": [ 313 | "model.evaluate(x_val, y_val, batch_size=128)" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": 10, 319 | "metadata": {}, 320 | "outputs": [ 321 | { 322 | "name": "stdout", 323 | "output_type": "stream", 324 | "text": [ 325 | "Epoch 1/5\n", 326 | "58900/58900 [==============================] - 7s 115us/step - loss: 0.2279 - acc: 0.9210\n", 327 | "Epoch 2/5\n", 328 | "58900/58900 [==============================] - 6s 109us/step - loss: 0.2187 - acc: 0.9253\n", 329 | "Epoch 3/5\n", 330 | "58900/58900 [==============================] - 7s 121us/step - loss: 0.2126 - acc: 0.9269\n", 331 | "Epoch 4/5\n", 332 | "58900/58900 [==============================] - 7s 113us/step - loss: 0.2066 - acc: 0.9289\n", 333 | "Epoch 5/5\n", 334 | "58900/58900 [==============================] - 6s 110us/step - loss: 0.2015 - acc: 0.9309\n" 335 | ] 336 | }, 337 | { 338 | "data": { 339 | "text/plain": [ 340 | "" 341 | ] 342 | }, 343 | "execution_count": 10, 344 | "metadata": {}, 345 | "output_type": "execute_result" 346 | } 347 | ], 348 | "source": [ 349 | "model.fit(x_trn, y_trn, epochs=5, batch_size=32)" 350 | ] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "execution_count": 11, 355 | "metadata": {}, 356 | "outputs": [ 357 | { 358 | "name": "stdout", 359 | "output_type": "stream", 360 | "text": [ 361 | "\r", 362 | "100/100 [==============================] - 0s 47us/step\n" 363 | ] 364 | }, 365 | { 366 | "data": { 367 | "text/plain": [ 368 | "[0.36326342821121216, 0.8700000047683716]" 369 | ] 370 | }, 371 | "execution_count": 11, 372 | "metadata": {}, 373 | "output_type": "execute_result" 374 | } 375 | ], 376 | "source": [ 377 | "model.evaluate(x_tst, y_tst, batch_size=128)" 378 | ] 379 | }, 380 | { 381 | "cell_type": 
"code", 382 | "execution_count": 12, 383 | "metadata": {}, 384 | "outputs": [ 385 | { 386 | "name": "stdout", 387 | "output_type": "stream", 388 | "text": [ 389 | "1000/1000 [==============================] - 0s 30us/step\n" 390 | ] 391 | }, 392 | { 393 | "data": { 394 | "text/plain": [ 395 | "[0.2430015230178833, 0.9180000023841858]" 396 | ] 397 | }, 398 | "execution_count": 12, 399 | "metadata": {}, 400 | "output_type": "execute_result" 401 | } 402 | ], 403 | "source": [ 404 | "model.evaluate(x_val, y_val, batch_size=128)" 405 | ] 406 | }, 407 | { 408 | "cell_type": "code", 409 | "execution_count": 13, 410 | "metadata": {}, 411 | "outputs": [ 412 | { 413 | "name": "stdout", 414 | "output_type": "stream", 415 | "text": [ 416 | "Epoch 1/5\n", 417 | "58900/58900 [==============================] - 7s 117us/step - loss: 0.1968 - acc: 0.9324\n", 418 | "Epoch 2/5\n", 419 | "58900/58900 [==============================] - 7s 112us/step - loss: 0.1929 - acc: 0.9346\n", 420 | "Epoch 3/5\n", 421 | "58900/58900 [==============================] - 6s 96us/step - loss: 0.1887 - acc: 0.9349\n", 422 | "Epoch 4/5\n", 423 | "58900/58900 [==============================] - 6s 96us/step - loss: 0.1857 - acc: 0.9359\n", 424 | "Epoch 5/5\n", 425 | "58900/58900 [==============================] - 7s 113us/step - loss: 0.1823 - acc: 0.9382\n" 426 | ] 427 | }, 428 | { 429 | "data": { 430 | "text/plain": [ 431 | "" 432 | ] 433 | }, 434 | "execution_count": 13, 435 | "metadata": {}, 436 | "output_type": "execute_result" 437 | } 438 | ], 439 | "source": [ 440 | "model.fit(x_trn, y_trn, epochs=5, batch_size=32)" 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "execution_count": 14, 446 | "metadata": {}, 447 | "outputs": [ 448 | { 449 | "name": "stdout", 450 | "output_type": "stream", 451 | "text": [ 452 | "\r", 453 | "100/100 [==============================] - 0s 210us/step\n" 454 | ] 455 | }, 456 | { 457 | "data": { 458 | "text/plain": [ 459 | "[0.34444886445999146, 0.8600000143051147]" 460 | ] 461 | }, 462 | "execution_count": 14, 463 | "metadata": {}, 464 | "output_type": "execute_result" 465 | } 466 | ], 467 | "source": [ 468 | "model.evaluate(x_tst, y_tst, batch_size=128)" 469 | ] 470 | }, 471 | { 472 | "cell_type": "code", 473 | "execution_count": 16, 474 | "metadata": {}, 475 | "outputs": [ 476 | { 477 | "name": "stdout", 478 | "output_type": "stream", 479 | "text": [ 480 | "1000/1000 [==============================] - 0s 30us/step\n" 481 | ] 482 | }, 483 | { 484 | "data": { 485 | "text/plain": [ 486 | "[0.25385982036590576, 0.9090000023841858]" 487 | ] 488 | }, 489 | "execution_count": 16, 490 | "metadata": {}, 491 | "output_type": "execute_result" 492 | } 493 | ], 494 | "source": [ 495 | "model.evaluate(x_val, y_val, batch_size=128)" 496 | ] 497 | }, 498 | { 499 | "cell_type": "code", 500 | "execution_count": null, 501 | "metadata": {}, 502 | "outputs": [], 503 | "source": [] 504 | } 505 | ], 506 | "metadata": { 507 | "kernelspec": { 508 | "display_name": "Python 3", 509 | "language": "python", 510 | "name": "python3" 511 | }, 512 | "language_info": { 513 | "codemirror_mode": { 514 | "name": "ipython", 515 | "version": 3 516 | }, 517 | "file_extension": ".py", 518 | "mimetype": "text/x-python", 519 | "name": "python", 520 | "nbconvert_exporter": "python", 521 | "pygments_lexer": "ipython3", 522 | "version": "3.5.2" 523 | } 524 | }, 525 | "nbformat": 4, 526 | "nbformat_minor": 2 527 | } 528 | -------------------------------------------------------------------------------- /use_case/Noisy Label, 
Watermarking/shap_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import numpy as np 4 | from scipy.stats import logistic 5 | from scipy.stats import spearmanr 6 | from sklearn.naive_bayes import MultinomialNB 7 | from sklearn.linear_model import LogisticRegression 8 | from sklearn.linear_model import LinearRegression, Ridge 9 | from sklearn.metrics import r2_score 10 | from sklearn.neural_network import MLPRegressor, MLPClassifier 11 | from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier 12 | from sklearn.neighbors import KNeighborsClassifier 13 | from sklearn.tree import DecisionTreeClassifier 14 | from sklearn.naive_bayes import MultinomialNB, GaussianNB 15 | from sklearn.gaussian_process import GaussianProcessClassifier 16 | from sklearn.svm import SVC, LinearSVC 17 | from sklearn.base import clone 18 | import inspect 19 | from Shapley import ShapNN, CShapNN 20 | from multiprocessing import dummy as multiprocessing 21 | from sklearn.metrics import roc_auc_score, f1_score 22 | import warnings 23 | import tensorflow as tf 24 | import matplotlib.pyplot as plt 25 | import torch 26 | import torch.nn as nn 27 | import torch.nn.functional as F 28 | from torch.autograd import Variable 29 | from models import ResNet18 30 | 31 | def convergence_plots(marginals): 32 | 33 | plt.rcParams['figure.figsize'] = 15,15 34 | for i, idx in enumerate(np.arange(min(25, marginals.shape[-1]))): 35 | plt.subplot(5,5,i+1) 36 | plt.plot(np.cumsum(marginals[:, idx])/np.arange(1, len(marginals)+1)) 37 | 38 | 39 | def is_integer(array): 40 | return (np.equal(np.mod(array, 1), 0).mean()==1) 41 | 42 | 43 | def is_fitted(model): 44 | """Checks if model object has any attributes ending with an underscore""" 45 | return 0 < len( [k for k,v in inspect.getmembers(model) if k.endswith('_') and not k.startswith('__')] ) 46 | 47 | 48 | def return_model(mode, **kwargs): 49 | 50 | if mode=='logistic': 51 | solver = kwargs.get('solver', 'liblinear') 52 | n_jobs = kwargs.get('n_jobs', None) 53 | max_iter = kwargs.get('max_iter', 5000) 54 | model = LogisticRegression(solver=solver, n_jobs=n_jobs, 55 | max_iter=max_iter, random_state=666, 56 | multi_class='auto') 57 | elif mode=='Tree': 58 | model = DecisionTreeClassifier(random_state=666) 59 | elif mode=='RandomForest': 60 | n_estimators = kwargs.get('n_estimators', 50) 61 | model = RandomForestClassifier(n_estimators=n_estimators, random_state=666) 62 | elif mode=='GB': 63 | n_estimators = kwargs.get('n_estimators', 50) 64 | model = GradientBoostingClassifier(n_estimators=n_estimators, random_state=666) 65 | elif mode=='AdaBoost': 66 | n_estimators = kwargs.get('n_estimators', 50) 67 | model = AdaBoostClassifier(n_estimators=n_estimators, random_state=666) 68 | elif mode=='SVC': 69 | kernel = kwargs.get('kernel', 'rbf') 70 | model = SVC(kernel=kernel, random_state=666) 71 | elif mode=='LinearSVC': 72 | model = LinearSVC(loss='hinge', random_state=666) 73 | elif mode=='GP': 74 | model = GaussianProcessClassifier(random_state=666) 75 | elif mode=='KNN': 76 | n_neighbors = kwargs.get('n_neighbors', 5) 77 | model = KNeighborsClassifier(n_neighbors=n_neighbors) 78 | elif mode=='NB': 79 | model = MultinomialNB() 80 | elif mode=='linear': 81 | model = LinearRegression(random_state=666) 82 | elif mode=='ridge': 83 | alpha = kwargs.get('alpha', 1.0) 84 | model = Ridge(alpha=alpha, random_state=666) 85 | elif mode=='ResNet': 86 | model = 
ResNet18(num_classes=kwargs.get('num_classes', 10)) 87 | elif 'conv' in mode: 88 | tf.reset_default_graph() 89 | address = kwargs.get('address', 'weights/conv') 90 | hidden_units = kwargs.get('hidden_layer_sizes', [20]) 91 | activation = kwargs.get('activation', 'relu') 92 | weight_decay = kwargs.get('weight_decay', 1e-4) 93 | learning_rate = kwargs.get('learning_rate', 0.001) 94 | max_iter = kwargs.get('max_iter', 1000) 95 | early_stopping= kwargs.get('early_stopping', 10) 96 | warm_start = kwargs.get('warm_start', False) 97 | batch_size = kwargs.get('batch_size', 256) 98 | kernel_sizes = kwargs.get('kernel_sizes', [5]) 99 | strides = kwargs.get('strides', [5]) 100 | channels = kwargs.get('channels', [1]) 101 | validation_fraction = kwargs.get('validation_fraction', 0.) 102 | global_averaging = kwargs.get('global_averaging', 0.) 103 | optimizer = kwargs.get('optimizer', 'sgd') 104 | if mode=='conv': 105 | model = CShapNN(mode='classification', batch_size=batch_size, max_epochs=max_iter, 106 | learning_rate=learning_rate, 107 | weight_decay=weight_decay, validation_fraction=validation_fraction, 108 | early_stopping=early_stopping, 109 | optimizer=optimizer, warm_start=warm_start, address=address, 110 | hidden_units=hidden_units, 111 | strides=strides, global_averaging=global_averaging, 112 | kernel_sizes=kernel_sizes, channels=channels, random_seed=666) 113 | elif mode=='conv_reg': 114 | model = CShapNN(mode='regression', batch_size=batch_size, max_epochs=max_iter, 115 | learning_rate=learning_rate, 116 | weight_decay=weight_decay, validation_fraction=validation_fraction, 117 | early_stopping=early_stopping, 118 | optimizer=optimizer, warm_start=warm_start, address=address, 119 | hidden_units=hidden_units, 120 | strides=strides, global_averaging=global_averaging, 121 | kernel_sizes=kernel_sizes, channels=channels, random_seed=666) 122 | elif 'NN' in mode: 123 | solver = kwargs.get('solver', 'sgd') 124 | hidden_layer_sizes = kwargs.get('hidden_layer_sizes', (20,)) 125 | if isinstance(hidden_layer_sizes, list): 126 | hidden_layer_sizes = list(hidden_layer_sizes) 127 | activation = kwargs.get('activation', 'relu') 128 | learning_rate_init = kwargs.get('learning_rate', 0.001) 129 | max_iter = kwargs.get('max_iter', 5000) 130 | early_stopping= kwargs.get('early_stopping', False) 131 | warm_start = kwargs.get('warm_start', False) 132 | if mode=='NN': 133 | model = MLPClassifier(solver=solver, hidden_layer_sizes=hidden_layer_sizes, 134 | activation=activation, learning_rate_init=learning_rate_init, 135 | warm_start = warm_start, max_iter=max_iter, 136 | early_stopping=early_stopping) 137 | if mode=='NN_reg': 138 | model = MLPRegressor(solver=solver, hidden_layer_sizes=hidden_layer_sizes, 139 | activation=activation, learning_rate_init=learning_rate_init, 140 | warm_start = warm_start, max_iter=max_iter, early_stopping=early_stopping) 141 | else: 142 | raise ValueError("Invalid mode!") 143 | return model 144 | 145 | 146 | 147 | def generate_features(latent, dependency): 148 | 149 | features = [] 150 | n = latent.shape[0] 151 | exp = latent 152 | holder = latent 153 | for order in range(1,dependency+1): 154 | features.append(np.reshape(holder,[n,-1])) 155 | exp = np.expand_dims(exp,-1) 156 | holder = exp * np.expand_dims(holder,1) 157 | return np.concatenate(features,axis=-1) 158 | 159 | 160 | def label_generator(problem, X, param, difficulty=1, beta=None, important=None): 161 | 162 | if important is None or important > X.shape[-1]: 163 | important = X.shape[-1] 164 | dim_latent = sum([important**i 
for i in range(1, difficulty+1)]) 165 | if beta is None: 166 | beta = np.random.normal(size=[1, dim_latent]) 167 | important_dims = np.random.choice(X.shape[-1], important, replace=False) 168 | funct_init = lambda inp: np.sum(beta * generate_features(inp[:,important_dims], difficulty), -1) 169 | batch_size = max(100, min(len(X), 10000000//dim_latent)) 170 | y_true = np.zeros(len(X)) 171 | while True: 172 | try: 173 | for itr in range(int(np.ceil(len(X)/batch_size))): 174 | y_true[itr * batch_size: (itr+1) * batch_size] = funct_init( 175 | X[itr * batch_size: (itr+1) * batch_size]) 176 | break 177 | except MemoryError: 178 | batch_size = batch_size//2 179 | mean, std = np.mean(y_true), np.std(y_true) 180 | funct = lambda x: (np.sum(beta * generate_features( 181 | x[:, important_dims], difficulty), -1) - mean) / std 182 | y_true = (y_true - mean)/std 183 | if problem == 'classification': 184 | y_true = logistic.cdf(param * y_true) 185 | y = (np.random.random(X.shape[0]) < y_true).astype(int) 186 | elif problem == 'regression': 187 | y = y_true + param * np.random.normal(size=len(y_true)) 188 | else: 189 | raise ValueError('Invalid problem specified!') 190 | return beta, y, y_true, funct 191 | 192 | 193 | def one_iteration(clf, X, y, X_test, y_test, mean_score, tol=0.0, c=None, metric='accuracy'): 194 | """Runs one iteration of TMC-Shapley.""" 195 | 196 | if metric == 'auc': 197 | def score_func(clf, a, b): 198 | return roc_auc_score(b, clf.predict_proba(a)[:,1]) 199 | elif metric == 'accuracy': 200 | def score_func(clf, a, b): 201 | return clf.score(a, b) 202 | else: 203 | raise ValueError("Wrong metric!") 204 | if c is None: 205 | c = {i:np.array([i]) for i in range(len(X))} 206 | idxs, marginal_contribs = np.random.permutation(len(c.keys())), np.zeros(len(X)) 207 | new_score = np.max(np.bincount(y)) * 1./len(y) if np.mean(y//1 == y/1)==1 else 0.
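# new_score just above is the empty-coalition baseline of TMC-Shapley: when y holds integer class labels it is the majority-class accuracy, otherwise 0.
# The loop below grows the coalition along the random permutation idxs, refits clf after adding each (group of) point(s), records new_score - old_score as that point's marginal contribution, and truncates the pass once new_score is within a relative tolerance tol of mean_score.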
208 | start = 0 209 | if start: 210 | X_batch, y_batch =\ 211 | np.concatenate([X[c[idx]] for idx in idxs[:start]]), np.concatenate([y[c[idx]] for idx in idxs[:start]]) 212 | else: 213 | X_batch, y_batch = np.zeros((0,) + tuple(X.shape[1:])), np.zeros(0).astype(int) 214 | for n, idx in enumerate(idxs[start:]): 215 | try: 216 | clf = clone(clf) 217 | except: 218 | clf.fit(np.zeros((0,) + X.shape[1:]), y) 219 | old_score = new_score 220 | X_batch, y_batch = np.concatenate([X_batch, X[c[idx]]]), np.concatenate([y_batch, y[c[idx]]]) 221 | with warnings.catch_warnings(): 222 | warnings.simplefilter("ignore") 223 | try: 224 | clf.fit(X_batch, y_batch) 225 | temp_score = score_func(clf, X_test, y_test) 226 | if temp_score>-1 and temp_score<1.: #Removing measningless r2 scores 227 | new_score = temp_score 228 | except: 229 | continue 230 | marginal_contribs[c[idx]] = (new_score - old_score)/len(c[idx]) 231 | if np.abs(new_score - mean_score)/mean_score < tol: 232 | break 233 | return marginal_contribs, idxs 234 | 235 | 236 | def marginals(clf, X, y, X_test, y_test, c=None, tol=0., trials=3000, mean_score=None, metric='accuracy'): 237 | 238 | if metric == 'auc': 239 | def score_func(clf, a, b): 240 | return roc_auc_score(b, clf.predict_proba(a)[:,1]) 241 | elif metric == 'accuracy': 242 | def score_func(clf, a, b): 243 | return clf.score(a, b) 244 | else: 245 | raise ValueError("Wrong metric!") 246 | if mean_score is None: 247 | accs = [] 248 | for _ in range(100): 249 | bag_idxs = np.random.choice(len(y_test), len(y_test)) 250 | accs.append(score_func(clf, X_test[bag_idxs], y_test[bag_idxs])) 251 | mean_score = np.mean(accs) 252 | marginals, idxs = [], [] 253 | for trial in range(trials): 254 | if 10*(trial+1)/trials % 1 == 0: 255 | print('{} out of {}'.format(trial + 1, trials)) 256 | marginal, idx = one_iteration(clf, X, y, X_test, y_test, mean_score, tol=tol, c=c, metric=metric) 257 | marginals.append(marginal) 258 | idxs.append(idx) 259 | return np.array(marginals), np.array(idxs) 260 | 261 | def shapley(mode, X, y, X_test, y_test, stop=None, tol=0., trials=3000, **kwargs): 262 | 263 | try: 264 | vals = np.zeros(len(X)) 265 | example_idxs = np.random.choice(len(X), min(25, len(X)), replace=False) 266 | example_marginals = np.zeros((trials, len(example_idxs))) 267 | for i in range(trials): 268 | print(i) 269 | output = one_pass(mode, X, y, X_test, y_test, tol=tol, stop=stop, **kwargs) 270 | example_marginals[i] = output[0][example_idxs] 271 | vals = vals/(i+1) + output[0]/(i+1) 272 | return vals, example_marginals 273 | except KeyboardInterrupt: 274 | print('Interrupted!') 275 | return vals, example_marginals 276 | 277 | def early_stopping(marginals, idxs, stopping): 278 | 279 | stopped_marginals = np.zeros_like(marginals) 280 | for i in range(len(marginals)): 281 | stopped_marginals[i][idxs[i][:stopping]] = marginals[i][idxs[i][:stopping]] 282 | return np.mean(stopped_marginals, 0) 283 | 284 | def error(mem): 285 | 286 | if len(mem) < 100: 287 | return 1.0 288 | all_vals = (np.cumsum(mem, 0)/np.reshape(np.arange(1, len(mem)+1), (-1,1)))[-100:] 289 | errors = np.mean(np.abs(all_vals[-100:] - all_vals[-1:])/(np.abs(all_vals[-1:]) + 1e-12), -1) 290 | return np.max(errors) 291 | 292 | def my_accuracy_score(clf, X, y): 293 | 294 | probs = clf.predict_proba(X) 295 | predictions = np.argmax(probs, -1) 296 | return np.mean(np.equal(predictions, y)) 297 | 298 | def my_f1_score(clf, X, y): 299 | 300 | predictions = clf.predict(x) 301 | if len(set(y)) == 2: 302 | return f1_score(y, predictions) 303 | 
return f1_score(y, predictions, average='macro') 304 | 305 | def my_auc_score(clf, X, y): 306 | 307 | probs = clf.predict_proba(X) 308 | true_probs = probs[np.arange(len(y)), y] 309 | return roc_auc_score(y, true_probs) 310 | 311 | def my_xe_score(clf, X, y): 312 | 313 | probs = clf.predict_proba(X) 314 | true_probs = probs[np.arange(len(y)), y] 315 | true_log_probs = np.log(np.clip(true_probs, 1e-12, None)) 316 | return np.mean(true_log_probs) 317 | -------------------------------------------------------------------------------- /use_case/DataAcquisition/shap_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import numpy as np 4 | from scipy.stats import logistic 5 | from scipy.stats import spearmanr 6 | from sklearn.naive_bayes import MultinomialNB 7 | from sklearn.linear_model import LogisticRegression 8 | from sklearn.linear_model import LinearRegression, Ridge 9 | from sklearn.metrics import r2_score 10 | from sklearn.neural_network import MLPRegressor, MLPClassifier 11 | from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier 12 | from sklearn.ensemble import RandomForestRegressor 13 | from sklearn.neighbors import KNeighborsClassifier 14 | from sklearn.tree import DecisionTreeClassifier 15 | from sklearn.naive_bayes import MultinomialNB, GaussianNB 16 | from sklearn.gaussian_process import GaussianProcessClassifier 17 | from sklearn.svm import SVC, LinearSVC 18 | from sklearn import svm 19 | from sklearn.base import clone 20 | from Shapley import ShapNN, CShapNN 21 | from multiprocessing import dummy as multiprocessing 22 | from sklearn.metrics import roc_auc_score, f1_score 23 | import warnings 24 | import tensorflow as tf 25 | import matplotlib.pyplot as plt 26 | 27 | def convergence_plots(marginals): 28 | 29 | plt.figure(figsize=(20,20)) 30 | # plt.rcParams['figure.figsize'] = 15,15 31 | for i, idx in enumerate(np.arange(min(25, marginals.shape[-1]))): 32 | plt.subplot(5,5,i+1) 33 | plt.plot(np.cumsum(marginals[:, idx])/np.arange(1, len(marginals)+1)) 34 | plt.savefig('temp.png') 35 | 36 | def is_integer(array): 37 | return (np.equal(np.mod(array, 1), 0).mean()==1) 38 | 39 | 40 | def is_fitted(model): 41 | """Checks if model object has any attributes ending with an underscore""" 42 | return 0 < len( [k for k,v in inspect.getmembers(model) if k.endswith('_') and not k.startswith('__')] ) 43 | 44 | 45 | def return_model(mode, **kwargs): 46 | if mode=='logistic': 47 | solver = kwargs.get('solver', 'liblinear') 48 | n_jobs = kwargs.get('n_jobs', None) 49 | max_iter = kwargs.get('max_iter', 5000) 50 | model = LogisticRegression(solver=solver, n_jobs=n_jobs, 51 | max_iter=max_iter, random_state=666, multi_class='auto') 52 | elif mode=='Tree': 53 | model = DecisionTreeClassifier(random_state=666) 54 | elif mode=='RandomForest': 55 | n_estimators = kwargs.get('n_estimators', 25) 56 | model = RandomForestClassifier(n_estimators=n_estimators, random_state=666) 57 | elif mode=='RandomForestReg': 58 | n_estimators = kwargs.get('n_estimators', 50) 59 | # model = RandomForestRegressor(max_depth=4, n_estimators=n_estimators, random_state=666) 60 | # n_estimators = kwargs.get('n_estimators', 50) 61 | model = RandomForestRegressor(max_depth=100, n_estimators=n_estimators, random_state=666)#, min_samples_split = 0.05, min_samples_leaf = 0.001) 62 | # model = RandomForestClassifier(n_estimators=100, criterion = 'gini', max_features = None, min_samples_split = 0.05, min_samples_leaf 
= 0.001) 63 | elif mode == 'mlpreg': 64 | model = MLPRegressor(hidden_layer_sizes=(10,100), activation='relu', solver='adam', alpha=0.001,batch_size='auto', 65 | learning_rate='constant', learning_rate_init=0.01, power_t=0.5, max_iter=1000, shuffle=True, 66 | random_state=None, tol=0.0001, verbose=False, warm_start=False, momentum=0.9, 67 | nesterovs_momentum=True, early_stopping=False, validation_fraction=0.1, beta_1=0.9, beta_2=0.999, 68 | epsilon=1e-08) 69 | 70 | elif mode=='GB': 71 | n_estimators = kwargs.get('n_estimators', 50) 72 | model = GradientBoostingClassifier(n_estimators=n_estimators, random_state=666) 73 | elif mode=='AdaBoost': 74 | n_estimators = kwargs.get('n_estimators', 50) 75 | model = AdaBoostClassifier(n_estimators=n_estimators, random_state=666) 76 | elif mode=='SVC': 77 | kernel = kwargs.get('kernel', 'rbf') 78 | model = SVC(kernel=kernel, random_state=666) 79 | elif mode=='LinearSVC': 80 | model = LinearSVC(loss='hinge', random_state=666) 81 | elif mode=='GP': 82 | model = GaussianProcessClassifier(random_state=666) 83 | elif mode=='KNN': 84 | n_neighbors = kwargs.get('n_neighbors', 5) 85 | model = KNeighborsClassifier(n_neighbors=n_neighbors) 86 | elif mode=='NB': 87 | model = MultinomialNB() 88 | elif mode=='linear': 89 | model = LinearRegression(random_state=666) 90 | elif mode=='ridge': 91 | alpha = kwargs.get('alpha', 1.0) 92 | model = Ridge(alpha=alpha, random_state=666) 93 | elif mode=='uci': 94 | model = MLPClassifier(activation = 'logistic', solver='lbfgs', 95 | alpha=1e-4, hidden_layer_sizes=(6, 100), early_stopping=False, 96 | max_iter= 5000, 97 | random_state=666, warm_start = False) 98 | elif 'conv' in mode: 99 | tf.reset_default_graph() 100 | address = kwargs.get('address', 'weights/conv') 101 | hidden_units = kwargs.get('hidden_layer_sizes', [20]) 102 | activation = kwargs.get('activation', 'relu') 103 | weight_decay = kwargs.get('weight_decay', 1e-4) 104 | learning_rate = kwargs.get('learning_rate', 0.001) 105 | max_iter = kwargs.get('max_iter', 1000) 106 | early_stopping= kwargs.get('early_stopping', 10) 107 | warm_start = kwargs.get('warm_start', False) 108 | batch_size = kwargs.get('batch_size', 256) 109 | kernel_sizes = kwargs.get('kernel_sizes', [5]) 110 | strides = kwargs.get('strides', [5]) 111 | channels = kwargs.get('channels', [1]) 112 | validation_fraction = kwargs.get('validation_fraction', 0.) 113 | global_averaging = kwargs.get('global_averaging', 0.) 
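# The CNN hyperparameters collected above (plus the optimizer read just below) are forwarded unchanged to CShapNN: mode 'conv' builds a classification network, 'conv_reg' a regression one.
# Illustrative call only (argument values are not taken from this repo): return_model('conv', channels=[32], kernel_sizes=[5], batch_size=128)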
114 | optimizer = kwargs.get('optimizer', 'sgd') 115 | if mode=='conv': 116 | model = CShapNN(mode='classification', batch_size=batch_size, max_epochs=max_iter, 117 | learning_rate=learning_rate, 118 | weight_decay=weight_decay, validation_fraction=validation_fraction, 119 | early_stopping=early_stopping, 120 | optimizer=optimizer, warm_start=warm_start, address=address, 121 | hidden_units=hidden_units, 122 | strides=strides, global_averaging=global_averaging, 123 | kernel_sizes=kernel_sizes, channels=channels, random_seed=666) 124 | elif mode=='conv_reg': 125 | model = CShapNN(mode='regression', batch_size=batch_size, max_epochs=max_iter, 126 | learning_rate=learning_rate, 127 | weight_decay=weight_decay, validation_fraction=validation_fraction, 128 | early_stopping=early_stopping, 129 | optimizer=optimizer, warm_start=warm_start, address=address, 130 | hidden_units=hidden_units, 131 | strides=strides, global_averaging=global_averaging, 132 | kernel_sizes=kernel_sizes, channels=channels, random_seed=666) 133 | elif 'NN' in mode: 134 | solver = kwargs.get('solver', 'adam') 135 | hidden_layer_sizes = kwargs.get('hidden_layer_sizes', (20,)) 136 | if isinstance(hidden_layer_sizes, list): 137 | hidden_layer_sizes = list(hidden_layer_sizes) 138 | activation = kwargs.get('activation', 'relu') 139 | learning_rate_init = kwargs.get('learning_rate', 0.001) 140 | max_iter = kwargs.get('max_iter', 5000) 141 | early_stopping= kwargs.get('early_stopping', False) 142 | warm_start = kwargs.get('warm_start', False) 143 | if mode=='NN': 144 | model = MLPClassifier(solver=solver, hidden_layer_sizes=hidden_layer_sizes, 145 | activation=activation, learning_rate_init=learning_rate_init, 146 | warm_start = warm_start, max_iter=max_iter, verbose=False, 147 | early_stopping=early_stopping) 148 | if mode=='NN_reg': 149 | model = MLPRegressor(solver=solver, hidden_layer_sizes=hidden_layer_sizes, 150 | activation=activation, learning_rate_init=learning_rate_init, 151 | warm_start = warm_start, max_iter=max_iter, early_stopping=early_stopping) 152 | else: 153 | raise ValueError("Invalid mode!") 154 | return model 155 | 156 | 157 | 158 | def generate_features(latent, dependency): 159 | 160 | features = [] 161 | n = latent.shape[0] 162 | exp = latent 163 | holder = latent 164 | for order in range(1,dependency+1): 165 | features.append(np.reshape(holder,[n,-1])) 166 | exp = np.expand_dims(exp,-1) 167 | holder = exp * np.expand_dims(holder,1) 168 | return np.concatenate(features,axis=-1) 169 | 170 | 171 | def label_generator(problem, X, param, difficulty=1, beta=None, important=None): 172 | 173 | if important is None or important > X.shape[-1]: 174 | important = X.shape[-1] 175 | dim_latent = sum([important**i for i in range(1, difficulty+1)]) 176 | if beta is None: 177 | beta = np.random.normal(size=[1, dim_latent]) 178 | important_dims = np.random.choice(X.shape[-1], important, replace=False) 179 | funct_init = lambda inp: np.sum(beta * generate_features(inp[:,important_dims], difficulty), -1) 180 | batch_size = max(100, min(len(X), 10000000//dim_latent)) 181 | y_true = np.zeros(len(X)) 182 | while True: 183 | try: 184 | for itr in range(int(np.ceil(len(X)/batch_size))): 185 | y_true[itr * batch_size: (itr+1) * batch_size] = funct_init( 186 | X[itr * batch_size: (itr+1) * batch_size]) 187 | break 188 | except MemoryError: 189 | batch_size = batch_size//2 190 | # print(y_true[:10]) 191 | mean, std = np.mean(y_true), np.std(y_true) 192 | funct = lambda x: (np.sum(beta * generate_features( 193 | x[:, important_dims], 
difficulty), -1) - mean) / std 194 | y_true = (y_true - mean)/std 195 | if problem is 'classification': 196 | y_true = logistic.cdf(param * y_true) 197 | y = (np.random.random(X.shape[0]) < y_true).astype(int) 198 | elif problem is 'regression': 199 | y = y_true + param * np.random.normal(size=len(y_true)) 200 | else: 201 | raise ValueError('Invalid problem specified!') 202 | # print("beta\ty\t\t_true\tfunct") 203 | # print(beta,y[:10],y_true[:10],funct) 204 | return beta, y, y_true, funct 205 | 206 | 207 | def one_iteration(clf, X, y, X_test, y_test, mean_score, tol=0.0, c=None, metric='accuracy'): 208 | """Runs one iteration of TMC-Shapley.""" 209 | 210 | if metric == 'auc': 211 | def score_func(clf, a, b): 212 | return roc_auc_score(b, clf.predict_proba(a)[:,1]) 213 | elif metric == 'accuracy': 214 | def score_func(clf, a, b): 215 | return clf.score(a, b) 216 | else: 217 | raise ValueError("Wrong metric!") 218 | if c is None: 219 | c = {i:np.array([i]) for i in range(len(X))} 220 | idxs, marginal_contribs = np.random.permutation(len(c.keys())), np.zeros(len(X)) 221 | new_score = np.max(np.bincount(y)) * 1./len(y) if np.mean(y//1 == y/1)==1 else 0. 222 | start = 0 223 | if start: 224 | X_batch, y_batch =\ 225 | np.concatenate([X[c[idx]] for idx in idxs[:start]]), np.concatenate([y[c[idx]] for idx in idxs[:start]]) 226 | else: 227 | X_batch, y_batch = np.zeros((0,) + tuple(X.shape[1:])), np.zeros(0).astype(int) 228 | for n, idx in enumerate(idxs[start:]): 229 | try: 230 | clf = clone(clf) 231 | except: 232 | clf.fit(np.zeros((0,) + X.shape[1:]), y) 233 | old_score = new_score 234 | X_batch, y_batch = np.concatenate([X_batch, X[c[idx]]]), np.concatenate([y_batch, y[c[idx]]]) 235 | with warnings.catch_warnings(): 236 | warnings.simplefilter("ignore") 237 | try: 238 | clf.fit(X_batch, y_batch) 239 | temp_score = score_func(clf, X_test, y_test) 240 | if temp_score>-1 and temp_score<1.: #Removing measningless r2 scores 241 | new_score = temp_score 242 | except: 243 | continue 244 | marginal_contribs[c[idx]] = (new_score - old_score)/len(c[idx]) 245 | if np.abs(new_score - mean_score)/mean_score < tol: 246 | break 247 | return marginal_contribs, idxs 248 | 249 | 250 | def marginals(clf, X, y, X_test, y_test, c=None, tol=0., trials=3000, mean_score=None, metric='accuracy'): 251 | 252 | if metric == 'auc': 253 | def score_func(clf, a, b): 254 | return roc_auc_score(b, clf.predict_proba(a)[:,1]) 255 | elif metric == 'accuracy': 256 | def score_func(clf, a, b): 257 | return clf.score(a, b) 258 | else: 259 | raise ValueError("Wrong metric!") 260 | if mean_score is None: 261 | accs = [] 262 | for _ in range(100): 263 | bag_idxs = np.random.choice(len(y_test), len(y_test)) 264 | accs.append(score_func(clf, X_test[bag_idxs], y_test[bag_idxs])) 265 | mean_score = np.mean(accs) 266 | marginals, idxs = [], [] 267 | for trial in range(trials): 268 | if 10*(trial+1)/trials % 1 == 0: 269 | print('{} out of {}'.format(trial + 1, trials)) 270 | marginal, idx = one_iteration(clf, X, y, X_test, y_test, mean_score, tol=tol, c=c, metric=metric) 271 | marginals.append(marginal) 272 | idxs.append(idx) 273 | return np.array(marginals), np.array(idxs) 274 | 275 | def shapley(mode, X, y, X_test, y_test, stop=None, tol=0., trials=3000, **kwargs): 276 | 277 | try: 278 | vals = np.zeros(len(X)) 279 | example_idxs = np.random.choice(len(X), min(25, len(X)), replace=False) 280 | example_marginals = np.zeros((trials, len(example_idxs))) 281 | for i in range(trials): 282 | print(i) 283 | output = one_pass(mode, X, y, X_test, 
y_test, tol=tol, stop=stop, **kwargs) 284 | example_marginals[i] = output[0][example_idxs] 285 | vals = vals * i/(i+1) + output[0]/(i+1) # running average of the marginal contributions 286 | return vals, example_marginals 287 | except KeyboardInterrupt: 288 | print('Interrupted!') 289 | return vals, example_marginals 290 | 291 | def early_stopping(marginals, idxs, stopping): 292 | 293 | stopped_marginals = np.zeros_like(marginals) 294 | for i in range(len(marginals)): 295 | stopped_marginals[i][idxs[i][:stopping]] = marginals[i][idxs[i][:stopping]] 296 | return np.mean(stopped_marginals, 0) 297 | 298 | def error(mem): 299 | 300 | if len(mem) < 10: # too few iterations so far to estimate convergence 301 | return 1.0 302 | all_vals = (np.cumsum(mem, 0)/np.reshape(np.arange(1, len(mem)+1), (-1,1)))[-100:] 303 | errors = np.mean(np.abs(all_vals[-100:] - all_vals[-1:])/(np.abs(all_vals[-1:]) + 1e-12), -1) # relative change of the running Shapley estimates over the last 100 iterations; small values indicate convergence 304 | return np.max(errors) 305 | 306 | def my_accuracy_score(clf, X, y): 307 | 308 | probs = clf.predict_proba(X) 309 | predictions = np.argmax(probs, -1) 310 | return np.mean(np.equal(predictions, y)) 311 | 312 | def my_f1_score(clf, X, y): 313 | 314 | predictions = clf.predict(X) 315 | if len(set(y)) == 2: 316 | return f1_score(y, predictions) 317 | return f1_score(y, predictions, average='macro') 318 | 319 | def my_auc_score(clf, X, y): 320 | 321 | probs = clf.predict_proba(X) 322 | true_probs = probs[np.arange(len(y)), y] 323 | return roc_auc_score(y, true_probs) 324 | 325 | def my_xe_score(clf, X, y): 326 | 327 | probs = clf.predict_proba(X) 328 | true_probs = probs[np.arange(len(y)), y] 329 | true_log_probs = np.log(np.clip(true_probs, 1e-12, None)) 330 | return np.mean(true_log_probs) 331 | -------------------------------------------------------------------------------- /reproduction/YFCC100M/testlsh.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import bz2\n", 10 | "import numpy as np\n", 11 | "from tqdm import tqdm_notebook as tqdm\n", 12 | "import gzip\n", 13 | "from heapq import heappushpop\n", 14 | "from joblib import Parallel, delayed\n", 15 | "import time" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 2, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import joblib\n", 25 | "x_trn_hash = joblib.load('10M/eps0.1/x_trn_hash.pkl')\n", 26 | "w = joblib.load('10M/eps0.1/w.pkl')\n", 27 | "b = joblib.load('10M/eps0.1/b.pkl')" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 3, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "y_trn = joblib.load('y_trn.pkl')" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 5, 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "name": "stdout", 46 | "output_type": "stream", 47 | "text": [ 48 | "346.42120146751404\n", 49 | "336.7830514907837\n", 50 | "335.7016701698303\n", 51 | "307.22366166114807\n", 52 | "290.6147196292877\n", 53 | "315.5828514099121\n", 54 | "344.08966970443726\n", 55 | "337.0721056461334\n", 56 | "360.9251072406769\n", 57 | "421.92287850379944\n" 58 | ] 59 | } 60 | ], 61 | "source": [ 62 | "dataset_ids = []\n", 63 | "dataset_vals = []\n", 64 | "for data_id in range(10):\n", 65 | " st = time.time()\n", 66 | " dataset_val = np.load('x_trn_' + str(data_id) + '.npy')\n", 67 | " dataset_vals.append(dataset_val)\n",
68 | " print(time.time() - st)" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 6, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "class X:\n", 78 | " def __init__(self, data, offset):\n", 79 | " self.data = data\n", 80 | " self.offset = offset\n", 81 | " \n", 82 | " def __getitem__(self, key):\n", 83 | " index1 = (key + self.offset) // 1000000\n", 84 | " index2 = (key + self.offset) % 1000000\n", 85 | " return self.data[index1][index2]\n", 86 | " \n", 87 | " def __len__(self):\n", 88 | " l = 0\n", 89 | " for x in self.data:\n", 90 | " l += len(x)\n", 91 | " return l - self.offset" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 8, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "x_trn = X(dataset_vals, 1100)" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 11, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "y_tst = np.load('y_tst.npy')\n", 110 | "y_val = np.load('y_val.npy')\n", 111 | "x_tst = np.load('x_tst.npy')\n", 112 | "x_val = np.load('x_val.npy')" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 12, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "x_val2 = x_val[:100]\n", 122 | "y_val2 = y_val[:100]" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 13, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "def val_error(K, sp_gt):\n", 132 | " K_star = 10\n", 133 | " start = time.time()\n", 134 | " x_val_knn_approx, nns_vec = lsh.get_approx_KNN(x_val2, K_star)\n", 135 | " runtime_query = time.time() - start\n", 136 | " print(runtime_query)\n", 137 | " \n", 138 | " start = time.time()\n", 139 | " sp_approx = lsh.compute_approx_shapley(x_val_knn_approx, y_val2, K)\n", 140 | " runtime_approx_value = time.time() - start\n", 141 | " print('it takes %s to get appox knn value' % runtime_approx_value)\n", 142 | " \n", 143 | " sp_err_inf_val= np.linalg.norm(sp_gt - sp_approx,ord=np.inf, axis=1)\n", 144 | " print('max error %s'% np.percentile(sp_err_inf_val,90))\n", 145 | " return sp_approx" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 57, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "def test_error(K, sp_gt):\n", 155 | " K_star = 10\n", 156 | " start = time.time()\n", 157 | " x_tst_knn_approx, nns_vec = lsh.get_approx_KNN(x_tst, K_star)\n", 158 | " runtime_query = time.time() - start\n", 159 | " print(runtime_query)\n", 160 | " \n", 161 | " start = time.time()\n", 162 | " sp_approx = lsh.compute_approx_shapley(x_tst_knn_approx, y_tst, K)\n", 163 | " runtime_approx_value = time.time() - start\n", 164 | " print('it takes %s to get appox knn value' % runtime_approx_value)\n", 165 | " \n", 166 | " sp_err_inf_val= np.linalg.norm(sp_gt - sp_approx,ord=np.inf, axis=1)\n", 167 | " print('max error %s'% np.percentile(sp_err_inf_val,90))\n", 168 | " return sp_approx" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 15, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "dist_rand = np.load('10M/eps0.1/dist_rand.npy')\n", 178 | "dist_rand = np.mean(dist_rand, axis=0)" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 16, 184 | "metadata": {}, 185 | "outputs": [], 186 | "source": [ 187 | "sp_gt2 = np.load('10M/eps0.1/sp_gt2.npy')" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 56, 193 | "metadata": {}, 194 | "outputs": [], 195 | "source": 
[ 196 | "def equal(a, b):\n", 197 | " try:\n", 198 | " return not set.isdisjoint(a, b)\n", 199 | " except KeyError:\n", 200 | " return 0" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 52, 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "import numpy as np\n", 210 | "import pdb\n", 211 | "\n", 212 | "\n", 213 | "def lsh_function(t,x,w,b):\n", 214 | " # x is 1-d array\n", 215 | " h = np.floor((np.dot(w,x)+b)/t).astype(int)\n", 216 | " return h\n", 217 | "\n", 218 | "\n", 219 | "class LSH:\n", 220 | " def __init__(self,n_hash_bit,n_hash_table,x_trn,y_trn,t=0.1):\n", 221 | " self.n_hash_bit = n_hash_bit\n", 222 | " self.n_hash_table = n_hash_table\n", 223 | " self.t = t # width of projections\n", 224 | " self.x_trn = x_trn\n", 225 | " self.y_trn = y_trn\n", 226 | " self.N = len(x_trn)\n", 227 | " self.dim = 4096\n", 228 | " # draw w from a normal distribution (2-stable)\n", 229 | " self.w = np.random.normal(0, 1, (n_hash_table, n_hash_bit, self.dim))\n", 230 | " # draw b from U[0,t]\n", 231 | " self.b = np.random.uniform(0, self.t, (n_hash_table, n_hash_bit))\n", 232 | " self.x_trn_hash = [dict() for i in range(n_hash_table)]\n", 233 | "# for i in tqdm(range(self.N)):\n", 234 | "# hash_code_all = lsh_function(self.t, x_trn[i] / dist_rand, self.w, self.b)\n", 235 | "# for l in range(n_hash_table):\n", 236 | "# hash_code_trn = '.'.join(map(str, hash_code_all[l, :]))\n", 237 | "# if hash_code_trn in self.x_trn_hash[l].keys():\n", 238 | "# self.x_trn_hash[l][hash_code_trn].append(i)\n", 239 | "# else:\n", 240 | "# self.x_trn_hash[l][hash_code_trn] = [i]\n", 241 | "# if i % 1000 == 0:\n", 242 | "# print('build hash %s'%i)\n", 243 | "\n", 244 | " def get_approx_KNN(self,x_tst,K):\n", 245 | " N_tst = x_tst.shape[0]\n", 246 | " x_tst_knn = np.ones((N_tst, K)) * (-1)\n", 247 | " nns_len = np.zeros(N_tst)\n", 248 | " for i_tst in tqdm(range(N_tst)):\n", 249 | " nns = np.array([])\n", 250 | " for l in range(self.n_hash_table):\n", 251 | " hash_code_int = lsh_function(self.t, x_tst[i_tst] / dist_rand, self.w[l, :, :], self.b[l, :])\n", 252 | " hash_code_test = '.'.join(map(str, hash_code_int))\n", 253 | " if hash_code_test in self.x_trn_hash[l].keys():\n", 254 | " nns = np.append(nns, self.x_trn_hash[l][hash_code_test])\n", 255 | " nns = np.unique(nns)\n", 256 | " nns = nns.astype(int)\n", 257 | " num_collide_elements = len(nns)\n", 258 | " if len(nns) > 0:\n", 259 | " dist = [np.linalg.norm(self.x_trn[i] / dist_rand - x_tst[i_tst] / dist_rand, 2) for i in nns]\n", 260 | " dist_min_ind = nns[np.argsort(dist)]\n", 261 | " if num_collide_elements < K:\n", 262 | " x_tst_knn[i_tst, :num_collide_elements] = dist_min_ind[:num_collide_elements]\n", 263 | " else:\n", 264 | " x_tst_knn[i_tst, :] = dist_min_ind[:K]\n", 265 | " # pdb.set_trace()\n", 266 | " nns_len[i_tst] = len(nns)\n", 267 | " if i_tst % 100 == 0:\n", 268 | " print('get approximate knn %s'%i_tst)\n", 269 | " return x_tst_knn.astype(int),nns_len\n", 270 | "\n", 271 | "\n", 272 | " def compute_approx_shapley(self,x_tst_knn,y_tst,K):\n", 273 | " N_tst,K_star = x_tst_knn.shape\n", 274 | " # flag_sufficient = (x_tst_knn[:,-1]>=0)\n", 275 | " sp_approx = np.zeros((N_tst,self.N))\n", 276 | " for j in tqdm(range(N_tst)):\n", 277 | " non_nan_index = np.where(x_tst_knn[j,:]>=0)[0]\n", 278 | " if len(non_nan_index)== 0:\n", 279 | " continue\n", 280 | " K_tot = non_nan_index[-1]\n", 281 | " if K_tot == self.N:\n", 282 | " sp_approx[j, x_tst_knn[j, self.N - 1]] = equal(self.y_trn[x_tst_knn[j, self.N - 1]], 
y_tst[j]) / self.N\n", 283 | " for i in np.arange(K_tot - 1, -1, -1):\n", 284 | " sp_approx[j, x_tst_knn[j, i]] = sp_approx[j, x_tst_knn[j, i+1]] + (\n", 285 | " equal(self.y_trn[x_tst_knn[j, i]], y_tst[j]) - equal(\n", 286 | " self.y_trn[x_tst_knn[j, i + 1]], y_tst[j])) / K * min([K, i + 1]) / (i + 1)\n", 287 | "\n", 288 | "\n", 289 | "\n", 290 | " return sp_approx" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": 53, 296 | "metadata": {}, 297 | "outputs": [], 298 | "source": [ 299 | "lsh = LSH(14,75,x_trn,y_trn,t=2.203)" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": 61, 305 | "metadata": {}, 306 | "outputs": [], 307 | "source": [ 308 | "lsh.x_trn_hash = x_trn_hash\n", 309 | "lsh.w = w\n", 310 | "lsh.b = b" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": 63, 316 | "metadata": {}, 317 | "outputs": [ 318 | { 319 | "data": { 320 | "application/vnd.jupyter.widget-view+json": { 321 | "model_id": "e141ac778f1544ab8cc30572bf85e94d", 322 | "version_major": 2, 323 | "version_minor": 0 324 | }, 325 | "text/plain": [ 326 | "HBox(children=(IntProgress(value=0), HTML(value='')))" 327 | ] 328 | }, 329 | "metadata": {}, 330 | "output_type": "display_data" 331 | }, 332 | { 333 | "name": "stdout", 334 | "output_type": "stream", 335 | "text": [ 336 | "get approximate knn 0\n", 337 | "\n", 338 | "3972.0105855464935\n" 339 | ] 340 | }, 341 | { 342 | "data": { 343 | "application/vnd.jupyter.widget-view+json": { 344 | "model_id": "84515c129b3047b0bc94a3610e579a00", 345 | "version_major": 2, 346 | "version_minor": 0 347 | }, 348 | "text/plain": [ 349 | "HBox(children=(IntProgress(value=0), HTML(value='')))" 350 | ] 351 | }, 352 | "metadata": {}, 353 | "output_type": "display_data" 354 | }, 355 | { 356 | "name": "stdout", 357 | "output_type": "stream", 358 | "text": [ 359 | "\n", 360 | "it takes 0.09952259063720703 to get appox knn value\n", 361 | "max error 0.09141423452978795\n" 362 | ] 363 | } 364 | ], 365 | "source": [ 366 | "sp_gt2_approx = test_error(2, sp_gt2)" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": 59, 372 | "metadata": {}, 373 | "outputs": [], 374 | "source": [ 375 | "sp_gt = np.load('10M/eps0.1/sp_gt.npy')" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": 62, 381 | "metadata": {}, 382 | "outputs": [ 383 | { 384 | "data": { 385 | "application/vnd.jupyter.widget-view+json": { 386 | "model_id": "a2c40f5e121742eea136f2c9206e6784", 387 | "version_major": 2, 388 | "version_minor": 0 389 | }, 390 | "text/plain": [ 391 | "HBox(children=(IntProgress(value=0), HTML(value='')))" 392 | ] 393 | }, 394 | "metadata": {}, 395 | "output_type": "display_data" 396 | }, 397 | { 398 | "name": "stdout", 399 | "output_type": "stream", 400 | "text": [ 401 | "get approximate knn 0\n", 402 | "\n", 403 | "4183.600741863251\n" 404 | ] 405 | }, 406 | { 407 | "data": { 408 | "application/vnd.jupyter.widget-view+json": { 409 | "model_id": "d3c8e49679f1428a9e754b1c87846b58", 410 | "version_major": 2, 411 | "version_minor": 0 412 | }, 413 | "text/plain": [ 414 | "HBox(children=(IntProgress(value=0), HTML(value='')))" 415 | ] 416 | }, 417 | "metadata": {}, 418 | "output_type": "display_data" 419 | }, 420 | { 421 | "name": "stdout", 422 | "output_type": "stream", 423 | "text": [ 424 | "\n", 425 | "it takes 0.07110762596130371 to get appox knn value\n", 426 | "max error 0.09141423452978795\n" 427 | ] 428 | }, 429 | { 430 | "data": { 431 | "text/plain": [ 432 | "array([[0., 0., 0., ..., 
0., 0., 0.],\n", 433 | " [0., 0., 0., ..., 0., 0., 0.],\n", 434 | " [0., 0., 0., ..., 0., 0., 0.],\n", 435 | " ...,\n", 436 | " [0., 0., 0., ..., 0., 0., 0.],\n", 437 | " [0., 0., 0., ..., 0., 0., 0.],\n", 438 | " [0., 0., 0., ..., 0., 0., 0.]])" 439 | ] 440 | }, 441 | "execution_count": 62, 442 | "metadata": {}, 443 | "output_type": "execute_result" 444 | } 445 | ], 446 | "source": [ 447 | "test_error(1, sp_gt)" 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": 29, 453 | "metadata": {}, 454 | "outputs": [], 455 | "source": [ 456 | "for i in range(75):\n", 457 | " assert sum([len(v) for k, v in lsh.x_trn_hash[i].items()]) == 9998900 " 458 | ] 459 | }, 460 | { 461 | "cell_type": "code", 462 | "execution_count": 30, 463 | "metadata": {}, 464 | "outputs": [ 465 | { 466 | "name": "stdout", 467 | "output_type": "stream", 468 | "text": [ 469 | "-1.0.0.0.0.0.0.-1.-1.-2.0.0.0.1 [ 325606 3817062 8955573 9021150]\n" 470 | ] 471 | } 472 | ], 473 | "source": [ 474 | "for k, v in lsh.x_trn_hash[0].items():\n", 475 | " print(k, v)\n", 476 | " break" 477 | ] 478 | }, 479 | { 480 | "cell_type": "code", 481 | "execution_count": 64, 482 | "metadata": {}, 483 | "outputs": [], 484 | "source": [ 485 | "del sp_gt" 486 | ] 487 | }, 488 | { 489 | "cell_type": "code", 490 | "execution_count": 65, 491 | "metadata": {}, 492 | "outputs": [], 493 | "source": [ 494 | "sp_gt5 = np.load('10M/eps0.1/sp_gt5.npy')" 495 | ] 496 | }, 497 | { 498 | "cell_type": "code", 499 | "execution_count": 66, 500 | "metadata": {}, 501 | "outputs": [ 502 | { 503 | "data": { 504 | "application/vnd.jupyter.widget-view+json": { 505 | "model_id": "1a540a12f8104bf3a2736d83ddb6f911", 506 | "version_major": 2, 507 | "version_minor": 0 508 | }, 509 | "text/plain": [ 510 | "HBox(children=(IntProgress(value=0), HTML(value='')))" 511 | ] 512 | }, 513 | "metadata": {}, 514 | "output_type": "display_data" 515 | }, 516 | { 517 | "name": "stdout", 518 | "output_type": "stream", 519 | "text": [ 520 | "get approximate knn 0\n", 521 | "\n", 522 | "3920.30832695961\n" 523 | ] 524 | }, 525 | { 526 | "data": { 527 | "application/vnd.jupyter.widget-view+json": { 528 | "model_id": "00e25da22dca48f1979c248d99fb9249", 529 | "version_major": 2, 530 | "version_minor": 0 531 | }, 532 | "text/plain": [ 533 | "HBox(children=(IntProgress(value=0), HTML(value='')))" 534 | ] 535 | }, 536 | "metadata": {}, 537 | "output_type": "display_data" 538 | }, 539 | { 540 | "name": "stdout", 541 | "output_type": "stream", 542 | "text": [ 543 | "\n", 544 | "it takes 0.06524181365966797 to get appox knn value\n", 545 | "max error 0.09057708217212376\n" 546 | ] 547 | }, 548 | { 549 | "data": { 550 | "text/plain": [ 551 | "array([[0., 0., 0., ..., 0., 0., 0.],\n", 552 | " [0., 0., 0., ..., 0., 0., 0.],\n", 553 | " [0., 0., 0., ..., 0., 0., 0.],\n", 554 | " ...,\n", 555 | " [0., 0., 0., ..., 0., 0., 0.],\n", 556 | " [0., 0., 0., ..., 0., 0., 0.],\n", 557 | " [0., 0., 0., ..., 0., 0., 0.]])" 558 | ] 559 | }, 560 | "execution_count": 66, 561 | "metadata": {}, 562 | "output_type": "execute_result" 563 | } 564 | ], 565 | "source": [ 566 | "test_error(5, sp_gt5)" 567 | ] 568 | }, 569 | { 570 | "cell_type": "code", 571 | "execution_count": 1, 572 | "metadata": {}, 573 | "outputs": [ 574 | { 575 | "data": { 576 | "text/plain": [ 577 | "2" 578 | ] 579 | }, 580 | "execution_count": 1, 581 | "metadata": {}, 582 | "output_type": "execute_result" 583 | } 584 | ], 585 | "source": [] 586 | }, 587 | { 588 | "cell_type": "code", 589 | "execution_count": null, 590 | "metadata": 
{}, 591 | "outputs": [], 592 | "source": [] 593 | } 594 | ], 595 | "metadata": { 596 | "kernelspec": { 597 | "display_name": "Python 3", 598 | "language": "python", 599 | "name": "python3" 600 | }, 601 | "language_info": { 602 | "codemirror_mode": { 603 | "name": "ipython", 604 | "version": 3 605 | }, 606 | "file_extension": ".py", 607 | "mimetype": "text/x-python", 608 | "name": "python", 609 | "nbconvert_exporter": "python", 610 | "pygments_lexer": "ipython3", 611 | "version": "3.7.0" 612 | } 613 | }, 614 | "nbformat": 4, 615 | "nbformat_minor": 2 616 | } 617 | -------------------------------------------------------------------------------- /reproduction/ImageNet/accuracy.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import bz2\n", 10 | "import numpy as np\n", 11 | "from tqdm import tqdm_notebook as tqdm\n", 12 | "import gzip\n", 13 | "from heapq import heappushpop\n", 14 | "from joblib import Parallel, delayed\n", 15 | "import time" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 4, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import numpy as np\n", 25 | "import os\n", 26 | "\n", 27 | "classes = []\n", 28 | "\n", 29 | "for root, dirs, files in os.walk(\".\"): \n", 30 | " for filename in files:\n", 31 | " classes.append(filename)" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 5, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "from re import compile\n", 41 | "\n", 42 | "rex = compile('n[0-9]+')\n", 43 | "classes = [x for x in classes if rex.match(x)]" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 6, 49 | "metadata": {}, 50 | "outputs": [ 51 | { 52 | "data": { 53 | "application/vnd.jupyter.widget-view+json": { 54 | "model_id": "e6a4ec53f8b846c8b1f2f1363560bd8e", 55 | "version_major": 2, 56 | "version_minor": 0 57 | }, 58 | "text/plain": [ 59 | "HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))" 60 | ] 61 | }, 62 | "metadata": {}, 63 | "output_type": "display_data" 64 | }, 65 | { 66 | "name": "stdout", 67 | "output_type": "stream", 68 | "text": [ 69 | "\n" 70 | ] 71 | } 72 | ], 73 | "source": [ 74 | "x_trn = []\n", 75 | "y_trn = []\n", 76 | "for c in tqdm(classes):\n", 77 | " x_trn.append(np.load(c))\n", 78 | " y_trn += [c] * len(x_trn[-1])" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 7, 84 | "metadata": {}, 85 | "outputs": [ 86 | { 87 | "name": "stdout", 88 | "output_type": "stream", 89 | "text": [ 90 | "997659\n", 91 | "997659\n" 92 | ] 93 | } 94 | ], 95 | "source": [ 96 | "x_trn = np.vstack(x_trn)\n", 97 | "print(len(x_trn))\n", 98 | "print(len(y_trn))" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 8, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "from sklearn.utils import shuffle\n", 108 | "x_trn, y_trn = shuffle(x_trn, y_trn, random_state=0)" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 9, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "x_trn = np.reshape(x_trn, (-1, 2048))\n", 118 | "x_tst, y_tst = x_trn[:100], y_trn[:100]\n", 119 | "x_val, y_val = x_trn[100:1100], y_trn[100:1100]\n", 120 | "x_trn, y_trn = x_trn[1100:], y_trn[1100:]" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 9, 126 | "metadata": {}, 127 | "outputs": [ 128 
| { 129 | "data": { 130 | "text/plain": [ 131 | "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n", 132 | " metric_params=None, n_jobs=None, n_neighbors=1, p=2,\n", 133 | " weights='uniform')" 134 | ] 135 | }, 136 | "execution_count": 9, 137 | "metadata": {}, 138 | "output_type": "execute_result" 139 | } 140 | ], 141 | "source": [ 142 | "from sklearn.neighbors import KNeighborsClassifier\n", 143 | "neigh = KNeighborsClassifier(n_neighbors=1)\n", 144 | "neigh.fit(x_trn, y_trn)" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 10, 150 | "metadata": {}, 151 | "outputs": [ 152 | { 153 | "data": { 154 | "text/plain": [ 155 | "0.8" 156 | ] 157 | }, 158 | "execution_count": 10, 159 | "metadata": {}, 160 | "output_type": "execute_result" 161 | } 162 | ], 163 | "source": [ 164 | "neigh.score(x_tst, y_tst)" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 11, 170 | "metadata": {}, 171 | "outputs": [ 172 | { 173 | "data": { 174 | "text/plain": [ 175 | "0.681" 176 | ] 177 | }, 178 | "execution_count": 11, 179 | "metadata": {}, 180 | "output_type": "execute_result" 181 | } 182 | ], 183 | "source": [ 184 | "neigh.score(x_val, y_val)" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 8, 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "from sklearn.linear_model import LogisticRegression\n", 194 | "clf = LogisticRegression(random_state=0, solver='lbfgs',\n", 195 | " multi_class='multinomial', n_jobs=-1).fit(x_trn, y_trn)" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 9, 201 | "metadata": {}, 202 | "outputs": [ 203 | { 204 | "data": { 205 | "text/plain": [ 206 | "0.9" 207 | ] 208 | }, 209 | "execution_count": 9, 210 | "metadata": {}, 211 | "output_type": "execute_result" 212 | } 213 | ], 214 | "source": [ 215 | "clf.score(x_tst, y_tst)" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 10, 221 | "metadata": {}, 222 | "outputs": [ 223 | { 224 | "data": { 225 | "text/plain": [ 226 | "0.81" 227 | ] 228 | }, 229 | "execution_count": 10, 230 | "metadata": {}, 231 | "output_type": "execute_result" 232 | } 233 | ], 234 | "source": [ 235 | "clf.score(x_val, y_val)" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 11, 241 | "metadata": {}, 242 | "outputs": [ 243 | { 244 | "data": { 245 | "text/plain": [ 246 | "0.9549760726660439" 247 | ] 248 | }, 249 | "execution_count": 11, 250 | "metadata": {}, 251 | "output_type": "execute_result" 252 | } 253 | ], 254 | "source": [ 255 | "clf.score(x_trn, y_trn)" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": 12, 261 | "metadata": {}, 262 | "outputs": [ 263 | { 264 | "data": { 265 | "text/plain": [ 266 | "1000" 267 | ] 268 | }, 269 | "execution_count": 12, 270 | "metadata": {}, 271 | "output_type": "execute_result" 272 | } 273 | ], 274 | "source": [ 275 | "len(classes)" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": 14, 281 | "metadata": {}, 282 | "outputs": [], 283 | "source": [ 284 | "classes2idx = {}\n", 285 | "for i, x in enumerate(classes):\n", 286 | " classes2idx[x] = i" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": 15, 292 | "metadata": {}, 293 | "outputs": [], 294 | "source": [ 295 | "y_trn_idx = [classes2idx[x] for x in y_trn]" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": 17, 301 | "metadata": {}, 302 | "outputs": [ 303 | { 304 | "name": 
"stdout", 305 | "output_type": "stream", 306 | "text": [ 307 | "Epoch 1/5\n", 308 | "996559/996559 [==============================] - 134s 135us/step - loss: 1.2208 - acc: 0.7336\n", 309 | "Epoch 2/5\n", 310 | "996559/996559 [==============================] - 123s 123us/step - loss: 0.6757 - acc: 0.8191\n", 311 | "Epoch 3/5\n", 312 | "996559/996559 [==============================] - 122s 122us/step - loss: 0.5928 - acc: 0.8383\n", 313 | "Epoch 4/5\n", 314 | "996559/996559 [==============================] - 125s 125us/step - loss: 0.5440 - acc: 0.8506\n", 315 | "Epoch 5/5\n", 316 | "996559/996559 [==============================] - 122s 122us/step - loss: 0.5092 - acc: 0.8597\n" 317 | ] 318 | }, 319 | { 320 | "data": { 321 | "text/plain": [ 322 | "" 323 | ] 324 | }, 325 | "execution_count": 17, 326 | "metadata": {}, 327 | "output_type": "execute_result" 328 | } 329 | ], 330 | "source": [ 331 | "import os\n", 332 | "os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\" # see issue #152\n", 333 | "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"6\"\n", 334 | "\n", 335 | "from keras.models import Sequential\n", 336 | "from keras.layers import Dense, Activation\n", 337 | "\n", 338 | "model = Sequential()\n", 339 | "model.add(Dense(len(classes), input_dim=2048))\n", 340 | "model.add(Activation('softmax'))\n", 341 | "\n", 342 | "model.compile(loss='sparse_categorical_crossentropy',\n", 343 | " optimizer='sgd',\n", 344 | " metrics=['accuracy'])\n", 345 | "\n", 346 | "model.fit(x_trn, y_trn_idx, epochs=5, batch_size=32)" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": 18, 352 | "metadata": {}, 353 | "outputs": [], 354 | "source": [ 355 | "y_tst_idx = [classes2idx[x] for x in y_tst]\n", 356 | "y_val_idx = [classes2idx[x] for x in y_val]" 357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": 19, 362 | "metadata": {}, 363 | "outputs": [ 364 | { 365 | "name": "stdout", 366 | "output_type": "stream", 367 | "text": [ 368 | "\r", 369 | "100/100 [==============================] - 0s 384us/step\n" 370 | ] 371 | }, 372 | { 373 | "data": { 374 | "text/plain": [ 375 | "[0.3620190918445587, 0.9100000262260437]" 376 | ] 377 | }, 378 | "execution_count": 19, 379 | "metadata": {}, 380 | "output_type": "execute_result" 381 | } 382 | ], 383 | "source": [ 384 | "model.evaluate(x_tst, y_tst_idx, batch_size=128)" 385 | ] 386 | }, 387 | { 388 | "cell_type": "code", 389 | "execution_count": 20, 390 | "metadata": {}, 391 | "outputs": [ 392 | { 393 | "name": "stdout", 394 | "output_type": "stream", 395 | "text": [ 396 | "1000/1000 [==============================] - 0s 35us/step\n" 397 | ] 398 | }, 399 | { 400 | "data": { 401 | "text/plain": [ 402 | "[0.6134206886291504, 0.8280000019073487]" 403 | ] 404 | }, 405 | "execution_count": 20, 406 | "metadata": {}, 407 | "output_type": "execute_result" 408 | } 409 | ], 410 | "source": [ 411 | "model.evaluate(x_val, y_val_idx, batch_size=128)" 412 | ] 413 | }, 414 | { 415 | "cell_type": "code", 416 | "execution_count": 22, 417 | "metadata": {}, 418 | "outputs": [ 419 | { 420 | "name": "stdout", 421 | "output_type": "stream", 422 | "text": [ 423 | "Epoch 1/5\n", 424 | "996559/996559 [==============================] - 122s 123us/step - loss: 0.4698 - acc: 0.8701\n", 425 | "Epoch 2/5\n", 426 | "996559/996559 [==============================] - 123s 124us/step - loss: 0.4491 - acc: 0.8758\n", 427 | "Epoch 3/5\n", 428 | "996559/996559 [==============================] - 121s 122us/step - loss: 0.4316 - acc: 0.8806\n", 429 | "Epoch 4/5\n", 430 | 
"996559/996559 [==============================] - 121s 121us/step - loss: 0.4163 - acc: 0.8849\n", 431 | "Epoch 5/5\n", 432 | "996559/996559 [==============================] - 121s 122us/step - loss: 0.4026 - acc: 0.8889\n" 433 | ] 434 | }, 435 | { 436 | "data": { 437 | "text/plain": [ 438 | "" 439 | ] 440 | }, 441 | "execution_count": 22, 442 | "metadata": {}, 443 | "output_type": "execute_result" 444 | } 445 | ], 446 | "source": [ 447 | "model.fit(x_trn, y_trn_idx, epochs=5, batch_size=32)" 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": 23, 453 | "metadata": {}, 454 | "outputs": [ 455 | { 456 | "name": "stdout", 457 | "output_type": "stream", 458 | "text": [ 459 | "\r", 460 | "100/100 [==============================] - 0s 33us/step\n" 461 | ] 462 | }, 463 | { 464 | "data": { 465 | "text/plain": [ 466 | "[0.33740347623825073, 0.9100000262260437]" 467 | ] 468 | }, 469 | "execution_count": 23, 470 | "metadata": {}, 471 | "output_type": "execute_result" 472 | } 473 | ], 474 | "source": [ 475 | "model.evaluate(x_tst, y_tst_idx, batch_size=128)" 476 | ] 477 | }, 478 | { 479 | "cell_type": "code", 480 | "execution_count": 24, 481 | "metadata": {}, 482 | "outputs": [ 483 | { 484 | "name": "stdout", 485 | "output_type": "stream", 486 | "text": [ 487 | "1000/1000 [==============================] - 0s 42us/step\n" 488 | ] 489 | }, 490 | { 491 | "data": { 492 | "text/plain": [ 493 | "[0.5951442904472352, 0.8239999980926513]" 494 | ] 495 | }, 496 | "execution_count": 24, 497 | "metadata": {}, 498 | "output_type": "execute_result" 499 | } 500 | ], 501 | "source": [ 502 | "model.evaluate(x_val, y_val_idx, batch_size=128)" 503 | ] 504 | }, 505 | { 506 | "cell_type": "code", 507 | "execution_count": 25, 508 | "metadata": {}, 509 | "outputs": [ 510 | { 511 | "name": "stdout", 512 | "output_type": "stream", 513 | "text": [ 514 | "Epoch 1/5\n", 515 | "996559/996559 [==============================] - 126s 126us/step - loss: 0.3904 - acc: 0.8924\n", 516 | "Epoch 2/5\n", 517 | "996559/996559 [==============================] - 123s 123us/step - loss: 0.3790 - acc: 0.8960\n", 518 | "Epoch 3/5\n", 519 | "996559/996559 [==============================] - 115s 116us/step - loss: 0.3687 - acc: 0.8989\n", 520 | "Epoch 4/5\n", 521 | "996559/996559 [==============================] - 115s 116us/step - loss: 0.3591 - acc: 0.9015\n", 522 | "Epoch 5/5\n", 523 | "996559/996559 [==============================] - 124s 124us/step - loss: 0.3503 - acc: 0.9043\n" 524 | ] 525 | }, 526 | { 527 | "data": { 528 | "text/plain": [ 529 | "" 530 | ] 531 | }, 532 | "execution_count": 25, 533 | "metadata": {}, 534 | "output_type": "execute_result" 535 | } 536 | ], 537 | "source": [ 538 | "model.fit(x_trn, y_trn_idx, epochs=5, batch_size=32)" 539 | ] 540 | }, 541 | { 542 | "cell_type": "code", 543 | "execution_count": 26, 544 | "metadata": {}, 545 | "outputs": [ 546 | { 547 | "name": "stdout", 548 | "output_type": "stream", 549 | "text": [ 550 | "\r", 551 | "100/100 [==============================] - 0s 49us/step\n" 552 | ] 553 | }, 554 | { 555 | "data": { 556 | "text/plain": [ 557 | "[0.34398719668388367, 0.8999999761581421]" 558 | ] 559 | }, 560 | "execution_count": 26, 561 | "metadata": {}, 562 | "output_type": "execute_result" 563 | } 564 | ], 565 | "source": [ 566 | "model.evaluate(x_tst, y_tst_idx, batch_size=128)" 567 | ] 568 | }, 569 | { 570 | "cell_type": "code", 571 | "execution_count": 27, 572 | "metadata": {}, 573 | "outputs": [ 574 | { 575 | "name": "stdout", 576 | "output_type": "stream", 577 | 
"text": [ 578 | "1000/1000 [==============================] - 0s 30us/step\n" 579 | ] 580 | }, 581 | { 582 | "data": { 583 | "text/plain": [ 584 | "[0.5886992530822754, 0.8349999971389771]" 585 | ] 586 | }, 587 | "execution_count": 27, 588 | "metadata": {}, 589 | "output_type": "execute_result" 590 | } 591 | ], 592 | "source": [ 593 | "model.evaluate(x_val, y_val_idx, batch_size=128)" 594 | ] 595 | }, 596 | { 597 | "cell_type": "code", 598 | "execution_count": 10, 599 | "metadata": {}, 600 | "outputs": [ 601 | { 602 | "data": { 603 | "text/plain": [ 604 | "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n", 605 | " metric_params=None, n_jobs=None, n_neighbors=2, p=2,\n", 606 | " weights='uniform')" 607 | ] 608 | }, 609 | "execution_count": 10, 610 | "metadata": {}, 611 | "output_type": "execute_result" 612 | } 613 | ], 614 | "source": [ 615 | "from sklearn.neighbors import KNeighborsClassifier\n", 616 | "neigh2 = KNeighborsClassifier(n_neighbors=2)\n", 617 | "neigh2.fit(x_trn, y_trn)" 618 | ] 619 | }, 620 | { 621 | "cell_type": "code", 622 | "execution_count": 11, 623 | "metadata": {}, 624 | "outputs": [ 625 | { 626 | "data": { 627 | "text/plain": [ 628 | "0.81" 629 | ] 630 | }, 631 | "execution_count": 11, 632 | "metadata": {}, 633 | "output_type": "execute_result" 634 | } 635 | ], 636 | "source": [ 637 | "neigh2.score(x_tst, y_tst)" 638 | ] 639 | }, 640 | { 641 | "cell_type": "code", 642 | "execution_count": 12, 643 | "metadata": {}, 644 | "outputs": [ 645 | { 646 | "data": { 647 | "text/plain": [ 648 | "0.67" 649 | ] 650 | }, 651 | "execution_count": 12, 652 | "metadata": {}, 653 | "output_type": "execute_result" 654 | } 655 | ], 656 | "source": [ 657 | "neigh2.score(x_val, y_val)" 658 | ] 659 | }, 660 | { 661 | "cell_type": "code", 662 | "execution_count": 13, 663 | "metadata": {}, 664 | "outputs": [ 665 | { 666 | "data": { 667 | "text/plain": [ 668 | "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n", 669 | " metric_params=None, n_jobs=None, n_neighbors=5, p=2,\n", 670 | " weights='uniform')" 671 | ] 672 | }, 673 | "execution_count": 13, 674 | "metadata": {}, 675 | "output_type": "execute_result" 676 | } 677 | ], 678 | "source": [ 679 | "from sklearn.neighbors import KNeighborsClassifier\n", 680 | "neigh5 = KNeighborsClassifier(n_neighbors=5)\n", 681 | "neigh5.fit(x_trn, y_trn)" 682 | ] 683 | }, 684 | { 685 | "cell_type": "code", 686 | "execution_count": 14, 687 | "metadata": {}, 688 | "outputs": [ 689 | { 690 | "data": { 691 | "text/plain": [ 692 | "0.83" 693 | ] 694 | }, 695 | "execution_count": 14, 696 | "metadata": {}, 697 | "output_type": "execute_result" 698 | } 699 | ], 700 | "source": [ 701 | "neigh5.score(x_tst, y_tst)" 702 | ] 703 | }, 704 | { 705 | "cell_type": "code", 706 | "execution_count": 15, 707 | "metadata": {}, 708 | "outputs": [ 709 | { 710 | "data": { 711 | "text/plain": [ 712 | "0.735" 713 | ] 714 | }, 715 | "execution_count": 15, 716 | "metadata": {}, 717 | "output_type": "execute_result" 718 | } 719 | ], 720 | "source": [ 721 | "neigh5.score(x_val, y_val)" 722 | ] 723 | }, 724 | { 725 | "cell_type": "code", 726 | "execution_count": null, 727 | "metadata": {}, 728 | "outputs": [], 729 | "source": [] 730 | } 731 | ], 732 | "metadata": { 733 | "kernelspec": { 734 | "display_name": "Python 3", 735 | "language": "python", 736 | "name": "python3" 737 | }, 738 | "language_info": { 739 | "codemirror_mode": { 740 | "name": "ipython", 741 | "version": 3 742 | }, 743 | "file_extension": ".py", 744 | "mimetype": 
"text/x-python", 745 | "name": "python", 746 | "nbconvert_exporter": "python", 747 | "pygments_lexer": "ipython3", 748 | "version": "3.5.2" 749 | } 750 | }, 751 | "nbformat": 4, 752 | "nbformat_minor": 2 753 | } 754 | -------------------------------------------------------------------------------- /use_case/DataAcquisition/Shapley.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import tensorflow as tf 4 | from sklearn.model_selection import train_test_split 5 | from sklearn.metrics import roc_auc_score, f1_score 6 | 7 | class ShapNN(object): 8 | 9 | def __init__(self, mode, hidden_units=[100], learning_rate=0.001, 10 | dropout = 0., activation=None, initializer=None, 11 | weight_decay=0.0001, optimizer='adam', batch_size=128, 12 | warm_start=False, max_epochs=100, validation_fraction=0.1, 13 | early_stopping=0, address=None, test_batch_size=1000, 14 | random_seed=666): 15 | 16 | self.mode = mode 17 | self.batch_size = batch_size 18 | self.test_batch_size = test_batch_size 19 | self.hidden_units = hidden_units 20 | self.initializer = initializer 21 | self.activation = activation 22 | self.dropout = dropout 23 | self.weight_decay = weight_decay 24 | self.optimizer = optimizer 25 | self.learning_rate = learning_rate 26 | self.warm_start = warm_start 27 | self.max_epochs = max_epochs 28 | self.early_stopping = early_stopping 29 | self.validation_fraction = validation_fraction 30 | self.address = address 31 | self._extra_train_ops = [] 32 | self.random_seed = random_seed 33 | self.is_built = False 34 | 35 | def prediction_cost(self, X_test, y_test, batch_size=None): 36 | 37 | if batch_size is None: 38 | batch_size = self.test_batch_size 39 | assert len(set(y_test)) == self.num_classes, 'Number of classes does not match!' 40 | with self.graph.as_default(): 41 | losses = [] 42 | idxs = np.arange(len(X_test)) 43 | batches = [idxs[k * batch_size: (k+1) * batch_size] 44 | for k in range(int(np.ceil(len(idxs)/batch_size)))] 45 | for batch in batches: 46 | losses.append(self.sess.run(self.prediction_loss, {self.input_ph:X_test[batch], 47 | self.labels:y_test[batch]})) 48 | return np.mean(losses) 49 | 50 | def score(self, X_test, y_test, batch_size=None): 51 | 52 | if batch_size is None: 53 | batch_size = self.test_batch_size 54 | assert len(set(y_test)) == self.num_classes, 'Number of classes does not match!' 
55 | with self.graph.as_default(): 56 | scores = [] 57 | idxs = np.arange(len(X_test)) 58 | batches = [idxs[k * batch_size: (k+1) * batch_size] 59 | for k in range(int(np.ceil(len(idxs)/batch_size)))] 60 | for batch in batches: 61 | scores.append(self.sess.run(self.prediction_score, {self.input_ph:X_test[batch], 62 | self.labels:y_test[batch]})) 63 | return np.mean(scores) 64 | 65 | def predict_proba(self, X_test, batch_size=None): 66 | 67 | if batch_size is None: 68 | batch_size = self.test_batch_size 69 | with self.graph.as_default(): 70 | probs = [] 71 | idxs = np.arange(len(X_test)) 72 | batches = [idxs[k * batch_size: (k+1) * batch_size] 73 | for k in range(int(np.ceil(len(idxs)/batch_size)))] 74 | for batch in batches: 75 | probs.append(self.sess.run(self.probs, {self.input_ph:X_test[batch]})) 76 | return np.concatenate(probs, axis=0) 77 | 78 | def predict_log_proba(self, X_test, batch_size=None): 79 | 80 | if batch_size is None: 81 | batch_size = self.test_batch_size 82 | with self.graph.as_default(): 83 | probs = [] 84 | idxs = np.arange(len(X_test)) 85 | batches = [idxs[k * batch_size: (k+1) * batch_size] 86 | for k in range(int(np.ceil(len(idxs)/batch_size)))] 87 | for batch in batches: 88 | probs.append(self.sess.run(self.probs, {self.input_ph:X_test[batch]})) 89 | return np.log(np.clip(np.concatenate(probs), 1e-12, None)) 90 | 91 | def cost(self, X_test, y_test, batch_size=None): 92 | 93 | if batch_size is None: 94 | batch_size = self.batch_size 95 | with self.graph.as_default(): 96 | losss = [] 97 | idxs = np.arange(len(X_test)) 98 | batches = [idxs[k * batch_size: (k+1) * batch_size] 99 | for k in range(int(np.ceil(len(idxs)/batch_size)))] 100 | for batch in batches: 101 | losss.append(self.sess.run(self.prediction_loss, {self.input_ph:X_test[batch], 102 | self.labels:y_test[batch]})) 103 | return np.mean(losss) 104 | 105 | def predict(self, X_test, batch_size=None): 106 | 107 | if batch_size is None: 108 | batch_size = self.batch_size 109 | with self.graph.as_default(): 110 | predictions = [] 111 | idxs = np.arange(len(X_test)) 112 | batches = [idxs[k * batch_size: (k+1) * batch_size] 113 | for k in range(int(np.ceil(len(idxs)/batch_size)))] 114 | for batch in batches: 115 | predictions.append(self.sess.run(self.predictions, {self.input_ph:X_test[batch]})) 116 | return np.concatenate(predictions) 117 | 118 | def fit(self, X, y, X_val=None, y_val=None, sources=None, max_epochs=None, 119 | batch_size=None, save=False, load=False, sample_weight=None, 120 | metric='accuracy'): 121 | 122 | self.num_classes = len(set(y)) 123 | self.metric = metric 124 | if max_epochs is None: 125 | max_epochs = self.max_epochs 126 | if batch_size is None: 127 | batch_size = self.batch_size 128 | if not self.is_built: 129 | self.graph = tf.Graph() 130 | with self.graph.as_default(): 131 | config = tf.ConfigProto() 132 | config.gpu_options.allow_growth=True 133 | self.sess = tf.Session(config=config) 134 | with self.graph.as_default(): 135 | tf.set_random_seed(self.random_seed) 136 | try: 137 | self.global_step = tf.train.create_global_step() 138 | except ValueError: 139 | self.global_step = tf.train.get_global_step() 140 | if not self.is_built: 141 | self._build_model(X, y) 142 | self.saver = tf.train.Saver() 143 | self._initialize() 144 | if len(X): 145 | if X_val is None and self.validation_fraction * len(X) > 2: 146 | X_train, X_val, y_train, y_val, sample_weight, _ = train_test_split( 147 | X, y, sample_weight, test_size=self.validation_fraction) 148 | else: 149 | X_train, y_train = X, y 150 | 
self._train_model(X_train, y_train, X_val=X_val, y_val=y_val, 151 | max_epochs=max_epochs, batch_size=batch_size, 152 | sources=sources, sample_weight=sample_weight) 153 | if save and self.address is not None: 154 | self.saver.save(self.sess, self.address) 155 | 156 | def _train_model(self, X, y, X_val, y_val, max_epochs, batch_size, 157 | sources=None, sample_weight=None): 158 | 159 | 160 | assert len(X)==len(y), 'Input and labels not the same size' 161 | self.history = {'metrics':[], 'idxs':[]} 162 | stop_counter = 0 163 | best_performance = None 164 | for epoch in range(max_epochs): 165 | vals_metrics, idxs = self._one_epoch( 166 | X, y, X_val, y_val, batch_size, sources=sources, sample_weight=sample_weight) 167 | self.history['idxs'].append(idxs) 168 | self.history['metrics'].append(vals_metrics) 169 | if self.early_stopping and X_val is not None: 170 | current_performance = np.mean(val_acc) 171 | if best_performance is None: 172 | best_performance = current_performance 173 | if current_performance > best_performance: 174 | best_performance = current_performance 175 | stop_counter = 0 176 | else: 177 | stop_counter += 1 178 | if stop_counter > self.early_stopping: 179 | break 180 | 181 | def _one_epoch(self, X, y, X_val, y_val, batch_size, sources=None, sample_weight=None): 182 | 183 | vals = [] 184 | if sources is None: 185 | if sample_weight is None: 186 | idxs = np.random.permutation(len(X)) 187 | else: 188 | idxs = np.random.choice(len(X), len(X), p=sample_weight/np.sum(sample_weight)) 189 | batches = [idxs[k*batch_size:(k+1) * batch_size] 190 | for k in range(int(np.ceil(len(idxs)/batch_size)))] 191 | idxs = batches 192 | else: 193 | idxs = np.random.permutation(len(sources.keys())) 194 | batches = [sources[i] for i in idxs] 195 | for batch_counter, batch in enumerate(batches): 196 | self.sess.run(self.train_op, 197 | {self.input_ph:X[batch], self.labels:y[batch], 198 | self.dropout_ph:self.dropout}) 199 | if X_val is not None: 200 | if self.metric=='accuracy': 201 | vals.append(self.score(X_val, y_val)) 202 | elif self.metric=='f1': 203 | vals.append(f1_score(y_val, self.predict(X_val))) 204 | elif self.metric=='auc': 205 | vals.append(roc_auc_score(y_val, self.predict_proba(X_val)[:,1])) 206 | elif self.metric=='xe': 207 | vals.append(-self.prediction_cost(X_val, y_val)) 208 | return np.array(vals), np.array(idxs) 209 | 210 | def _initialize(self): 211 | 212 | uninitialized_vars = [] 213 | if self.warm_start: 214 | for var in tf.global_variables(): 215 | try: 216 | self.sess.run(var) 217 | except tf.errors.FailedPreconditionError: 218 | uninitialized_vars.append(var) 219 | else: 220 | uninitialized_vars = tf.global_variables() 221 | self.sess.run(tf.initializers.variables(uninitialized_vars)) 222 | 223 | def _build_model(self, X, y): 224 | 225 | self.num_classes = len(set(y)) 226 | if self.initializer is None: 227 | initializer = tf.initializers.variance_scaling(distribution='uniform') 228 | if self.activation is None: 229 | activation = lambda x: tf.nn.relu(x) 230 | self.input_ph = tf.placeholder(dtype=tf.float32, shape=(None,) + X.shape[1:], name='input') 231 | self.dropout_ph = tf.placeholder_with_default( 232 | tf.constant(0., dtype=tf.float32), shape=(), name='dropout') 233 | if self.mode=='regression': 234 | self.labels = tf.placeholder(dtype=tf.float32, shape=(None, ), name='label') 235 | else: 236 | self.labels = tf.placeholder(dtype=tf.int32, shape=(None, ), name='label') 237 | x = tf.reshape(self.input_ph, shape=(-1, np.prod(X.shape[1:]))) 238 | for layer, hidden_unit 
in enumerate(self.hidden_units): 239 | with tf.variable_scope('dense_{}'.format(layer)): 240 | x = self._dense(x, hidden_unit, dropout=self.dropout_ph, 241 | initializer=self.initializer, activation=activation) 242 | with tf.variable_scope('final'): 243 | self.prelogits = x 244 | self._final_layer(self.prelogits, self.num_classes, self.mode) 245 | self._build_train_op() 246 | 247 | def _build_train_op(self): 248 | 249 | """Build taining specific ops for the graph.""" 250 | learning_rate = tf.constant(self.learning_rate, tf.float32) ##fixit 251 | trainable_variables = tf.trainable_variables() 252 | grads = tf.gradients(self.loss, trainable_variables) 253 | self.grad_flat = tf.concat([tf.reshape(grad, (-1, 1)) for grad in grads], axis=0) 254 | if self.optimizer == 'sgd': 255 | optimizer = tf.train.GradientDescentOptimizer(learning_rate) 256 | elif self.optimizer == 'mom': 257 | optimizer = tf.train.MomentumOptimizer(learning_rate, 0.9) 258 | elif self.optimizer == 'adam': 259 | optimizer = tf.train.AdamOptimizer(learning_rate) 260 | apply_op = optimizer.apply_gradients( 261 | zip(grads, trainable_variables), 262 | global_step=self.global_step, name='train_step') 263 | train_ops = [apply_op] + self._extra_train_ops + tf.get_collection(tf.GraphKeys.UPDATE_OPS) 264 | previous_ops = [tf.group(*train_ops)] 265 | with tf.control_dependencies(previous_ops): 266 | self.train_op = tf.no_op(name='train') 267 | self.is_built = True 268 | 269 | def _final_layer(self, x, num_classes, mode): 270 | 271 | if mode=='regression': 272 | self.logits = self._dense(x, 1, dropout=self.dropout_ph) 273 | self.predictions = tf.reduce_sum(self.logits, axis=-1) 274 | regression_loss = tf.nn.l2_loss(self.predictions - self.labels) ##FIXIT 275 | self.prediction_loss = tf.reduce_mean(regression_loss, name='l2') 276 | residuals = self.predictions - self.labels 277 | var_predicted = tf.reduce_mean(residuals**2) - tf.reduce_mean(residuals)**2 278 | var_labels = tf.reduce_mean(self.labels**2) - tf.reduce_mean(self.labels)**2 279 | self.prediction_score = 1 - var_predicted/(var_labels + 1e-12) 280 | else: 281 | self.logits = self._dense(x, num_classes, dropout=self.dropout_ph) 282 | self.probs = tf.nn.softmax(self.logits) 283 | xent_loss = tf.nn.sparse_softmax_cross_entropy_with_logits( 284 | logits=self.logits, labels=tf.cast(self.labels, tf.int32)) 285 | self.prediction_loss = tf.reduce_mean(xent_loss, name='xent') 286 | self.predictions = tf.argmax(self.probs, axis=-1, output_type=tf.int32) 287 | correct_predictions = tf.equal(self.predictions, self.labels) 288 | self.prediction_score = tf.reduce_mean(tf.cast(correct_predictions, tf.float32)) 289 | self.loss = self.prediction_loss + self._reg_loss() 290 | 291 | def _dense(self, x, out_dim, dropout=tf.constant(0.), initializer=None, activation=None): 292 | 293 | if initializer is None: 294 | initializer = tf.initializers.variance_scaling(distribution='uniform') 295 | w = tf.get_variable('DW', [x.get_shape()[1], out_dim], initializer=initializer) 296 | b = tf.get_variable('Db', [out_dim], initializer=tf.constant_initializer()) 297 | x = tf.nn.dropout(x, 1. 
- dropout) 298 | if activation: 299 | x = activation(x) 300 | return tf.nn.xw_plus_b(x, w, b) 301 | 302 | def _reg_loss(self, order=2): 303 | """Regularization loss for weight decay.""" 304 | losss = [] 305 | for var in tf.trainable_variables(): 306 | if var.op.name.find(r'DW') > 0 or var.op.name.find(r'CW') > 0: ##FIXIT 307 | if order==2: 308 | losss.append(tf.nn.l2_loss(var)) 309 | elif order==1: 310 | losss.append(tf.abs(var)) 311 | else: 312 | raise ValueError("Invalid regularization order!") 313 | return tf.multiply(self.weight_decay, tf.add_n(losss)) 314 | 315 | 316 | class CShapNN(ShapNN): 317 | 318 | def __init__(self, mode, hidden_units=[100], kernel_sizes=[], 319 | strides=None, channels=[], learning_rate=0.001, 320 | dropout = 0., activation=None, initializer=None, global_averaging=False, 321 | weight_decay=0.0001, optimizer='adam', batch_size=128, 322 | warm_start=False, max_epochs=100, validation_fraction=0.1, 323 | early_stopping=0, address=None, test_batch_size=1000, random_seed=666): 324 | 325 | self.mode = mode 326 | self.test_batch_size = test_batch_size 327 | self.kernels = []#FIXIT 328 | self.kernel_sizes = kernel_sizes 329 | self.channels = channels 330 | self.global_averaging = global_averaging 331 | assert len(channels)==len(kernel_sizes), 'Invalid channels or kernel_sizes' 332 | if strides is None: 333 | self.strides = [1] * len(kernel_sizes) 334 | else: 335 | self.strides = strides 336 | self.batch_size = batch_size 337 | self.hidden_units = hidden_units 338 | self.initializer = initializer 339 | self.activation = activation 340 | self.dropout = dropout 341 | self.weight_decay = weight_decay 342 | self.optimizer = optimizer 343 | self.learning_rate = learning_rate 344 | self.warm_start = warm_start 345 | self.max_epochs = max_epochs 346 | self.early_stopping = early_stopping 347 | self.validation_fraction = validation_fraction 348 | self.address = address 349 | self._extra_train_ops = [] 350 | self.random_seed = random_seed 351 | self.graph = tf.Graph() 352 | self.is_built = False 353 | with self.graph.as_default(): 354 | config = tf.ConfigProto() 355 | config.gpu_options.allow_growth=True 356 | self.sess = tf.Session(config=config) 357 | 358 | def _conv(self, x, filter_size, out_filters, strides, activation=None): 359 | 360 | in_filters = int(x.get_shape()[-1]) 361 | n = filter_size * filter_size * out_filters 362 | kernel = tf.get_variable( 363 | 'DW', [filter_size, filter_size, in_filters, out_filters], 364 | tf.float32, initializer=tf.random_normal_initializer( 365 | stddev=np.sqrt(2.0/n))) 366 | self.kernels.append(kernel) 367 | x = tf.nn.conv2d(x, kernel, strides, padding='SAME') 368 | if activation: 369 | x = activation(x) 370 | return x 371 | 372 | def _stride_arr(self, stride): 373 | 374 | if isinstance(stride, int): 375 | return [1, stride, stride, 1] 376 | if len(stride)==2: 377 | return [1, stride[0], stride[1], 1] 378 | if len(stride)==4: 379 | return stride 380 | raise ValueError('Invalid value!') 381 | 382 | def _build_model(self, X, y): 383 | 384 | 385 | if self.initializer is None: 386 | initializer = tf.initializers.variance_scaling(distribution='uniform') 387 | if self.activation is None: 388 | activation = lambda x: tf.nn.relu(x) 389 | self.input_ph = tf.placeholder(dtype=tf.float32, shape=(None,) + X.shape[1:], name='input') 390 | self.dropout_ph = tf.placeholder_with_default( 391 | tf.constant(0., dtype=tf.float32), shape=(), name='dropout') 392 | if self.mode=='regression': 393 | self.labels = tf.placeholder(dtype=tf.float32, shape=(None, ), 
name='label') 394 | else: 395 | self.labels = tf.placeholder(dtype=tf.int32, shape=(None, ), name='label') 396 | if len(X.shape[1:]) == 2: 397 | x = tf.reshape(self.input_ph, [-1, X.shape[0], X.shape[1], 1]) 398 | else: 399 | x = self.input_ph 400 | for layer, (kernel_size, channels, stride) in enumerate(zip( 401 | self.kernel_sizes, self.channels, self.strides)): 402 | with tf.variable_scope('conv_{}'.format(layer)): 403 | x = self._conv(x, kernel_size, channels, self._stride_arr(stride), activation=activation) 404 | if self.global_averaging: 405 | x = tf.reduce_mean(x, axis=(1,2)) 406 | else: 407 | x = tf.reshape(x, shape=(-1, np.prod(x.get_shape()[1:]))) 408 | for layer, hidden_unit in enumerate(self.hidden_units): 409 | with tf.variable_scope('dense_{}'.format(layer)): 410 | x = self._dense(x, hidden_unit, dropout=self.dropout_ph, 411 | initializer=self.initializer, activation=activation) 412 | 413 | with tf.variable_scope('final'): 414 | self.prelogits = x 415 | self._final_layer(self.prelogits, len(set(y)), self.mode) 416 | self._build_train_op() 417 | -------------------------------------------------------------------------------- /use_case/Noisy Label, Watermarking/Shapley.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import tensorflow as tf 4 | from sklearn.model_selection import train_test_split 5 | from sklearn.metrics import roc_auc_score, f1_score 6 | 7 | class ShapNN(object): 8 | 9 | def __init__(self, mode, hidden_units=[100], learning_rate=0.001, 10 | dropout = 0., activation=None, initializer=None, 11 | weight_decay=0.0001, optimizer='adam', batch_size=128, 12 | warm_start=False, max_epochs=100, validation_fraction=0.1, 13 | early_stopping=0, address=None, test_batch_size=1000, 14 | random_seed=666, num_classes=10): 15 | 16 | self.mode = mode 17 | self.batch_size = batch_size 18 | self.test_batch_size = test_batch_size 19 | self.hidden_units = hidden_units 20 | self.initializer = initializer 21 | self.activation = activation 22 | self.dropout = dropout 23 | self.weight_decay = weight_decay 24 | self.optimizer = optimizer 25 | self.learning_rate = learning_rate 26 | self.warm_start = warm_start 27 | self.max_epochs = max_epochs 28 | self.early_stopping = early_stopping 29 | self.validation_fraction = validation_fraction 30 | self.address = address 31 | self._extra_train_ops = [] 32 | self.random_seed = random_seed 33 | self.is_built = False 34 | self.num_classes = num_classes 35 | 36 | def prediction_cost(self, X_test, y_test, batch_size=None): 37 | 38 | if batch_size is None: 39 | batch_size = self.test_batch_size 40 | # assert len(set(y_test)) == self.num_classes, 'Number of classes does not match!' 41 | with self.graph.as_default(): 42 | losses = [] 43 | idxs = np.arange(len(X_test)) 44 | batches = [idxs[k * batch_size: (k+1) * batch_size] 45 | for k in range(int(np.ceil(len(idxs)/batch_size)))] 46 | for batch in batches: 47 | losses.append(self.sess.run(self.prediction_loss, {self.input_ph:X_test[batch], 48 | self.labels:y_test[batch]})) 49 | return np.mean(losses) 50 | 51 | def score(self, X_test, y_test, batch_size=None): 52 | 53 | if batch_size is None: 54 | batch_size = self.test_batch_size 55 | # assert len(set(y_test)) == self.num_classes, 'Number of classes does not match!' 
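        # Note: compared with the DataAcquisition version of this file, the class-count
        # assert above is commented out and num_classes is instead passed to __init__
        # (default 10), presumably so that small evaluation batches in the noisy-label /
        # watermarking experiments do not have to contain every class.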
56 | with self.graph.as_default(): 57 | scores = [] 58 | idxs = np.arange(len(X_test)) 59 | batches = [idxs[k * batch_size: (k+1) * batch_size] 60 | for k in range(int(np.ceil(1.0 * len(idxs)/batch_size)))] 61 | for batch in batches: 62 | scores.append(self.sess.run(self.prediction_score, {self.input_ph:X_test[batch], 63 | self.labels:y_test[batch]})) 64 | return np.mean(scores) 65 | 66 | def predict_proba(self, X_test, batch_size=None): 67 | 68 | if batch_size is None: 69 | batch_size = self.test_batch_size 70 | with self.graph.as_default(): 71 | probs = [] 72 | idxs = np.arange(len(X_test)) 73 | batches = [idxs[k * batch_size: (k+1) * batch_size] 74 | for k in range(int(np.ceil(len(idxs)/batch_size)))] 75 | for batch in batches: 76 | probs.append(self.sess.run(self.probs, {self.input_ph:X_test[batch]})) 77 | return np.concatenate(probs, axis=0) 78 | 79 | def predict_log_proba(self, X_test, batch_size=None): 80 | 81 | if batch_size is None: 82 | batch_size = self.test_batch_size 83 | with self.graph.as_default(): 84 | probs = [] 85 | idxs = np.arange(len(X_test)) 86 | batches = [idxs[k * batch_size: (k+1) * batch_size] 87 | for k in range(int(np.ceil(len(idxs)/batch_size)))] 88 | for batch in batches: 89 | probs.append(self.sess.run(self.probs, {self.input_ph:X_test[batch]})) 90 | return np.log(np.clip(np.concatenate(probs), 1e-12, None)) 91 | 92 | def cost(self, X_test, y_test, batch_size=None): 93 | 94 | if batch_size is None: 95 | batch_size = self.batch_size 96 | with self.graph.as_default(): 97 | losss = [] 98 | idxs = np.arange(len(X_test)) 99 | batches = [idxs[k * batch_size: (k+1) * batch_size] 100 | for k in range(int(np.ceil(len(idxs)/batch_size)))] 101 | for batch in batches: 102 | losss.append(self.sess.run(self.prediction_loss, {self.input_ph:X_test[batch], 103 | self.labels:y_test[batch]})) 104 | return np.mean(losss) 105 | 106 | def predict(self, X_test, batch_size=None): 107 | 108 | if batch_size is None: 109 | batch_size = self.batch_size 110 | with self.graph.as_default(): 111 | predictions = [] 112 | idxs = np.arange(len(X_test)) 113 | batches = [idxs[k * batch_size: (k+1) * batch_size] 114 | for k in range(int(np.ceil(len(idxs)/batch_size)))] 115 | for batch in batches: 116 | predictions.append(self.sess.run(self.predictions, {self.input_ph:X_test[batch]})) 117 | return np.concatenate(predictions) 118 | 119 | def fit(self, X, y, X_val=None, y_val=None, sources=None, max_epochs=None, 120 | batch_size=None, save=False, load=False, sample_weight=None, 121 | metric='accuracy'): 122 | 123 | # self.num_classes = len(set(y_val)) 124 | self.metric = metric 125 | if max_epochs is None: 126 | max_epochs = self.max_epochs 127 | if batch_size is None: 128 | batch_size = self.batch_size 129 | if not self.is_built: 130 | self.graph = tf.Graph() 131 | with self.graph.as_default(): 132 | config = tf.ConfigProto() 133 | config.gpu_options.allow_growth=True 134 | self.sess = tf.Session(config=config) 135 | with self.graph.as_default(): 136 | tf.set_random_seed(self.random_seed) 137 | try: 138 | self.global_step = tf.train.create_global_step() 139 | except ValueError: 140 | self.global_step = tf.train.get_global_step() 141 | if not self.is_built: 142 | self._build_model(X, y) 143 | self.saver = tf.train.Saver() 144 | self._initialize() 145 | if len(X): 146 | if X_val is None and self.validation_fraction * len(X) > 2: 147 | X_train, X_val, y_train, y_val, sample_weight, _ = train_test_split( 148 | X, y, sample_weight, test_size=self.validation_fraction) 149 | else: 150 | X_train, y_train 
= X, y 151 | self._train_model(X_train, y_train, X_val=X_val, y_val=y_val, 152 | max_epochs=max_epochs, batch_size=batch_size, 153 | sources=sources, sample_weight=sample_weight) 154 | if save and self.address is not None: 155 | self.saver.save(self.sess, self.address) 156 | 157 | def _train_model(self, X, y, X_val, y_val, max_epochs, batch_size, 158 | sources=None, sample_weight=None): 159 | 160 | 161 | assert len(X)==len(y), 'Input and labels not the same size' 162 | self.history = {'metrics':[], 'idxs':[]} 163 | stop_counter = 0 164 | best_performance = None 165 | for epoch in range(max_epochs): 166 | vals_metrics, idxs = self._one_epoch( 167 | X, y, X_val, y_val, batch_size, sources=sources, sample_weight=sample_weight) 168 | self.history['idxs'].append(idxs) 169 | self.history['metrics'].append(vals_metrics) 170 | if self.early_stopping and X_val is not None: 171 | current_performance = np.mean(vals_metrics) 172 | if best_performance is None: 173 | best_performance = current_performance 174 | if current_performance > best_performance: 175 | best_performance = current_performance 176 | stop_counter = 0 177 | else: 178 | stop_counter += 1 179 | if stop_counter > self.early_stopping: 180 | break 181 | 182 | def _one_epoch(self, X, y, X_val, y_val, batch_size, sources=None, sample_weight=None): 183 | 184 | vals = [] 185 | if sources is None: 186 | if sample_weight is None: 187 | idxs = np.random.permutation(len(X)) 188 | else: 189 | idxs = np.random.choice(len(X), len(X), p=sample_weight/np.sum(sample_weight)) 190 | batches = [idxs[k*batch_size:(k+1) * batch_size] 191 | for k in range(int(np.ceil(len(idxs)/batch_size)))] 192 | idxs = batches 193 | else: 194 | idxs = np.random.permutation(len(sources.keys())) 195 | batches = [sources[i] for i in idxs] 196 | for batch_counter, batch in enumerate(batches): 197 | self.sess.run(self.train_op, 198 | {self.input_ph:X[batch], self.labels:y[batch], 199 | self.dropout_ph:self.dropout}) 200 | if X_val is not None: 201 | if self.metric=='accuracy': 202 | vals.append(self.score(X_val, y_val)) 203 | elif self.metric=='f1': 204 | vals.append(f1_score(y_val, self.predict(X_val))) 205 | elif self.metric=='auc': 206 | vals.append(roc_auc_score(y_val, self.predict_proba(X_val)[:,1])) 207 | elif self.metric=='xe': 208 | vals.append(-self.cost(X_val, y_val)) 209 | return np.array(vals), np.array(idxs) 210 | 211 | def _initialize(self): 212 | 213 | uninitialized_vars = [] 214 | if self.warm_start: 215 | for var in tf.global_variables(): 216 | try: 217 | self.sess.run(var) 218 | except tf.errors.FailedPreconditionError: 219 | uninitialized_vars.append(var) 220 | else: 221 | uninitialized_vars = tf.global_variables() 222 | self.sess.run(tf.initializers.variables(uninitialized_vars)) 223 | 224 | def _build_model(self, X, y): 225 | 226 | # self.num_classes = len(set(y)) 227 | if self.initializer is None: 228 | initializer = tf.initializers.variance_scaling(distribution='uniform') 229 | if self.activation is None: 230 | activation = lambda x: tf.nn.relu(x) 231 | self.input_ph = tf.placeholder(dtype=tf.float32, shape=(None,) + X.shape[1:], name='input') 232 | self.dropout_ph = tf.placeholder_with_default( 233 | tf.constant(0., dtype=tf.float32), shape=(), name='dropout') 234 | if self.mode=='regression': 235 | self.labels = tf.placeholder(dtype=tf.float32, shape=(None, ), name='label') 236 | else: 237 | self.labels = tf.placeholder(dtype=tf.int32, shape=(None, ), name='label') 238 | x = tf.reshape(self.input_ph, shape=(-1, np.prod(X.shape[1:]))) 239 | for
layer, hidden_unit in enumerate(self.hidden_units): 240 | with tf.variable_scope('dense_{}'.format(layer)): 241 | x = self._dense(x, hidden_unit, dropout=self.dropout_ph, 242 | initializer=self.initializer, activation=activation) 243 | with tf.variable_scope('final'): 244 | self.prelogits = x 245 | self._final_layer(self.prelogits, self.num_classes, self.mode) 246 | self._build_train_op() 247 | 248 | def _build_train_op(self): 249 | 250 | """Build training-specific ops for the graph.""" 251 | learning_rate = tf.constant(self.learning_rate, tf.float32) ##fixit 252 | trainable_variables = tf.trainable_variables() 253 | grads = tf.gradients(self.loss, trainable_variables) 254 | self.grad_flat = tf.concat([tf.reshape(grad, (-1, 1)) for grad in grads], axis=0) 255 | if self.optimizer == 'sgd': 256 | optimizer = tf.train.GradientDescentOptimizer(learning_rate) 257 | elif self.optimizer == 'mom': 258 | optimizer = tf.train.MomentumOptimizer(learning_rate, 0.9) 259 | elif self.optimizer == 'adam': 260 | optimizer = tf.train.AdamOptimizer(learning_rate) 261 | apply_op = optimizer.apply_gradients( 262 | zip(grads, trainable_variables), 263 | global_step=self.global_step, name='train_step') 264 | train_ops = [apply_op] + self._extra_train_ops + tf.get_collection(tf.GraphKeys.UPDATE_OPS) 265 | previous_ops = [tf.group(*train_ops)] 266 | with tf.control_dependencies(previous_ops): 267 | self.train_op = tf.no_op(name='train') 268 | self.is_built = True 269 | 270 | def _final_layer(self, x, num_classes, mode): 271 | 272 | if mode=='regression': 273 | self.logits = self._dense(x, 1, dropout=self.dropout_ph) 274 | self.predictions = tf.reduce_sum(self.logits, axis=-1) 275 | regression_loss = tf.nn.l2_loss(self.predictions - self.labels) ##FIXIT 276 | self.prediction_loss = tf.reduce_mean(regression_loss, name='l2') 277 | residuals = self.predictions - self.labels 278 | var_predicted = tf.reduce_mean(residuals**2) - tf.reduce_mean(residuals)**2 279 | var_labels = tf.reduce_mean(self.labels**2) - tf.reduce_mean(self.labels)**2 280 | self.prediction_score = 1 - var_predicted/(var_labels + 1e-12) 281 | else: 282 | self.logits = self._dense(x, num_classes, dropout=self.dropout_ph) 283 | self.probs = tf.nn.softmax(self.logits) 284 | xent_loss = tf.nn.sparse_softmax_cross_entropy_with_logits( 285 | logits=self.logits, labels=tf.cast(self.labels, tf.int32)) 286 | self.prediction_loss = tf.reduce_mean(xent_loss, name='xent') 287 | self.predictions = tf.argmax(self.probs, axis=-1, output_type=tf.int32) 288 | correct_predictions = tf.equal(self.predictions, self.labels) 289 | self.prediction_score = tf.reduce_mean(tf.cast(correct_predictions, tf.float32)) 290 | self.loss = self.prediction_loss + self._reg_loss() 291 | 292 | def _dense(self, x, out_dim, dropout=tf.constant(0.), initializer=None, activation=None): 293 | 294 | if initializer is None: 295 | initializer = tf.initializers.variance_scaling(distribution='uniform') 296 | w = tf.get_variable('DW', [x.get_shape()[1], out_dim], initializer=initializer) 297 | b = tf.get_variable('Db', [out_dim], initializer=tf.constant_initializer()) 298 | x = tf.nn.dropout(x, 1.
- dropout) 299 | if activation: 300 | x = activation(x) 301 | return tf.nn.xw_plus_b(x, w, b) 302 | 303 | def _reg_loss(self, order=2): 304 | """Regularization loss for weight decay.""" 305 | losss = [] 306 | for var in tf.trainable_variables(): 307 | if var.op.name.find(r'DW') > 0 or var.op.name.find(r'CW') > 0: ##FIXIT 308 | if order==2: 309 | losss.append(tf.nn.l2_loss(var)) 310 | elif order==1: 311 | losss.append(tf.abs(var)) 312 | else: 313 | raise ValueError("Invalid regularization order!") 314 | return tf.multiply(self.weight_decay, tf.add_n(losss)) 315 | 316 | 317 | class CShapNN(ShapNN): 318 | 319 | def __init__(self, mode, hidden_units=[100], kernel_sizes=[], 320 | strides=None, channels=[], learning_rate=0.001, 321 | dropout = 0., activation=None, initializer=None, global_averaging=False, 322 | weight_decay=0.0001, optimizer='adam', batch_size=128, 323 | warm_start=False, max_epochs=100, validation_fraction=0.1, 324 | early_stopping=0, address=None, test_batch_size=1000, random_seed=666): 325 | 326 | self.mode = mode 327 | self.test_batch_size = test_batch_size 328 | self.kernels = []#FIXIT 329 | self.kernel_sizes = kernel_sizes 330 | self.channels = channels 331 | self.global_averaging = global_averaging 332 | assert len(channels)==len(kernel_sizes), 'Invalid channels or kernel_sizes' 333 | if strides is None: 334 | self.strides = [1] * len(kernel_sizes) 335 | else: 336 | self.strides = strides 337 | self.batch_size = batch_size 338 | self.hidden_units = hidden_units 339 | self.initializer = initializer 340 | self.activation = activation 341 | self.dropout = dropout 342 | self.weight_decay = weight_decay 343 | self.optimizer = optimizer 344 | self.learning_rate = learning_rate 345 | self.warm_start = warm_start 346 | self.max_epochs = max_epochs 347 | self.early_stopping = early_stopping 348 | self.validation_fraction = validation_fraction 349 | self.address = address 350 | self._extra_train_ops = [] 351 | self.random_seed = random_seed 352 | self.graph = tf.Graph() 353 | self.is_built = False 354 | with self.graph.as_default(): 355 | config = tf.ConfigProto() 356 | config.gpu_options.allow_growth=True 357 | self.sess = tf.Session(config=config) 358 | 359 | def _conv(self, x, filter_size, out_filters, strides, activation=None): 360 | 361 | in_filters = int(x.get_shape()[-1]) 362 | n = filter_size * filter_size * out_filters 363 | kernel = tf.get_variable( 364 | 'DW', [filter_size, filter_size, in_filters, out_filters], 365 | tf.float32, initializer=tf.random_normal_initializer( 366 | stddev=np.sqrt(2.0/n))) 367 | self.kernels.append(kernel) 368 | x = tf.nn.conv2d(x, kernel, strides, padding='SAME') 369 | if activation: 370 | x = activation(x) 371 | return x 372 | 373 | def _stride_arr(self, stride): 374 | 375 | if isinstance(stride, int): 376 | return [1, stride, stride, 1] 377 | if len(stride)==2: 378 | return [1, stride[0], stride[1], 1] 379 | if len(stride)==4: 380 | return stride 381 | raise ValueError('Invalid value!') 382 | 383 | def _build_model(self, X, y): 384 | 385 | 386 | if self.initializer is None: 387 | initializer = tf.initializers.variance_scaling(distribution='uniform') 388 | if self.activation is None: 389 | activation = lambda x: tf.nn.relu(x) 390 | self.input_ph = tf.placeholder(dtype=tf.float32, shape=(None,) + X.shape[1:], name='input') 391 | self.dropout_ph = tf.placeholder_with_default( 392 | tf.constant(0., dtype=tf.float32), shape=(), name='dropout') 393 | if self.mode=='regression': 394 | self.labels = tf.placeholder(dtype=tf.float32, shape=(None, ), 
name='label') 395 | else: 396 | self.labels = tf.placeholder(dtype=tf.int32, shape=(None, ), name='label') 397 | if len(X.shape[1:]) == 2: 398 | x = tf.reshape(self.input_ph, [-1, X.shape[0], X.shape[1], 1]) 399 | else: 400 | x = self.input_ph 401 | for layer, (kernel_size, channels, stride) in enumerate(zip( 402 | self.kernel_sizes, self.channels, self.strides)): 403 | with tf.variable_scope('conv_{}'.format(layer)): 404 | x = self._conv(x, kernel_size, channels, self._stride_arr(stride), activation=activation) 405 | if self.global_averaging: 406 | x = tf.reduce_mean(x, axis=(1,2)) 407 | else: 408 | x = tf.reshape(x, shape=(-1, np.prod(x.get_shape()[1:]))) 409 | for layer, hidden_unit in enumerate(self.hidden_units): 410 | with tf.variable_scope('dense_{}'.format(layer)): 411 | x = self._dense(x, hidden_unit, dropout=self.dropout_ph, 412 | initializer=self.initializer, activation=activation) 413 | 414 | with tf.variable_scope('final'): 415 | self.prelogits = x 416 | self._final_layer(self.prelogits, len(set(y)), self.mode) 417 | self._build_train_op() 418 | -------------------------------------------------------------------------------- /use_case/DataAcquisition/utils.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import random 4 | import sys 5 | import time 6 | import numpy as np 7 | import tensorflow as tf 8 | import matplotlib.pyplot as plt 9 | from tqdm import tqdm_notebook 10 | from collections import OrderedDict 11 | from scipy.misc import toimage 12 | import torch 13 | import cv2 14 | import acoustics 15 | import h5py 16 | from torch.utils.data import Dataset, DataLoader 17 | from skimage import io, transform 18 | from torchvision import transforms, utils 19 | import torch.optim as optim 20 | import torch.nn as nn 21 | import glob 22 | import re 23 | from PIL import Image 24 | 25 | 26 | class CelebaDataset(Dataset): 27 | def __init__(self, label_file, root_dir, transform=None): 28 | # root_dir = "data/celeba/img_align_celeba/" 29 | self.labels, self.image_idxs = self.load_labels(label_file) 30 | self.root_dir = root_dir 31 | self.transform = transform 32 | 33 | def __len__(self): 34 | return len(self.labels) 35 | 36 | def __getitem__(self, idx): 37 | if torch.is_tensor(idx): 38 | idx = idx.tolist() 39 | img_name = os.path.join(self.root_dir, self.image_idxs[idx]) 40 | image = io.imread(img_name) 41 | # image = cv2.resize(image, (299, 299), interpolation=cv2.INTER_CUBIC) 42 | sample = {'image': image, 'label': self.labels[idx]} 43 | if self.transform: 44 | sample = self.transform(sample) 45 | return sample 46 | 47 | def load_labels(self, label_file): 48 | # label_file="list_attr_celeba.csv" 49 | dir_anno = "data/celeba/" 50 | file = open(dir_anno + label_file, 'r') 51 | texts = file.read().split("\n") 52 | file.close() 53 | col_names = texts[0].split(",") 54 | Male_idx = col_names.index("Male") 55 | gender_list = [] 56 | image_index_list = [] 57 | for txt in texts[1:]: 58 | image_index_list.append(txt.split(',')[0]) 59 | if txt.split(',')[Male_idx] == '1': 60 | gender_list.append(np.array(1)) 61 | elif txt.split(',')[Male_idx] == '-1': 62 | gender_list.append(np.array(0)) 63 | print(gender_list[:5], len(gender_list)) 64 | gener_list = np.array(gender_list) 65 | return gender_list, image_index_list 66 | 67 | class Rescale(object): 68 | """Rescale the image in a sample to a given size. 69 | 70 | Args: 71 | output_size (tuple or int): Desired output size. If tuple, output is 72 | matched to output_size. 
If int, smaller of image edges is matched 73 | to output_size keeping aspect ratio the same. 74 | """ 75 | def __init__(self, output_size): 76 | assert isinstance(output_size, (int, tuple)) 77 | self.output_size = output_size 78 | def __call__(self, sample): 79 | image, labels = sample['image'], sample['label'] 80 | h, w = image.shape[:2] 81 | if isinstance(self.output_size, int): 82 | if h > w: 83 | new_h, new_w = self.output_size * h / w, self.output_size 84 | else: 85 | new_h, new_w = self.output_size, self.output_size * w / h 86 | else: 87 | new_h, new_w = self.output_size 88 | 89 | new_h, new_w = int(new_h), int(new_w) 90 | img = transform.resize(image, (new_h, new_w)) 91 | # h and w are swapped for landmarks because for images, 92 | # x and y axes are axis 1 and 0 respectively 93 | return {'image': img, 'label': labels} 94 | 95 | class ToTensor(object): 96 | """Convert ndarrays in sample to Tensors.""" 97 | def __call__(self, sample): 98 | image, labels = sample['image'], sample['label'] 99 | 100 | # swap color axis because 101 | # numpy image: H x W x C 102 | # torch image: C X H X W 103 | image = image.transpose((2, 0, 1)) 104 | return {'image': torch.from_numpy(image), 105 | 'label': torch.from_numpy(labels)} 106 | 107 | class MNIST(): 108 | def __init__(self, one_hot=True, shuffle=False, by_label=False): 109 | self.x_train, self.y_train, self.x_test, self.y_test = self.load_data(one_hot, by_label) 110 | self.num_train = self.x_train.shape[0] 111 | self.num_test = self.x_test.shape[0] 112 | if shuffle: self.shuffle_data() 113 | 114 | def load_data(self, one_hot, by_label): 115 | mnist = tf.keras.datasets.mnist 116 | (x_train, y_train), (x_test, y_test) = mnist.load_data() 117 | x_train = np.reshape(x_train, [-1, 28, 28, 1]) 118 | x_train = x_train.astype(np.float32) / 255 119 | x_test = np.reshape(x_test, [-1, 28, 28, 1]) 120 | x_test = x_test.astype(np.float32) / 255 121 | 122 | if by_label: 123 | ind_train = np.argsort(y_train) 124 | ind_test = np.argsort(y_test) 125 | x_train, y_train = x_train[ind_train], y_train[ind_train] 126 | x_test, y_test = x_test[ind_test], y_test[ind_test] 127 | 128 | 129 | if one_hot: 130 | # convert to one-hot labels 131 | y_train = tf.keras.utils.to_categorical(y_train) 132 | y_test = tf.keras.utils.to_categorical(y_test) 133 | 134 | return x_train, y_train, x_test, y_test 135 | 136 | 137 | def shuffle_data(self): 138 | ind = np.random.permutation(self.num_train) 139 | self.x_train, self.y_train = self.x_train[ind], self.y_train[ind] 140 | 141 | 142 | class CIFAR10(): 143 | def __init__(self, one_hot=True, shuffle=False): 144 | self.x_train, self.y_train, self.x_test, self.y_test = self.load_data(one_hot) 145 | self.num_train = self.x_train.shape[0] 146 | self.num_test = self.x_test.shape[0] 147 | 148 | if shuffle: self.shuffle_data() 149 | 150 | def load_data(self, one_hot): 151 | cifar = tf.keras.datasets.cifar10 152 | (x_train, y_train), (x_test, y_test) = cifar.load_data() 153 | # x_train.shape = (50000, 32, 32, 3), range = [0, 255] 154 | # y_train.shape = (50000, 1) 155 | 156 | y_train = np.squeeze(y_train) 157 | y_test = np.squeeze(y_test) 158 | x_train = x_train.astype(np.float32) / 255 159 | x_test = x_test.astype(np.float32) / 255 160 | 161 | if one_hot: 162 | # convert to one-hot labels 163 | y_train = tf.keras.utils.to_categorical(y_train) 164 | y_test = tf.keras.utils.to_categorical(y_test) 165 | 166 | return x_train, y_train, x_test, y_test 167 | 168 | 169 | def shuffle_data(self): 170 | ind = np.random.permutation(self.num_train) 171 | 
self.x_train, self.y_train = self.x_train[ind], self.y_train[ind] 172 | 173 | 174 | 175 | class Logger: 176 | def __init__(self, name='model', fmt=None, base="./logs"): 177 | self.handler = True 178 | self.scalar_metrics = OrderedDict() 179 | self.fmt = fmt if fmt else dict() 180 | if not os.path.exists(base): os.makedirs(base) 181 | self.path = os.path.join(base, name + "_" + str(time.time())) 182 | self.logs = self.path + '.csv' 183 | self.output = self.path + '.out' 184 | 185 | 186 | def prin(*args): 187 | str_to_write = ' '.join(map(str, args)) 188 | with open(self.output, 'a') as f: 189 | f.write(str_to_write + '\n') 190 | f.flush() 191 | 192 | print(str_to_write) 193 | sys.stdout.flush() 194 | 195 | self.print = prin 196 | 197 | def add_scalar(self, t, key, value): 198 | if key not in self.scalar_metrics: 199 | self.scalar_metrics[key] = [] 200 | self.scalar_metrics[key] += [(t, value)] 201 | 202 | def iter_info(self, order=None): 203 | names = list(self.scalar_metrics.keys()) 204 | if order: 205 | names = order 206 | values = [self.scalar_metrics[name][-1][1] for name in names] 207 | t = int(np.max([self.scalar_metrics[name][-1][0] for name in names])) 208 | fmt = ['%s'] + [self.fmt[name] if name in self.fmt else '.1f' for name in names] 209 | 210 | if self.handler: 211 | self.handler = False 212 | self.print(tabulate([[t] + values], ['epoch'] + names, floatfmt=fmt)) 213 | else: 214 | self.print(tabulate([[t] + values], ['epoch'] + names, tablefmt='plain', floatfmt=fmt).split('\n')[1]) 215 | 216 | def save(self): 217 | result = None 218 | for key in self.scalar_metrics.keys(): 219 | if result is None: 220 | result = DataFrame(self.scalar_metrics[key], columns=['t', key]).set_index('t') 221 | else: 222 | df = DataFrame(self.scalar_metrics[key], columns=['t', key]).set_index('t') 223 | result = result.join(df, how='outer') 224 | result.to_csv(self.logs) 225 | 226 | self.print('The log/output have been saved to: ' + self.path + ' + .csv/.out') 227 | 228 | class ImageNet(): 229 | def __init__(self, path, one_hot=True, shuffle=False): 230 | self.x_train, self.y_train, self.x_test, self.y_test = self.load_data(path, one_hot) 231 | self.num_train = self.x_train.shape[0] 232 | self.num_test = self.x_test.shape[0] 233 | if shuffle: self.shuffle_data() 234 | 235 | 236 | def load_data(self, path, one_hot): 237 | dog_fish = np.load(os.path.join(path, 'dataset_dog-fish_train-900_test-300.npz')) 238 | x_test = dog_fish[dog_fish.files[0]] 239 | x_train = dog_fish[dog_fish.files[1]] 240 | y_train = dog_fish[dog_fish.files[2]] 241 | y_test = dog_fish[dog_fish.files[3]] 242 | 243 | 244 | if one_hot: 245 | # convert to one-hot labels 246 | y_train = tf.keras.utils.to_categorical(y_train) 247 | y_test = tf.keras.utils.to_categorical(y_test) 248 | return x_train, y_train, x_test, y_test 249 | 250 | def shuffle_data(self): 251 | ind = np.random.permutation(self.num_train) 252 | self.x_train, self.y_train = self.x_train[ind], self.y_train[ind] 253 | 254 | def add_noise(data, bs, target_snr, noise_type): 255 | if noise_type == 'white': 256 | noise = acoustics.generator.white(bs*28*28).reshape(28, 28, bs) 257 | if noise_type == 'pink': 258 | noise = acoustics.generator.pink(bs*28*28).reshape(28, 28, bs) 259 | if noise_type == 'Violet': 260 | noise = acoustics.generator.violet(bs*28*28).reshape(28, 28, bs) 261 | 262 | 263 | 264 | print ('data shape = ', data.shape) 265 | average = np.mean(data) 266 | std = np.std(noise) 267 | current_snr = average/std 268 | noise = noise * (current_snr/ target_snr) 269 | 
data = data + noise 270 | return data 271 | 272 | def test_mnist(): 273 | print ("Testing MNIST dataloader...") 274 | data = MNIST() 275 | print (data.x_train.shape, data.y_train.shape, data.x_test.shape, data.y_test.shape) 276 | data = MNIST(one_hot=False) 277 | print (data.x_train.shape, data.y_train.shape, data.x_test.shape, data.y_test.shape) 278 | print (data.y_train[0:10]) 279 | data = MNIST(shuffle=True, one_hot=False) 280 | print (data.x_train.shape, data.y_train.shape, data.x_test.shape, data.y_test.shape) 281 | print (data.y_train[0:10]) 282 | data = MNIST(one_hot=False) 283 | fig=plt.figure(figsize=(8, 8)) 284 | for i in range(1, 6): 285 | # img = data.x_train[i].reshape(1,28,28).transpose([1, 2, 0]) 286 | 287 | img = data.x_train[i] 288 | img = add_noise(img, 1, 0.2, 'white') 289 | fig.add_subplot( 1, 5, i) 290 | plt.imshow(img.squeeze()) 291 | plt.show() 292 | 293 | def test_cifar10(): 294 | print ("Testing CIFAR10 dataloader...") 295 | data = CIFAR10() 296 | print (data.x_train.shape, data.y_train.shape, data.x_test.shape, data.y_test.shape) 297 | data = CIFAR10(one_hot=False) 298 | print (data.x_train.shape, data.y_train.shape, data.x_test.shape, data.y_test.shape) 299 | print (data.y_train[0:10]) 300 | data = CIFAR10(shuffle=True, one_hot=False) 301 | print (data.x_train.shape, data.y_train.shape, data.x_test.shape, data.y_test.shape) 302 | print (data.y_train[0:10]) 303 | fig=plt.figure(figsize=(8, 8)) 304 | for i in range(1, 6): 305 | img = data.x_train[i] * 255 306 | fig.add_subplot( 1, 5, i) 307 | plt.imshow(img.astype(np.uint8)) 308 | plt.show() 309 | 310 | 311 | def test_imagenet(): 312 | print("Testing ImageNet dataloader...") 313 | data = ImageNet('./data') 314 | print (data.x_train.shape, data.y_train.shape, data.x_test.shape, data.y_test.shape) 315 | data = ImageNet(path='./data', one_hot=False) 316 | print (data.x_train.shape, data.y_train.shape, data.x_test.shape, data.y_test.shape) 317 | print (data.y_train[0:10]) 318 | data = ImageNet(path='./data', shuffle=True, one_hot=False) 319 | print (data.x_train.shape, data.y_train.shape, data.x_test.shape, data.y_test.shape) 320 | print (data.y_train[0:10]) 321 | fig=plt.figure(figsize=(8, 8)) 322 | for i in range(1, 6): 323 | # img = data.x_train[i].reshape(3,299,299).transpose(1,2,0).astype("float") 324 | img = data.x_train[i] * -255 325 | fig.add_subplot( 1, 5, i) 326 | plt.imshow((img.squeeze()* 255).astype(np.uint8), interpolation='nearest') 327 | plt.show() 328 | 329 | 330 | def train(model, device, x_train, y_train, batch_size, optimizer, criterion, n_epochs): 331 | model.train() 332 | for epoch in tqdm_notebook(range(n_epochs), desc = 'Epochs'): 333 | # print("epoch model.fc.weight:") 334 | # print(epoch, model.fc.weight) 335 | for X, y in batch(x_train, y_train, batch_size): 336 | X, y = X.to(device).float(), y.to(device) 337 | # print(X.shape, y.shape) 338 | optimizer.zero_grad() 339 | # y_pred = model(X) 340 | *_, y_pred = model(X) 341 | loss = criterion(y_pred, y) 342 | loss.backward() 343 | # for param in model.parameters(): 344 | # print(param.grad.data.sum()) 345 | optimizer.step() 346 | # if(n_epochs > 4): 347 | # if(epoch % int(n_epochs/4) == 0): 348 | # print(f'Train epoch {epoch}: Loss: {loss.item():7.4f}') 349 | 350 | def evaluate(model, device, x_test, y_test, batch_size, criterion): 351 | model.eval() 352 | test_set_size = len(x_test) 353 | correct_answers = 0 354 | sum_loss = 0 355 | with torch.no_grad(): 356 | for X, y in batch(x_test, y_test, batch_size): 357 | X, y = X.to(device).float(), 
y.to(device) 358 | *_, y_pred = model(X) 359 | # y_pred = model(X) 360 | 361 | class_pred = y_pred.argmax(dim = 1) 362 | correct_answers += (y == class_pred).float().sum().item() 363 | sum_loss += criterion(y_pred, y).item() 364 | accuracy = correct_answers / test_set_size 365 | average_loss = sum_loss / len(x_test) 366 | 367 | return accuracy, average_loss 368 | 369 | def evaluate_adv(model, device, x_test, y_test, batch_size, criterion): 370 | model.eval() 371 | test_set_size = len(x_test) 372 | correct_answers = 0 373 | sum_loss = 0 374 | idx = 0 375 | idxs = [] 376 | falses = [] 377 | ground_truths = [] 378 | with torch.no_grad(): 379 | for X, y in tqdm_notebook(batch(x_test, y_test, batch_size), total = int(len(x_test)/batch_size)): 380 | X, y = X.to(device).float(), y.to(device) 381 | *_, y_pred = model(X) 382 | class_pred = y_pred.argmax(dim = 1) 383 | correct_answers += (y == class_pred).float().sum().item() 384 | # print(y) 385 | # print(class_pred) 386 | if( y != class_pred): 387 | idxs.append(idx) 388 | falses.append(class_pred) 389 | ground_truths.append(y) 390 | idx += 1 391 | sum_loss += criterion(y_pred, y).item() 392 | accuracy = correct_answers / test_set_size 393 | average_loss = sum_loss / len(x_test) 394 | 395 | return accuracy, average_loss, falses, ground_truths, idxs 396 | 397 | def knn_shapley(K, trainX, valX, trainy, valy): 398 | N = trainX.shape[0] 399 | M = valX.shape[0] 400 | c = 1 401 | # value = np.zeros(N) 402 | value = [[] for i in range(N) ] 403 | scores = [] 404 | false_result_idxs = [] 405 | for i in tqdm_notebook(range(M), total=M, leave=False): 406 | X = valX[i] 407 | y = valy[i] 408 | 409 | s = np.zeros(N) 410 | diff = (trainX - X).reshape(N, -1) # calculate the distances between valX and every trainX data point 411 | dist = np.einsum('ij, ij->i', diff, diff) # output the sum distance 412 | idx = np.argsort(dist) # ascend the distance 413 | ans = trainy[idx] 414 | 415 | # calculate test performance 416 | score = 0.0 417 | 418 | for j in range(min(K, N)): 419 | score += float(ans[j] == y) 420 | if(score > min(K, N)/2): 421 | scores.append(1) 422 | else: 423 | scores.append(0) 424 | false_result_idxs.append(i) 425 | 426 | s[idx[N - 1]] = float(ans[N - 1] == y)*c / N 427 | cur = N - 2 428 | for j in range(N - 1): 429 | s[idx[cur]] = s[idx[cur + 1]] + float(int(ans[cur] == y) - int(ans[cur + 1] == y))*c / K * (min(cur, K - 1) + 1) / (cur + 1) 430 | cur -= 1 431 | 432 | for j in range(N): 433 | value[j].append(s[j]) 434 | # for i in range(N): 435 | # value[i] /= M 436 | return value, np.mean(scores), false_result_idxs 437 | 438 | def old_knn_shapley(K, trainX, valX, trainy, valy): 439 | N = trainX.shape[0] 440 | M = valX.shape[0] 441 | c = 1 442 | value = np.zeros(N) 443 | # value = [[] for i in range(N) ] 444 | scores = [] 445 | false_result_idxs = [] 446 | for i in tqdm_notebook(range(M), total=M, leave=False): 447 | X = valX[i] 448 | y = valy[i] 449 | 450 | s = np.zeros(N) 451 | diff = (trainX - X).reshape(N, -1) # calculate the distances between valX and every trainX data point 452 | dist = np.einsum('ij, ij->i', diff, diff) # output the sum distance 453 | idx = np.argsort(dist) # ascend the distance 454 | ans = trainy[idx] 455 | 456 | # calculate test performance 457 | score = 0.0 458 | 459 | for j in range(min(K, N)): 460 | score += float(ans[j] == y) 461 | if(score > min(K, N)/2): 462 | scores.append(1) 463 | else: 464 | scores.append(0) 465 | false_result_idxs.append(i) 466 | 467 | s[idx[N - 1]] = float(ans[N - 1] == y)*c / N 468 | cur = N - 2 469 | 
for j in range(N - 1): 470 | s[idx[cur]] = s[idx[cur + 1]] + float(int(ans[cur] == y) - int(ans[cur + 1] == y))*c / K * (min(cur, K - 1) + 1) / (cur + 1) 471 | cur -= 1 472 | 473 | for j in range(N): 474 | value[j] += s[j] 475 | for i in range(N): 476 | value[i] /= M 477 | return value, np.mean(scores), false_result_idxs 478 | 479 | 480 | 481 | def loo_knn_shapley(K, trainX, valX, trainy, valy): 482 | N = trainX.shape[0] 483 | M = valX.shape[0] 484 | value = np.zeros(N) 485 | scores = [] 486 | false_result_idxs = [] 487 | for i in tqdm_notebook(range(M), total=M, leave=False): 488 | X = valX[i] 489 | y = valy[i] 490 | 491 | s = np.zeros(N) 492 | diff = (trainX - X).reshape(N, -1) # calculate the distances between valX and every trainX data point 493 | dist = np.einsum('ij, ij->i', diff, diff) # output the sum distance 494 | idx = np.argsort(dist) # ascend the distance 495 | ans = trainy[idx] 496 | # print(y, ans[:10]) 497 | 498 | # calculate test performance 499 | score = 0.0 500 | 501 | for j in range(min(K, N)): 502 | score += float(ans[j] == y) 503 | if(score > min(K, N)/2): 504 | scores.append(1) 505 | else: 506 | scores.append(0) 507 | false_result_idxs.append(i) 508 | 509 | ### calculate LOO KNN values and do not concern the situation that K > N 510 | for j in range(N): 511 | if j in idx[:K]: 512 | # print(int(ans[j] == y), int(ans[K] == y)) 513 | # print(y, j, ans[j], K, ans[K]) 514 | s[j] = float(int(trainy[j] == y) - int(trainy[K] == y)) / K 515 | else: 516 | s[j] = 0 517 | 518 | 519 | for j in range(N): 520 | value[j] += s[j] 521 | for i in range(N): 522 | value[i] /= M 523 | return value, np.mean(scores), false_result_idxs 524 | 525 | 526 | 527 | def batch(x_batch, y_batch, batch_size=1): 528 | l = len(x_batch) 529 | for ndx in range(0, l, batch_size): 530 | yield x_batch[ndx:min(ndx + batch_size, l)], y_batch[ndx:min(ndx + batch_size, l)] 531 | 532 | def print_img(img): 533 | plt.imshow(img.squeeze()) 534 | plt.show() 535 | 536 | def resize_and_scale(img, size, scale): 537 | img = cv2.resize(img, size) 538 | return 1 - np.array(img, "float32")/scale 539 | 540 | def h5load(path): 541 | # data means x, target means y 542 | if(os.path.exists(path)): 543 | with h5py.File(path, 'r') as hf: 544 | X_tr = hf.get('data')[:] 545 | y_tr = hf.get('target')[:] 546 | return X_tr, y_tr 547 | 548 | def h5save(path, x, y): 549 | if(os.path.exists(path)): 550 | print("Already existed") 551 | return 552 | else: 553 | with h5py.File(path, 'w') as hf: 554 | hf.create_dataset("data", data=x, compression="gzip", compression_opts=9) 555 | print("Data saved!") 556 | hf.create_dataset("target", data=y, compression="gzip", compression_opts=9) 557 | print("Target saved!") 558 | 559 | return 560 | 561 | def cw_l2_attack(model, images, labels, device, targeted=False, c=1e-4, kappa=1, max_iter=1000, learning_rate=0.01) : 562 | images = images.to(device) 563 | labels = labels.to(device) 564 | # Define f-function 565 | def f(x) : 566 | *_, outputs = model(x) 567 | one_hot_labels = torch.eye(len(outputs[0]))[labels].to(device) 568 | i, _ = torch.max((1-one_hot_labels)*outputs, dim=1) 569 | j = torch.masked_select(outputs, one_hot_labels.byte()) 570 | # print(i,j) 571 | # If targeted, optimize for making the other class most likely 572 | if targeted : 573 | return torch.clamp(i-j, min=-kappa) 574 | # If untargeted, optimize for making the other class most likely 575 | else : 576 | return torch.clamp(j-i, min=-kappa) 577 | w = torch.zeros_like(images, requires_grad=True).to(device) 578 | optimizer = 
optim.Adam([w], lr=learning_rate) 579 | prev = 1e10 580 | for step in range(max_iter) : 581 | a = 1/2*(nn.Tanh()(w) + 1) 582 | loss1 = nn.MSELoss(reduction='sum')(a, images) 583 | loss2 = torch.sum(c*f(a)) 584 | cost = loss1 + loss2 585 | optimizer.zero_grad() 586 | cost.backward() 587 | # print(cost) 588 | optimizer.step() 589 | # Early Stop when loss does not converge. 590 | if step % (max_iter//10) == 0 : 591 | if cost > prev : 592 | print('Attack Stopped due to CONVERGENCE....') 593 | return a 594 | prev = cost 595 | print('- Learning Progress : %2.2f %% ' %((step+1)/max_iter*100), end='\r') 596 | attack_images = 1/2*(nn.Tanh()(w) + 1) 597 | return attack_images 598 | 599 | def load_filenames_labels(mode): 600 | """Gets filenames and labels 601 | Args: 602 | mode: 'train' or 'val' 603 | (Directory structure and file naming different for 604 | train and val datasets) 605 | Returns: 606 | list of tuples: (jpeg filename with path, label) 607 | """ 608 | label_dict, class_description = build_label_dicts() 609 | filenames_labels = [] 610 | if mode == 'train': 611 | filenames = glob.glob('data/tiny-imagenet-200/train/*/images/*.JPEG') 612 | for filename in filenames: 613 | match = re.search(r'n\d+', filename) 614 | label = str(label_dict[match.group()]) 615 | filenames_labels.append((filename, label)) 616 | elif mode == 'val': 617 | with open('data/tiny-imagenet-200/val/val_annotations.txt', 'r') as f: 618 | for line in f.readlines(): 619 | split_line = line.split('\t') 620 | filename = 'data/tiny-imagenet-200/val/images/' + split_line[0] 621 | label = str(label_dict[split_line[1]]) 622 | filenames_labels.append((filename, label)) 623 | 624 | return filenames_labels 625 | 626 | def build_label_dicts(): 627 | """Build look-up dictionaries for class label, and class description 628 | Class labels are 0 to 199 in the same order as 629 | tiny-imagenet-200/wnids.txt. Class text descriptions are from 630 | tiny-imagenet-200/words.txt 631 | Returns: 632 | tuple of dicts 633 | label_dict: 634 | keys = synset (e.g. "n01944390") 635 | values = class integer {0 .. 199} 636 | class_desc: 637 | keys = class integer {0 .. 199} 638 | values = text description from words.txt 639 | """ 640 | label_dict, class_description = {}, {} 641 | with open('data/tiny-imagenet-200/wnids.txt', 'r') as f: 642 | for i, line in enumerate(f.readlines()): 643 | synset = line[:-1] # remove \n 644 | label_dict[synset] = i 645 | with open('data/tiny-imagenet-200/words.txt', 'r') as f: 646 | for i, line in enumerate(f.readlines()): 647 | synset, desc = line.split('\t') 648 | desc = desc[:-1] # remove \n 649 | if synset in label_dict: 650 | class_description[label_dict[synset]] = desc 651 | 652 | return label_dict, class_description 653 | 654 | def load_tinyImagenet(dataset): 655 | dim = np.zeros((64,64)) 656 | imgs = [] 657 | labels = [] 658 | for path, label in dataset: 659 | img=np.array(Image.open(path)) /255.0 660 | # print(path, len(img.shape)) 661 | if(len(img.shape) != 3): 662 | img = np.stack((img, dim, dim), axis=2) 663 | imgs.append(img) 664 | labels.append(int(label)) 665 | return imgs, labels --------------------------------------------------------------------------------