├── result.png
├── use_case
│   ├── Noisy Label, Watermarking
│   │   ├── models
│   │   │   ├── __init__.py
│   │   │   └── resnet.py
│   │   ├── pytorch_fitmodule
│   │   │   ├── __init__.py
│   │   │   ├── utils.py
│   │   │   └── fit_module.py
│   │   ├── PlotRuntime.py
│   │   ├── Label.py
│   │   ├── Watermark.py
│   │   ├── PlotAccuracy.py
│   │   ├── PlotLabel.py
│   │   ├── Poisoning.py
│   │   ├── runtime.py
│   │   ├── PlotPoisoning.py
│   │   ├── shap_utils.py
│   │   └── Shapley.py
│   └── DataAcquisition
│       ├── dknn.py
│       ├── uci_knn.py
│       ├── shap_utils.py
│       ├── Shapley.py
│       └── utils.py
├── .gitignore
├── README.md
├── exact_sp.py
├── exact_sp_example.py
├── LSH_sp_example.py
├── LSH_sp.py
└── reproduction
    ├── Cifar10
    │   └── accuracy.ipynb
    ├── YFCC100M
    │   └── testlsh.ipynb
    └── ImageNet
        └── accuracy.ipynb

/result.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI-secure/KNN-PVLDB/HEAD/result.png
--------------------------------------------------------------------------------
/use_case/Noisy Label, Watermarking/models/__init__.py:
--------------------------------------------------------------------------------
1 | from .resnet import *
2 | 
--------------------------------------------------------------------------------
/use_case/Noisy Label, Watermarking/pytorch_fitmodule/__init__.py:
--------------------------------------------------------------------------------
1 | from .fit_module import FitModule
2 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Created by .ignore support plugin (hsz.mobi)
2 | ### Example user template template
3 | ### Example user template
4 | 
5 | # IntelliJ project files
6 | .idea
7 | *.iml
8 | out
9 | gen
10 | __pycache__
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Data Valuation
2 | 
3 | This repo is the official code base for the PVLDB paper "Efficient task-specific data valuation for nearest neighbor algorithms".
4 | 
5 | -----
6 | 
7 | It contains scripts to calculate the exact Shapley value (in `exact_sp.py`) and the LSH-based approximate Shapley value (in `LSH_sp.py`) for KNN classifiers.
8 | 
9 | We also provide two examples of how to calculate the exact Shapley value (in `exact_sp_example.py`) and the approximate Shapley value (in `LSH_sp_example.py`) on the Cifar-10 dataset.
10 | 
11 | In the reproduction folder, we provide our Jupyter notebook scripts for three datasets (Cifar-10, ImageNet, and YFCC100M), which record our experiment results, to help reproduce our experiments.
12 | 
13 | For example:
14 | ![result](result.png)
15 | 
16 | If you have any questions about our code, please do not hesitate to ask in the issues. Thanks!
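
As a quick sketch of how the exact routine is used (on toy random features rather than the real Cifar-10 features; `exact_sp_example.py` below is the full example), it is just two calls: rank the training points by distance to each test point, then apply the closed-form KNN-Shapley recursion from `exact_sp.py`:

```python
# Minimal sketch with toy data; exact_sp_example.py is the full Cifar-10 example.
import numpy as np
from exact_sp import get_true_KNN, compute_single_unweighted_knn_class_shapley

rng = np.random.RandomState(0)
x_trn, y_trn = rng.rand(500, 32), rng.randint(0, 2, 500)  # toy training set
x_tst, y_tst = rng.rand(20, 32), rng.randint(0, 2, 20)    # toy test points

# For each test point, sort all training points by Euclidean distance.
x_tst_knn_gt = get_true_KNN(x_trn, x_tst)
# Recursion (farthest to nearest): sp[alpha_N] = 1[y_{alpha_N} = y_tst] / N, then
# sp[alpha_i] = sp[alpha_{i+1}] + (1[y_{alpha_i} = y_tst] - 1[y_{alpha_{i+1}} = y_tst]) / K * min(K, i+1) / (i+1)
sp_gt = compute_single_unweighted_knn_class_shapley(x_trn, y_trn, x_tst_knn_gt, y_tst, K=1)
print(sp_gt.shape)  # (n_test, n_train): one Shapley value per training point per test point
```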
17 | 18 | -------------------------------------------------------------------------------- /exact_sp.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from tqdm import tqdm 3 | 4 | 5 | def get_true_KNN(x_trn, x_tst): 6 | N = x_trn.shape[0] 7 | N_tst = x_tst.shape[0] 8 | x_tst_knn_gt = np.zeros((N_tst, N)) 9 | for i_tst in tqdm(range(N_tst)): 10 | dist_gt = np.zeros(N) 11 | for i_trn in range(N): 12 | dist_gt[i_trn] = np.linalg.norm(x_trn[i_trn, :] - x_tst[i_tst, :], 2) 13 | x_tst_knn_gt[i_tst, :] = np.argsort(dist_gt) 14 | return x_tst_knn_gt.astype(int) 15 | 16 | 17 | def compute_single_unweighted_knn_class_shapley(x_trn, y_trn, x_tst_knn_gt, y_tst, K): 18 | N = x_trn.shape[0] 19 | N_tst = x_tst_knn_gt.shape[0] 20 | sp_gt = np.zeros((N_tst, N)) 21 | for j in tqdm(range(N_tst)): 22 | sp_gt[j, x_tst_knn_gt[j, -1]] = (y_trn[x_tst_knn_gt[j, -1]] == y_tst[j]) / N 23 | for i in np.arange(N - 2, -1, -1): 24 | sp_gt[j, x_tst_knn_gt[j, i]] = sp_gt[j, x_tst_knn_gt[j, i + 1]] + \ 25 | (int(y_trn[x_tst_knn_gt[j, i]] == y_tst[j]) - 26 | int(y_trn[x_tst_knn_gt[j, i + 1]] == y_tst[j])) / K * min([K, i + 1]) / ( 27 | i + 1) 28 | return sp_gt 29 | -------------------------------------------------------------------------------- /exact_sp_example.py: -------------------------------------------------------------------------------- 1 | import time 2 | import numpy as np 3 | from sklearn.utils import shuffle 4 | from exact_sp import get_true_KNN, compute_single_unweighted_knn_class_shapley 5 | 6 | data = np.load('CIFAR10_resnet50-keras_features.npz') 7 | x_trn = np.vstack((data['features_training'], data['features_testing'])) 8 | y_trn = np.hstack((data['labels_training'], data['labels_testing'])) 9 | 10 | x_trn, y_trn = shuffle(x_trn, y_trn, random_state=0) 11 | 12 | x_trn = np.reshape(x_trn, (-1, 2048)) 13 | x_tst, y_tst = x_trn[:100], y_trn[:100] 14 | x_val, y_val = x_trn[100:1100], y_trn[100:1100] 15 | x_trn, y_trn = x_trn[1100:], y_trn[1100:] 16 | 17 | # we are using 1-nn classifier 18 | K = 1 19 | 20 | start = time.time() 21 | x_tst_knn_gt = get_true_KNN(x_trn, x_tst) 22 | end1 = time.time() - start 23 | print(end1) 24 | 25 | start = time.time() 26 | x_val_knn_gt = get_true_KNN(x_trn, x_val) 27 | val_end1 = time.time() - start 28 | print(val_end1) 29 | 30 | start = time.time() 31 | sp_gt = compute_single_unweighted_knn_class_shapley(x_trn, y_trn, x_tst_knn_gt, y_tst, K) 32 | end2 = time.time() - start 33 | 34 | start = time.time() 35 | val_sp_gt = compute_single_unweighted_knn_class_shapley(x_trn, y_trn, x_val_knn_gt, y_val, K) 36 | val_end2 = time.time() - start 37 | 38 | print(end2) 39 | print(val_end2) 40 | 41 | print("time to get exact sp values for test set:", (end1 + end2) / len(x_tst)) 42 | print("time to get exact sp values for val set:", (val_end1 + val_end2) / len(x_val)) 43 | 44 | np.save('tst_exact_sp_gt', sp_gt) 45 | np.save('val_exact_sp_gt', val_sp_gt) 46 | -------------------------------------------------------------------------------- /use_case/Noisy Label, Watermarking/PlotRuntime.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import os 5 | import seaborn as sns 6 | 7 | sns.set() 8 | 9 | x = np.array([10, 100, 200, 400, 800, 1000, 5000, 10000, 20000, 50000]) 10 | # knn = np.array([0.0003832538922627767, 0.004426650206247966, 0.01631486415863037, 0.06262378295262655, 0.25503607193628947, 0.3868168075879415, 
6.302051556110382, 25.532700236638387, 102.27655944824218]) * 60 11 | # loo = np.array([0.02149387200673421, 0.5834330042203267, 2.0351594130198163, 6.966519888242086, 24.5041117866834, 37.45188350280126]) * 60 12 | # tmc = np.array([0.7461043953895569, 144.0786436120669]) * 60 13 | # g = np.array([0.5796960711479187, 3.785581676165263, 9.595915234088897, 14.533872322241466, 46.74548430840174, 57.338612226645154]) * 60 14 | 15 | 16 | knn = np.array([0.0769142468770345, 0.677141539255778, 1.653036856651306, 3.4390464584032694, 8.59050339460373, 12.708731484413146]) * 60 17 | loo = np.array([0.7347790956497192, 66.44814310471217]) * 60 18 | tmc = np.array([11.529986302057901]) * 60 19 | g = np.array([0.12539432843526205, 0.9315359711647033, 3.903498136997223, 9.672818299134573, 50.83118432760239,150.22751605113348]) * 60 20 | 21 | 22 | plt.loglog(x[0:loo.shape[0]], loo, '^-', color = 'olive', label = "Leave-One-Out") 23 | plt.loglog(x[0:tmc.shape[0]], tmc, 's-', color = 'blue', label = "TMC-Shapley") 24 | plt.loglog(x[0:knn.shape[0]], knn, 'o-', color='purple', label = 'KNN-Shapley') 25 | plt.loglog(x[0:g.shape[0]], g, 's-', color = 'orange', label = "G-Shapley") 26 | 27 | plt.xlabel('Number of training data points in log scale') 28 | plt.ylabel('Running time in log scale (s)') 29 | plt.legend(loc='lower right') 30 | plt.show() -------------------------------------------------------------------------------- /use_case/Noisy Label, Watermarking/pytorch_fitmodule/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys 3 | import torch 4 | 5 | from functools import partial 6 | from torch.utils.data import DataLoader, TensorDataset 7 | 8 | 9 | ##### Data utils ##### 10 | 11 | def get_loader(X, y=None, batch_size=1, shuffle=False): 12 | """Convert X and y Tensors to a DataLoader 13 | 14 | If y is None, use a dummy Tensor 15 | """ 16 | if y is None: 17 | y = torch.Tensor(X.size()[0]) 18 | return DataLoader(TensorDataset(X, y), batch_size, shuffle) 19 | 20 | 21 | ##### Logging ##### 22 | 23 | def add_metrics_to_log(log, metrics, y_true, y_pred, prefix=''): 24 | for metric in metrics: 25 | q = metric(y_true, y_pred) 26 | log[prefix + metric.__name__] = q 27 | return log 28 | 29 | 30 | def log_to_message(log, precision=4): 31 | fmt = "{0}: {1:." 
+ str(precision) + "f}" 32 | return " ".join(fmt.format(k, v) for k, v in log.items()) 33 | 34 | 35 | class ProgressBar(object): 36 | """Cheers @ajratner""" 37 | 38 | def __init__(self, n, length=40): 39 | # Protect against division by zero 40 | self.n = max(1, n) 41 | self.nf = float(n) 42 | self.length = length 43 | # Precalculate the i values that should trigger a write operation 44 | self.ticks = set([round(i/100.0 * n) for i in range(101)]) 45 | self.ticks.add(n-1) 46 | self.bar(0) 47 | 48 | def bar(self, i, message=""): 49 | """Assumes i ranges through [0, n-1]""" 50 | if i in self.ticks: 51 | b = int(np.ceil(((i+1) / self.nf) * self.length)) 52 | sys.stdout.write("\r[{0}{1}] {2}%\t{3}".format( 53 | "="*b, " "*(self.length-b), int(100*((i+1) / self.nf)), message 54 | )) 55 | sys.stdout.flush() 56 | 57 | def close(self, message=""): 58 | # Move the bar to 100% before closing 59 | self.bar(self.n-1) 60 | sys.stdout.write("{0}\n\n".format(message)) 61 | sys.stdout.flush() 62 | -------------------------------------------------------------------------------- /use_case/DataAcquisition/dknn.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import time 4 | import numpy as np 5 | import sklearn 6 | from utils import * 7 | from Dknn import * 8 | from plot import * 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | import torch.optim as optim 13 | from tqdm import tqdm, tqdm_notebook 14 | 15 | batch_size = 32 16 | data = MNIST(one_hot=False) 17 | device = torch.device('cuda') 18 | 19 | #cnn = CNN().to(device) 20 | #optimizer = optim.Adam(cnn.parameters()) 21 | #criterion = nn.CrossEntropyLoss() 22 | print('---1. load data---') 23 | x_train = torch.from_numpy(data.x_train).view(-1, 28, 28).unsqueeze(1).unsqueeze(1) 24 | y_train = torch.from_numpy(data.y_train).view(-1,1).long() 25 | 26 | x_test = torch.from_numpy(data.x_test).view(-1, 28, 28).unsqueeze(1).unsqueeze(1) 27 | y_test = torch.from_numpy(data.y_test).view(-1,1).long() 28 | 29 | #train(cnn, device, x_train, y_train, optimizer, criterion, 1, len(data.x_train) // 5) 30 | 31 | #accuracy, avg_loss = evaluate(cnn, device, x_train, y_train, criterion) 32 | #print(f'[Train] Accuracy: {100 * accuracy:5.2f}%, loss: {avg_loss:7.4f}') 33 | #accuracy, avg_loss = evaluate(cnn, device, x_test, y_test, criterion) 34 | #print(f'[Test] Accuracy: {100 * accuracy:5.2f}%, loss: {avg_loss:7.4f}') 35 | print('---2. build cnn model and calculate deep features---') 36 | deep_feats = [] 37 | targets = [] 38 | 39 | cnn = CNN().to(device) 40 | optimizer = optim.Adam(cnn.parameters()) 41 | criterion = nn.CrossEntropyLoss() 42 | 43 | for i, (X, y) in tqdm_notebook(enumerate(zip(x_train, y_train)), total = len(x_train)): 44 | X = X.to(device) 45 | deep_feat, y_pre = cnn(X) 46 | deep_feats.append(deep_feat.view(deep_feat.size(0), -1).cpu().detach().numpy()) 47 | targets.append(y.numpy()) 48 | deep_feats = np.concatenate(deep_feats) # deep features are not normalized 49 | targets = np.concatenate(targets) 50 | print(deep_feats[:2]) 51 | print(deep_feats.shape, targets.shape) 52 | 53 | print('---3. 
calculate knn shapley---') 54 | train_size = 1000 55 | k = 4 56 | knn_values = [[] for _ in range(k)] 57 | sx_train, sy_train = x_train[:train_size], y_train[:train_size] 58 | sx_test, sy_test = x_test[-train_size:], y_test[-train_size:] 59 | 60 | for i in range(k): 61 | print("neighbour number:", i+1) 62 | knn_values[i] = knn_shapley(i+1, deep_feats[:train_size], deep_feats[train_size:train_size*2], 63 | targets[:train_size], targets[train_size:train_size*2]) 64 | print(len(knn_values[0])) 65 | print(knn_values[0][:10]) 66 | print('---4. draw plot---') 67 | plot_knn(knn_values, sx_train, sy_train, sx_test, sy_test, deep_feats) 68 | -------------------------------------------------------------------------------- /use_case/Noisy Label, Watermarking/Label.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import tensorflow as tf 4 | import numpy as np 5 | from Shapley import ShapNN 6 | from DShap import DShap 7 | from tensorflow.examples.tutorials.mnist import input_data 8 | import pickle 9 | import argparse 10 | import copy 11 | import random 12 | 13 | parser = argparse.ArgumentParser(description = None) 14 | parser.add_argument('--num', type=int, required = True) 15 | args = parser.parse_args() 16 | 17 | x = args.num 18 | 19 | fashion = input_data.read_data_sets("fashion_data/", one_hot=True) 20 | 21 | X_data = [] 22 | y_data = [] 23 | 24 | for _x, _y in zip(fashion.train.images, np.argmax(fashion.train.labels, axis=1)): 25 | if _y == 0: 26 | X_data.append(_x) 27 | y_data.append(0) 28 | elif _y == 6: 29 | X_data.append(_x) 30 | y_data.append(1) 31 | X_data = np.array(X_data) 32 | y_data = np.array(y_data) 33 | 34 | X_test_data = X_data[x:x+x//10] 35 | y_test_data = y_data[x:x+x//10] 36 | X_data = X_data[0:x] 37 | y_data = y_data[0:x] 38 | y_data_orig = copy.deepcopy(y_data) 39 | 40 | X_benign = [] 41 | y_benign = [] 42 | 43 | X_flip = [] 44 | y_flip = [] 45 | 46 | flip = np.zeros(x) 47 | for i in range(x // 10): 48 | j = np.random.randint(0, x) 49 | while flip[j] == 1: 50 | j = np.random.randint(0, x) 51 | flip[j] = 1 52 | y_data[j] = 1 - y_data[j] 53 | X_flip.append(X_data[j]) 54 | y_flip.append(y_data[j]) 55 | for i in range(x): 56 | if flip[i] == 0: 57 | X_benign.append(X_data[i]) 58 | y_benign.append(y_data[i]) 59 | pickle.dump(flip, open('flip.pkl', 'wb')) 60 | 61 | # dshap = DShap(X=X_data, 62 | # y=y_data_orig, 63 | # X_test=X_test_data, 64 | # y_test=y_test_data, 65 | # num_test=x//10, 66 | # model_family='NN', 67 | # nodump=True) 68 | # dshap.model.fit(X_data, y_data_orig) 69 | # print("Original model training accuracy for benign data: %g" % dshap.model.score(X_benign, y_benign)) 70 | # dshap = DShap(X=X_data, 71 | # y=y_data, 72 | # X_test=X_test_data, 73 | # y_test=y_test_data, 74 | # num_test=x//10, 75 | # model_family='NN', 76 | # nodump=True) 77 | # dshap.model.fit(X_data, y_data) 78 | # print("Modified model training accuracy for benign data: %g" % dshap.model.score(X_benign, y_benign)) 79 | # print("Modified model training accuracy for flipped data: %g" % dshap.model.score(X_flip, y_flip)) 80 | 81 | dshap = DShap(X=X_data, 82 | y=y_data, 83 | X_test=X_test_data, 84 | y_test=y_test_data, 85 | num_test=x//10, 86 | model_family='NN') 87 | dshap.run(save_every=10, err = 0.5) 88 | print(y_data - dshap.y) -------------------------------------------------------------------------------- /use_case/Noisy Label, Watermarking/Watermark.py: -------------------------------------------------------------------------------- 1 | 
#!/usr/bin/python 2 | 3 | import tensorflow as tf 4 | import numpy as np 5 | from Shapley import ShapNN 6 | from DShap import DShap 7 | from PIL import Image 8 | import pickle 9 | import argparse 10 | import copy 11 | import torch 12 | import torch.nn.functional as F 13 | import matplotlib 14 | matplotlib.use('TkAgg') 15 | import matplotlib.pyplot as plt 16 | 17 | parser = argparse.ArgumentParser(description=None) 18 | parser.add_argument('--num', default=100, type=int) 19 | args = parser.parse_args() 20 | 21 | x = args.num 22 | 23 | data = pickle.load(open("./SVHN_data/data.pkl", "rb")) 24 | X_data = data["X_train"].astype('float32') 25 | y_data = data["y_train"].astype('int64') 26 | X_test_data = data["X_test"].astype('float32') 27 | y_test_data = data["y_test"].astype('int64') 28 | 29 | X_data = np.array(X_data)[0:x] 30 | y_data = y_data[0:x] 31 | X_data_orig = copy.deepcopy(X_data) 32 | y_data_orig = copy.deepcopy(y_data) 33 | X_test_data = np.array(X_test_data)[0:x//10] 34 | y_test_data = y_test_data[0:x//10] 35 | 36 | X_benign = [] 37 | y_benign = [] 38 | 39 | X_poison = [] 40 | y_poison = [] 41 | watermarked = np.zeros(x) 42 | with open('./CIFAR_data/watermarked_labels.txt','r') as f: 43 | for i, line in zip(range(100), f): 44 | j = np.random.randint(x) 45 | while watermarked[j] == 1: 46 | j = np.random.randint(x) 47 | watermarked[j] = 1 48 | img = np.asarray(Image.open("./CIFAR_data/trigger_set/%d.jpg" % (i + 1)).convert('RGB').resize((32, 32))).transpose(2, 0, 1) 49 | lbl = int(float(line.strip('\n'))) 50 | X_poison.append(img) 51 | y_poison.append(lbl) 52 | X_data[j] = img 53 | y_data[j] = lbl 54 | 55 | for i in range(x): 56 | if watermarked[i] == 0: 57 | X_benign.append(X_data[i]) 58 | y_benign.append(y_data[i]) 59 | pickle.dump(watermarked, open("watermarked.pkl", "wb")) 60 | 61 | 62 | dshap = DShap(X=X_data_orig, 63 | y=y_data_orig, 64 | X_test=X_test_data, 65 | y_test=y_test_data, 66 | num_test=x//10, 67 | model_family='ResNet', 68 | nodump=True) 69 | dshap.model.fit(X_data_orig, y_data_orig) 70 | print("Original model training accuracy for benign data: %g" % dshap.model.score(X_data_orig, y_data_orig)) 71 | dshap = DShap(X=X_data, 72 | y=y_data, 73 | X_test=X_test_data, 74 | y_test=y_test_data, 75 | num_test=x//10, 76 | model_family='ResNet', 77 | num_classes=10, 78 | nodump=True) 79 | dshap.model.fit(X_data, y_data) 80 | print("Modified model training accuracy for benign data: %g" % dshap.model.score(X_data_orig, y_data_orig)) 81 | print("Modified model training accuracy for poisoned data: %g" % dshap.model.score(X_poison, y_poison)) 82 | 83 | dshap = DShap(X=X_data, 84 | y=y_data, 85 | X_test=X_test_data, 86 | y_test=y_test_data, 87 | num_test=x//10, 88 | num_classes=10, 89 | model_family='ResNet') 90 | dshap.run(save_every=10, err = 0.5) -------------------------------------------------------------------------------- /use_case/DataAcquisition/uci_knn.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import time 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | import sklearn 7 | from Shapley import ShapNN 8 | from DShap_run import DShap 9 | from shap_utils import * 10 | from utils import * 11 | import pickle 12 | 13 | path = "./exp_data/DS_uci/" 14 | with open(path+'data.pkl', 'rb') as f: 15 | data = pickle.load(f) 16 | x_train = data["x_train"] 17 | y_train = data["y_train"] 18 | x_test = data["x_test"] 19 | y_test = data["y_test"] 20 | x_heldout = data["x_heldout"] 21 | y_heldout = 
data["y_heldout"] 22 | from models.uci import * 23 | from utils import * 24 | #data preparation 25 | batch_size = 1024 26 | epochs = 30 27 | 28 | x_train = torch.from_numpy(x_train).contiguous().view(-1, 254) 29 | y_train = torch.from_numpy(y_train).view(-1,).long() 30 | print("train_size:", x_train.shape) 31 | x_test = torch.from_numpy(x_test).contiguous().view(-1, 254) 32 | y_test = torch.from_numpy(y_test).view(-1,).long() 33 | print("test_size:", x_test.shape) 34 | x_heldout = torch.from_numpy(x_heldout).contiguous().view(-1, 254) 35 | y_heldout = torch.from_numpy(y_heldout).view(-1,).long() 36 | print("heldout_size:", x_heldout.shape) 37 | 38 | 39 | device = torch.device('cuda') 40 | uci = UCI().to(device) 41 | optimizer = optim.Adam(uci.parameters()) 42 | criterion = nn.CrossEntropyLoss() 43 | 44 | # print(y_train.shape) 45 | train(uci, device, x_train, y_train, batch_size, optimizer, criterion, epochs) 46 | accuracy, avg_loss = evaluate(uci, device, x_train, y_train, batch_size, criterion) 47 | print(f'[Train] Accuracy: {100 * accuracy:5.2f}%, loss: {avg_loss:7.4f}') 48 | accuracy, avg_loss = evaluate(uci, device, x_heldout, y_heldout, batch_size, criterion) 49 | print(f'[Test] Accuracy: {100 * accuracy:5.2f}%, loss: {avg_loss:7.4f}') 50 | 51 | 52 | 53 | deep_f = [] 54 | targets = [] 55 | x_deep = torch.cat((x_train, x_test), 0) 56 | y_deep = torch.cat((y_train, y_test), 0) 57 | for X, y in batch(x_deep, y_deep, batch_size): 58 | X = X.to(device).float() 59 | fc3, y_pre = uci(X) 60 | deep_f.append(fc3.view(fc3.size(0), -1).cpu().detach().numpy()) 61 | # targets.append(y.numpy()) 62 | 63 | deep_f = np.concatenate(deep_f) # deep features are not normalized 64 | # targets = np.concatenate(targets) 65 | print(deep_f.shape) 66 | 67 | import math 68 | kmin = 5 69 | kmax = 6 70 | kinterval = 5 71 | fc1_knn_values = [[] for _ in range(math.ceil((kmax-kmin)/kinterval))] # deep features 72 | loo_fc1_knn_values = [[] for _ in range(math.ceil((kmax-kmin)/kinterval))] # deep features 73 | 74 | for i, k in enumerate(range(kmin, kmax, kinterval)): 75 | print("neighbour number:", k) 76 | fc1_knn_values[i],*_ = old_knn_shapley(k, deep_f[:x_train.shape[0]], deep_f[x_train.shape[0]:], 77 | y_deep[:x_train.shape[0]], y_deep[x_train.shape[0]:]) 78 | loo_fc1_knn_values[i],*_ = loo_knn_shapley(k, deep_f[:x_train.shape[0]], deep_f[x_train.shape[0]:], 79 | y_deep[:x_train.shape[0]], y_deep[x_train.shape[0]:]) 80 | 81 | import pickle 82 | store_data = './exp_data/DS_uci/' 83 | f = open(store_data+'knn.pkl', 'wb') 84 | data_write = {"knn_values": fc1_knn_values, "loo_fc1_knn_values": loo_fc1_knn_values} 85 | pickle.dump(data_write, f) 86 | f.close() -------------------------------------------------------------------------------- /use_case/Noisy Label, Watermarking/PlotAccuracy.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import os 5 | import seaborn as sns 6 | 7 | x = [0, 10, 20, 30, 40, 50, 60, 70] 8 | 9 | # mnist 5000: 10 | knn = [0.9955555555555555, 0.9515555555555556, 0.9282222222222222, 0.9071111111111111, 0.8886666666666667, 0.8673333333333333, 0.8546666666666667, 0.83] 11 | koo = [0.9955555555555555, 0.9846666666666667, 0.974, 0.9591111111111111, 0.9491111111111111, 0.9335555555555556, 0.922, 0.8888888888888888] 12 | ran = [0.9955555555555555, 0.9868888888888889, 0.9762222222222222, 0.9671111111111111, 0.9531111111111111, 0.934, 0.9208888888888889, 0.8808888888888889] 13 | 14 | # knn 
= [0.972, 0.726, 0.164, 0.02, 0.016, 0.016, 0.012, 0.024] 15 | # koo = [0.972, 0.942, 0.904, 0.88, 0.826, 0.752, 0.67, 0.462] 16 | # ran = [0.972, 0.942, 0.916, 0.88, 0.842, 0.724, 0.664, 0.472] 17 | 18 | 19 | # fashion 1000: 20 | # knn = [0.9857142857142858, 0.9373626373626374, 0.8736263736263736, 0.856043956043956, 0.8340659340659341, 0.8142857142857143, 0.8208791208791208, 0.8296703296703297] 21 | # koo = [0.9857142857142858, 0.9703296703296703, 0.9483516483516483, 0.9384615384615385, 0.9131868131868132, 0.9076923076923077, 0.8769230769230769, 0.8670329670329671] 22 | # loo = [0.9857142857142858, 0.9659340659340659, 0.9538461538461539, 0.9362637362637363, 0.9098901098901099, 0.9, 0.865934065934066, 0.8505494505494505] 23 | # tmc = [0.9857142857142858, 0.945054945054945, 0.9120879120879121, 0.8604395604395605, 0.8241758241758241, 0.8120879120879121, 0.8, 0.7945054945054945] 24 | # g = [0.9857142857142858, 0.9131868131868132, 0.8263736263736263, 0.7791208791208791, 0.7571428571428571, 0.6494505494505495, 0.4868131868131868, 0.4868131868131868] 25 | # ran = [0.9857142857142858, 0.9758241758241758, 0.9582417582417583, 0.9428571428571428, 0.9197802197802197, 0.9065934065934066, 0.8901098901098901, 0.8428571428571429] 26 | 27 | # knn = [0.9888888888888889, 0.6555555555555556, 0.4, 0.16666666666666666, 0.06666666666666667, 0.1111111111111111, 0.1, 0.1] 28 | # koo = [0.9888888888888889, 0.9555555555555556, 0.9111111111111111, 0.9555555555555556, 0.9222222222222223, 0.8555555555555555, 0.8222222222222222, 0.6777777777777778] 29 | # loo = [0.9888888888888889, 0.9888888888888889, 1.0, 0.9888888888888889, 0.9666666666666667, 0.9111111111111111, 0.6777777777777778, 0.5555555555555556] 30 | # tmc = [0.9888888888888889, 0.7111111111111111, 0.3333333333333333, 0.23333333333333334, 0.13333333333333333, 0.13333333333333333, 0.14444444444444443, 0.17777777777777778] 31 | # g = [0.9888888888888889, 0.7888888888888889, 0.5777777777777777, 0.5444444444444444, 0.5444444444444444, 0.5222222222222223, 0.5666666666666667, 0.5666666666666667] 32 | # ran = [0.9888888888888889, 0.9666666666666667, 0.9666666666666667, 0.9, 0.8333333333333334, 0.8, 0.5444444444444444, 0.4888888888888889] 33 | 34 | 35 | # Pubfig 1000: 36 | # knn = [1, 0.9322222222222222, 0.8555555555555555, 0.7833333333333333, 0.7044444444444444, 0.5844444444444444, 0.49444444444444446, 0.43444444444444447] 37 | # koo = [1.0, 0.9366666666666666, 0.8922222222222222, 0.8388888888888889, 0.7822222222222223, 0.6777777777777778, 0.5966666666666667, 0.49333333333333335] 38 | # ran = [1, 0.9488888888888889, 0.8988888888888888, 0.8333333333333334, 0.7833333333333333, 0.6944444444444444, 0.5755555555555556, 0.49333333333333335] 39 | 40 | # knn = [1.0, 0.97, 0.99, 0.91, 0.82, 0.55, 0.3, 0.1] 41 | # koo = [1.0, 1.0, 0.94, 0.96, 0.97, 0.96, 0.95, 0.89] 42 | # ran = [1.0, 1.0, 0.96, 0.94, 0.91, 0.82, 0.82, 0.75] 43 | 44 | plt.plot(x, np.array(knn) * 100, 'o-', color = 'purple', label = 'KNN-Shapley') 45 | plt.plot(x, np.array(koo) * 100, 'o-', color='violet', label = 'KNN-LOO') 46 | # plt.plot(x, np.array(loo) * 100, '^-', color = 'olive', label = "Leave-One-Out") 47 | # plt.plot(x, np.array(tmc) * 100, 's-', color = 'blue', label = "TMC-Shapley") 48 | # plt.plot(x, np.array(g) * 100, 's-', color = 'orange', label = "G-Shapley") 49 | plt.plot(x, np.array(ran) * 100, '--', color='red', label = "Random") 50 | 51 | plt.xlabel('Fraction of data removed (%)') 52 | plt.ylabel('Model accuracy for benign data (%)') 53 | plt.legend(loc='lower left') 54 | plt.show() 
-------------------------------------------------------------------------------- /use_case/Noisy Label, Watermarking/PlotLabel.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import seaborn as sns 6 | 7 | sns.set() 8 | flip = pickle.load(open("flip.pkl", "rb"), encoding = "iso-8859-1") 9 | 10 | # loo_v = pickle.load(open("loo.pkl", "rb"), encoding = "iso-8859-1")["loo"] 11 | # loo_i = np.argsort(-loo_v)[::-1] 12 | # cnt = 0 13 | # f = [] 14 | # total = 0 15 | # cnt = 0 16 | # for i in range(len(loo_i)): 17 | # if flip[int(loo_i[i])] == 1: 18 | # total += 1 19 | # for i in range(len(loo_i)): 20 | # if flip[int(loo_i[i])] == 1: 21 | # cnt += 1 22 | # f.append(1.0 * cnt / total) 23 | # x = np.array(range(1, len(loo_i) + 1)) / len(loo_i) * 100 24 | # x = np.append(x[0:-1:10], x[-1]) 25 | # f = np.append(f[0:-1:10], f[-1]) 26 | # plt.plot(x, np.array(f) * 100, '^-', color = 'olive', label = "Leave-One-Out", zorder=4, alpha=0.8) 27 | 28 | # tmc_v = pickle.load(open("tmc.pkl", "rb"), encoding = "iso-8859-1") 29 | # tmc_i = np.argsort(-tmc_v)[::-1] 30 | # cnt = 0 31 | # f = [] 32 | # total = 0 33 | # cnt = 0 34 | # for i in range(len(tmc_i)): 35 | # if flip[int(tmc_i[i])] == 1: 36 | # total += 1 37 | # for i in range(len(tmc_i)): 38 | # if flip[int(tmc_i[i])] == 1: 39 | # cnt += 1 40 | # f.append(1.0 * cnt / total) 41 | # x = np.array(range(1, len(tmc_i) + 1)) / len(tmc_i) * 100 42 | # x = np.append(x[0:-1:10], x[-1]) 43 | # f = np.append(f[0:-1:10], f[-1]) 44 | # plt.plot(x, np.array(f) * 100, 's-', color = 'blue', label = "TMC-Shapley") 45 | 46 | # # Only LogisticRegression and NN model have G-Shapley metrics 47 | # g_v = pickle.load(open("g.pkl", "rb"), encoding = "iso-8859-1") 48 | # g_i = np.argsort(-g_v)[::-1] 49 | # cnt = 0 50 | # f = [] 51 | # total = 0 52 | # cnt = 0 53 | # for i in range(len(g_i)): 54 | # if flip[int(g_i[i])] == 1: 55 | # total += 1 56 | # for i in range(len(g_i)): 57 | # if flip[int(g_i[i])] == 1: 58 | # cnt += 1 59 | # f.append(1.0 * cnt / total) 60 | # x = np.array(range(1, len(g_i) + 1)) / len(g_i) * 100 61 | # x = np.append(x[0:-1:10], x[-1]) 62 | # f = np.append(f[0:-1:10], f[-1]) 63 | # plt.plot(x, np.array(f) * 100, 's-', color = 'orange', label = "G-Shapley", zorder=5) 64 | 65 | # for K in range(10, 11): 66 | # knn_v = pickle.load(open('looknn_{}.pkl'.format(K), 'rb'), encoding = "iso-8859-1") 67 | # knn_i = np.argsort(-knn_v)[::-1] 68 | # cnt = 0 69 | # f = [] 70 | # total = 0 71 | # cnt = 0 72 | # for i in range(len(knn_i)): 73 | # if flip[int(knn_i[i])] == 1: 74 | # total += 1 75 | # for i in range(len(knn_i)): 76 | # if flip[int(knn_i[i])] == 1: 77 | # cnt += 1 78 | # f.append(1.0 * cnt / total) 79 | # x = np.array(range(1, len(knn_i) + 1)) / len(knn_i) * 100 80 | # x = np.append(x[0:-1:10], x[-1]) 81 | # f = np.append(f[0:-1:10], f[-1]) 82 | # plt.plot(x, np.array(f) * 100, 'o-', color='violet', label = 'KNN-LOO-Shapley'.format(K), zorder=6, alpha=0.8) 83 | 84 | colors = ["#E6CAFF", "#DCB5FF", "#d3a4ff", "#CA8EFF", "#BE77FF", "#B15BFF", "#9F35FF", "#921AFF"] 85 | for K in range(10, 11): 86 | knn_v = pickle.load(open('knn_{}.pkl'.format(K), 'rb'), encoding = "iso-8859-1") 87 | knn_v = np.mean(knn_v, axis=1) 88 | knn_i = np.argsort(-knn_v)[::-1] 89 | cnt = 0 90 | f = [] 91 | total = 0 92 | cnt = 0 93 | for i in range(len(knn_i)): 94 | if flip[int(knn_i[i])] == 1: 95 | total += 1 96 | for i in range(len(knn_i)): 97 | if 
flip[int(knn_i[i])] == 1: 98 | cnt += 1 99 | f.append(1.0 * cnt / total) 100 | x = np.array(range(1, len(knn_i) + 1)) / len(knn_i) * 100 101 | x = np.append(x[0:-1:10], x[-1]) 102 | f = np.append(f[0:-1:10], f[-1]) 103 | plt.plot(x, np.array(f) * 100, 'o-', color='purple', label = 'KNN-Shapley'.format(K), linewidth=3) 104 | 105 | ran_v = np.random.rand(len(knn_v)) 106 | ran_i = np.argsort(-ran_v)[::-1] 107 | cnt = 0 108 | f = [] 109 | total = 0 110 | cnt = 0 111 | for i in range(len(ran_i)): 112 | if flip[int(ran_i[i])] == 1: 113 | total += 1 114 | for i in range(len(ran_i)): 115 | if flip[int(ran_i[i])] == 1: 116 | cnt += 1 117 | f.append(1.0 * cnt / total) 118 | x = np.array(range(1, len(ran_i) + 1)) / len(ran_i) * 100 119 | f = x / 100 120 | plt.plot(x, np.array(f) * 100, '--', color='red', label = "Random", zorder=7) 121 | 122 | 123 | 124 | plt.xlabel('Fraction of data inspected (%)') 125 | plt.ylabel('Fraction of incorrect labels (%)') 126 | plt.legend(loc='lower right') 127 | plt.show() -------------------------------------------------------------------------------- /use_case/Noisy Label, Watermarking/Poisoning.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import tensorflow as tf 4 | import numpy as np 5 | from Shapley import ShapNN 6 | from DShap import DShap 7 | from tensorflow.examples.tutorials.mnist import input_data 8 | from PIL import Image 9 | import pickle 10 | import argparse 11 | import os 12 | import copy 13 | from pubfig_data import PUBFIG83 14 | 15 | parser = argparse.ArgumentParser(description=None) 16 | parser.add_argument('--num', default=100, type=int) 17 | args = parser.parse_args() 18 | 19 | x = args.num 20 | 21 | # pubfig = PUBFIG83(root='./pubfig_data/pubfig83-aligned') 22 | # imgs = pubfig.imgs 23 | # X_data = [] 24 | # y_data = [] 25 | # for i in range(len(imgs)): 26 | # if imgs[i][1] >= 10: 27 | # continue 28 | # X_data.append(np.asarray(Image.open(imgs[i][0]).resize((32, 32))).astype("float32").transpose(2, 0, 1)) 29 | # y_data.append(imgs[i][1]) 30 | # X_data = np.array(X_data) 31 | # y_data = np.array(y_data) 32 | 33 | # state = np.random.get_state() 34 | # pickle.dump(state, open('state.pkl', 'wb')) 35 | # np.random.shuffle(X_data) 36 | # np.random.set_state(state) 37 | # np.random.shuffle(y_data) 38 | 39 | # X_test_data = X_data[x:x+x//10] 40 | # y_test_data = y_data[x:x+x//10] 41 | # X_data = X_data[0:x] 42 | # y_data = y_data[0:x] 43 | # X_data_orig = copy.deepcopy(X_data) 44 | # y_data_orig = copy.deepcopy(y_data) 45 | 46 | # X_benign = [] 47 | # y_benign = [] 48 | 49 | # X_poison = [] 50 | # y_poison = [] 51 | 52 | # watermarked = np.zeros(x) 53 | # filenames = os.listdir('./pubfig_data/watermarked') 54 | # filenames.sort(key=lambda x:int(x[:-4])) 55 | # with open('./pubfig_data/watermarked_labels.txt','r') as f: 56 | # for filename, line in zip(filenames, f): 57 | # num = np.random.randint(0, x) 58 | # while watermarked[num] == 1: 59 | # num = np.random.randint(0, x) 60 | # watermarked[num] = 1 61 | # img = np.asarray(Image.open("./pubfig_data/watermarked/" + filename).resize((32, 32))).astype("float32").transpose(2, 0, 1) 62 | # lbl = int(float(line.strip('\n'))) % 10 63 | # X_data[num] = img 64 | # y_data[num] = lbl 65 | # X_poison.append(img) 66 | # y_poison.append(lbl) 67 | # for i in range(x): 68 | # if watermarked[i] == 0: 69 | # X_benign.append(X_data[i]) 70 | # y_benign.append(y_data[i]) 71 | # pickle.dump(watermarked, open('watermarked.pkl', 'wb')) 72 | 73 | # dshap = 
DShap(X=X_data, 74 | # y=y_data, 75 | # X_test=X_test_data, 76 | # y_test=y_test_data, 77 | # num_test=x//10, 78 | # model_family='ResNet', 79 | # num_classes=10, 80 | # nodump=True) 81 | # dshap.run(save_every=10, err = 0.5) 82 | 83 | # pickle.dump(X_data, open("X_data.pkl", "wb")) 84 | # pickle.dump(y_data, open("y_data.pkl", "wb")) 85 | # pickle.dump(X_test_data, open("X_test_data.pkl", "wb")) 86 | # pickle.dump(y_test_data, open("y_test_data.pkl", "wb")) 87 | # pickle.dump(X_benign, open("X_benign.pkl", "wb")) 88 | # pickle.dump(y_benign, open("y_benign.pkl", "wb")) 89 | # pickle.dump(X_poison, open("X_poison.pkl", "wb")) 90 | # pickle.dump(y_poison, open("y_poison.pkl", "wb")) 91 | 92 | X_data = pickle.load(open("X_data.pkl", "rb")) 93 | y_data = pickle.load(open("y_data.pkl", "rb")) 94 | X_test_data = pickle.load(open("X_test_data.pkl", "rb")) 95 | y_test_data = pickle.load(open("y_test_data.pkl", "rb")) 96 | X_benign = pickle.load(open("X_benign.pkl", "rb")) 97 | y_benign = pickle.load(open("y_benign.pkl", "rb")) 98 | X_poison = pickle.load(open("X_poison.pkl", "rb")) 99 | y_poison = pickle.load(open("y_poison.pkl", "rb")) 100 | 101 | knn_v = pickle.load(open('looknn_10.pkl', 'rb'), encoding = "iso-8859-1") 102 | # knn_v = np.mean(knn_v, axis=1) 103 | knn_i = np.argsort(knn_v) 104 | 105 | benign_acc = [] 106 | backdoor_acc = [] 107 | 108 | for frac in range(0, 8): 109 | X_new = [] 110 | y_new = [] 111 | for i in range(len(knn_i)): 112 | if i < len(knn_i) * 0.1 * frac: 113 | continue 114 | X_new.append(X_data[knn_i[i]]) 115 | y_new.append(y_data[knn_i[i]]) 116 | dshap = DShap(X=np.array(X_new), 117 | y=np.array(y_new), 118 | X_test=X_test_data, 119 | y_test=y_test_data, 120 | num_test=x//10, 121 | model_family='ResNet', 122 | num_classes=10, 123 | nodump=True) 124 | dshap.model.fit(np.array(X_new), np.array(y_new)) 125 | bn = dshap.model.score(X_benign, y_benign) 126 | bd = dshap.model.score(X_poison, y_poison) 127 | benign_acc.append(bn) 128 | backdoor_acc.append(bd) 129 | print("Benign {}: {}".format(10*frac, bn)) 130 | print("Backdoor {}: {}".format(10*frac, bd)) 131 | 132 | print(benign_acc) 133 | print(backdoor_acc) -------------------------------------------------------------------------------- /use_case/Noisy Label, Watermarking/runtime.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import time 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | import sklearn 7 | from Shapley import ShapNN 8 | from DShap import DShap 9 | from shap_utils import * 10 | from utils import * 11 | import sklearn 12 | import pickle 13 | import torch 14 | import torch.nn as nn 15 | import torch.nn.functional as F 16 | import torch.optim as optim 17 | from tensorflow.examples.tutorials.mnist import input_data 18 | 19 | MEM_DIR = './' 20 | directory = './temp_runtime' 21 | store_data = './temp_runtime/data/' 22 | try: 23 | os.stat(directory) 24 | except: 25 | os.mkdir(directory) 26 | try: 27 | os.stat(store_data) 28 | except: 29 | os.mkdir(store_data) 30 | 31 | train_size = [10, 100, 200, 400, 800, 1000, 5000] 32 | time_knn = [] 33 | time_tmc = [] 34 | time_loo = [] 35 | time_g = [] 36 | mnist = input_data.read_data_sets("MNIST_data/", one_hot=True) 37 | 38 | 39 | # knn shapley Hyperparameters 40 | batch_size = 1024 41 | epochs = 30 42 | k = 5 43 | model = "ResNet" 44 | 45 | def load_CIFAR_batch(filename): 46 | with open(filename, 'rb') as f: 47 | datadict = pickle.load(f, encoding='latin1') 48 | X = datadict['data'] 49 | 
Y = datadict['labels'] 50 | X = X.reshape(10000, 3072).astype("float32") / 255 51 | Y = np.array(Y) 52 | return X, Y 53 | xs = [] 54 | ys = [] 55 | for b in range(1, 6): 56 | f = './CIFAR_data/data_batch_%d' % b 57 | X, Y = load_CIFAR_batch(f) 58 | xs.append(X) 59 | ys.append(Y) 60 | X_data = np.concatenate(xs) 61 | y_data = np.concatenate(ys) 62 | X_test_data, y_test_data = load_CIFAR_batch('./CIFAR_data/test_batch') 63 | X_test_data = np.array(X_test_data) 64 | y_test_data = np.array(y_test_data) 65 | 66 | 67 | for size in train_size: 68 | print('size:', size) 69 | # print('---1. calculate knn run time') 70 | num_test = size 71 | x_tr = X_data[0:size].astype("float32") 72 | y_tr = y_data[0:size].astype("int64") 73 | x_te = X_test_data[0:size].astype("float32") 74 | y_te = y_test_data[0:size].astype("int64") 75 | # start_time = time.time() 76 | # dshap = DShap(x_tr, y_tr, x_te, y_te, num_test, sources=None, model_family=model, metric='accuracy', 77 | # directory=directory, seed=0, nodump=True) 78 | # dshap.run(10, 0.5, knn_run=True, g_run=False, loo_run=False, tmc_run=False) 79 | # time_knn.append(str((time.time() - start_time)/60.0)) 80 | # # print("--- %s minutes ---" % ((time.time() - start_time)/60.0)) 81 | # # 82 | # print('knn time:', time_knn) 83 | # f = open(store_data+'knn_time.pkl', 'wb') 84 | # data = {'knn_runtime': time_knn, 'train_size': train_size} 85 | # pickle.dump(data, f) 86 | # f.close() 87 | print('---2. calculate g shapley run time') 88 | 89 | start_time = time.time() 90 | dshap = DShap(x_tr, y_tr, x_te, y_te, num_test, sources=None, model_family=model, metric='accuracy', 91 | directory=directory, seed=0, nodump=True) 92 | dshap.run(10, 0.5, knn_run=False, g_run=True, loo_run=False, tmc_run=False) 93 | time_g.append(str((time.time() - start_time)/60.0)) 94 | # print("--- %s minutes ---" % ((time.time() - start_time)/60.0)) 95 | 96 | print('time g:', time_g) 97 | f = open(store_data+'g_time.pkl', 'wb') 98 | data = {'g_runtime': time_g, 'train_size': train_size} 99 | pickle.dump(data, f) 100 | f.close() 101 | 102 | # print('---3. calculate loo run time') 103 | 104 | # start_time = time.time() 105 | # dshap = DShap(x_tr, y_tr, x_te, y_te, num_test, sources=None, model_family=model, metric='accuracy', 106 | # directory=directory, seed=0, nodump=True) 107 | # dshap.run(10, 0.5, knn_run=False, g_run=False, loo_run=True, tmc_run=False) 108 | # time_loo.append(str((time.time() - start_time)/60.0)) 109 | # # print("--- %s minutes ---" % ((time.time() - start_time)/60.0)) 110 | 111 | # print('time loo:', time_loo) 112 | # f = open(store_data+'loo_time.pkl', 'wb') 113 | # data = {'loo_runtime': time_loo, 'train_size': train_size} 114 | # pickle.dump(data, f) 115 | # f.close() 116 | 117 | # print('---4. 
calculate tmc run time') 118 | 119 | # start_time = time.time() 120 | # dshap = DShap(x_tr, y_tr, x_te, y_te, num_test, sources=None, model_family=model, metric='accuracy', 121 | # directory=directory, seed=0, nodump=True) 122 | # dshap.run(10, 0.5, knn_run=False, g_run=False, loo_run=False, tmc_run=True) 123 | # time_tmc.append(str((time.time() - start_time)/60.0)) 124 | # # print("--- %s minutes ---" % ((time.time() - start_time)/60.0)) 125 | 126 | # print('time tmc:', time_tmc) 127 | # f = open(store_data+'tmc_time.pkl', 'wb') 128 | # data = {'tmc_runtime': time_tmc, 'train_size': train_size} 129 | # pickle.dump(data, f) 130 | # f.close() -------------------------------------------------------------------------------- /LSH_sp_example.py: -------------------------------------------------------------------------------- 1 | import time 2 | from math import ceil 3 | 4 | import numpy as np 5 | from sklearn.utils import shuffle 6 | 7 | from LSH_sp import get_contrast, find_best_r_normalize, g_normalize, f_h, LSH 8 | import matplotlib.pyplot as plt 9 | 10 | data = np.load('CIFAR10_resnet50-keras_features.npz') 11 | x_trn = np.vstack((data['features_training'], data['features_testing'])) 12 | y_trn = np.hstack((data['labels_training'], data['labels_testing'])) 13 | 14 | x_trn, y_trn = shuffle(x_trn, y_trn, random_state=0) 15 | 16 | x_trn = np.reshape(x_trn, (-1, 2048)) 17 | x_tst, y_tst = x_trn[:100], y_trn[:100] 18 | x_val, y_val = x_trn[100:1100], y_trn[100:1100] 19 | x_trn, y_trn = x_trn[1100:], y_trn[1100:] 20 | 21 | # we are using 1-nn classifier 22 | K = 1 23 | eps = 0.1 24 | 25 | K_star = max(K, ceil(1 / eps)) 26 | get_contrast(x_val) 27 | dist_rand = np.load('eps0.1/dist_rand.npy') 28 | contrast = np.load('eps0.1/contrast.npy') 29 | dist_knn = np.load('eps0.1/dist_knn.npy') 30 | 31 | dist_rand = np.mean(dist_rand, axis=0) 32 | contrast = np.mean(contrast, axis=0)[K_star - 1] 33 | dist_knn = np.mean(dist_knn, axis=0)[K_star - 1] 34 | 35 | search_range = np.arange(1e-3, 10, 1e-3) 36 | r_vec_normalize = find_best_r_normalize(search_range, contrast) 37 | g_vec = g_normalize(contrast, r_vec_normalize) 38 | 39 | # plot g(C_K) vs r, we want g(C_k) to be small 40 | # search range, find r that minimize g, shape should be similar to convex 41 | g = g_normalize(contrast, search_range) 42 | plt.figure() 43 | plt.plot(search_range, g) 44 | plt.show() 45 | 46 | np.save('eps0.1/selected_param_r_' + str(K_star) + '.npy', r_vec_normalize) 47 | np.save('eps0.1/selected_param_g_' + str(K_star) + '.npy', g_vec) 48 | 49 | 50 | def equal(a, b): 51 | return int(a == b) 52 | 53 | 54 | def fine_tune_val(n_hash_table=10, alpha=0.5, file=False, val_sp_gt=None): 55 | t = r_vec_normalize 56 | n_trn = len(x_trn) 57 | n_hash_bit = int(np.ceil(np.log(n_trn) * alpha / np.log(1 / f_h(1, t)))) 58 | if file is True: 59 | print(n_hash_bit, file=open('eps0.1/log.txt', 'a')) 60 | else: 61 | print(n_hash_bit) 62 | 63 | start = time.time() 64 | lsh = LSH(n_hash_bit=n_hash_bit, n_hash_table=n_hash_table, x_trn=x_trn, y_trn=y_trn, dist_rand=dist_rand, 65 | equal=equal, t=t) 66 | runtime_build_hash = time.time() - start 67 | if file is True: 68 | print(runtime_build_hash, file=open('eps0.1/log.txt', 'a')) 69 | else: 70 | print(runtime_build_hash) 71 | 72 | start = time.time() 73 | x_val_knn_approx, nns_vec = lsh.get_approx_KNN(x_val, K_star) 74 | runtime_query = time.time() - start 75 | if file is True: 76 | print(runtime_query, file=open('eps0.1/log.txt', 'a')) 77 | else: 78 | print(runtime_query) 79 | 80 | start = time.time() 
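    # approximate Shapley values from the LSH-retrieved neighbors; training points never retrieved keep value 0 (see LSH.compute_approx_shapley in LSH_sp.py)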
81 | sp_approx = lsh.compute_approx_shapley(x_val_knn_approx, y_val, K) 82 | runtime_approx_value = time.time() - start 83 | if file is True: 84 | print('it takes %s to get appox knn value' % runtime_approx_value, file=open('eps0.1/log.txt', 'a')) 85 | else: 86 | print('it takes %s to get appox knn value' % runtime_approx_value) 87 | 88 | if val_sp_gt is not None: 89 | sp_err_inf_val = np.linalg.norm(val_sp_gt - sp_approx, ord=np.inf, axis=1) 90 | if file is True: 91 | print('max error %s' % np.percentile(sp_err_inf_val, 90), file=open('eps0.1/log.txt', 'a')) 92 | else: 93 | print('max error %s' % np.percentile(sp_err_inf_val, 90)) 94 | return lsh 95 | 96 | 97 | def fine_tune_test(lsh=None, file=False, sp_gt=None): 98 | start = time.time() 99 | x_tst_knn_approx, nns_vec = lsh.get_approx_KNN(x_tst, K_star) 100 | runtime_query = time.time() - start 101 | if file is True: 102 | print(runtime_query, file=open('eps0.1/log.txt', 'a')) 103 | else: 104 | print(runtime_query) 105 | 106 | start = time.time() 107 | sp_approx = lsh.compute_approx_shapley(x_tst_knn_approx, y_tst, K) 108 | runtime_approx_value = time.time() - start 109 | if file is True: 110 | print('it takes %s to get appox knn value' % runtime_approx_value, file=open('eps0.1/log.txt', 'a')) 111 | else: 112 | print('it takes %s to get appox knn value' % runtime_approx_value) 113 | 114 | if sp_gt is not None: 115 | sp_err_inf_val = np.linalg.norm(sp_gt - sp_approx, ord=np.inf, axis=1) 116 | if file is True: 117 | print('max error %s' % np.percentile(sp_err_inf_val, 90), file=open('eps0.1/log.txt', 'a')) 118 | else: 119 | print('max error %s' % np.percentile(sp_err_inf_val, 90)) 120 | return sp_approx, nns_vec 121 | 122 | 123 | val_sp_gt = np.load('val_exact_sp_gt.npy') 124 | tst_sp_gt = np.load('tst_exact_sp_gt.npy') 125 | lsh_82_05 = fine_tune_val(82, 0.5, val_sp_gt=val_sp_gt) 126 | sp_approx_82_05, nns_vec_82_05 = fine_tune_test(lsh=lsh_82_05, sp_gt=tst_sp_gt) 127 | 128 | np.save('eps0.1/sp_approx_05', sp_approx_82_05) 129 | np.save('eps0.1/lsh_82_05', lsh_82_05) 130 | -------------------------------------------------------------------------------- /LSH_sp.py: -------------------------------------------------------------------------------- 1 | import time 2 | import numpy as np 3 | from scipy.stats import norm 4 | from tqdm import tqdm 5 | 6 | 7 | def get_contrast(x_trn, save_dir='eps0.1/'): 8 | num_cores = 8 9 | mc_num = 5 10 | eps = 0.1 11 | n_trn = x_trn.shape[0] 12 | K = int(1 / eps) 13 | 14 | def compute_distance(i_q, query, x_trn, n_trn, K): 15 | dist_to_random = np.zeros(n_trn) 16 | for i_trn in range(n_trn): 17 | dist_to_random[i_trn] = np.linalg.norm(query - x_trn[i_trn, :], 2) 18 | dist_to_random_avg = np.mean(dist_to_random) 19 | dist_to_KNN = np.sort(dist_to_random)[:K] 20 | if i_q % 100 == 0: 21 | print(i_q) 22 | return dist_to_random_avg, dist_to_KNN 23 | 24 | def estimate_contrast(x_trn, query, K): 25 | # estimate empirical contrast 26 | n_trn = x_trn.shape[0] 27 | n_q = query.shape[0] 28 | from joblib import Parallel, delayed 29 | result = Parallel(n_jobs=num_cores)( 30 | delayed(compute_distance)(i_q, query[i_q, :], x_trn, n_trn, K) for i_q in range(n_q)) 31 | dist_to_random_avg = np.array([result[i][0] for i in range(n_q)]) 32 | dist_to_KKN = np.array([result[i][1] for i in range(n_q)]) 33 | assert dist_to_KKN.shape[0] == n_q 34 | dist_to_KNN_avg_q = np.mean(dist_to_KKN, axis=0) 35 | dist_to_random_avg_avg = np.mean(dist_to_random_avg) 36 | contrast = dist_to_random_avg_avg / dist_to_KNN_avg_q 37 | return 
dist_to_random_avg_avg, dist_to_KNN_avg_q, contrast 38 | 39 | contrast = [] 40 | dist_rand = [] 41 | dist_knn = [] 42 | for mc_i in range(mc_num): 43 | start = time.time() 44 | sample_ind_trn = np.random.choice(np.arange(n_trn), int(n_trn / 5 * 4), replace=False).astype(int) 45 | sample_ind_query = np.array( 46 | list(set(np.arange(n_trn).astype(int).tolist()) - set(sample_ind_trn.tolist()))).astype(int) 47 | dist_rand_, dist_knn_, contrast_ = estimate_contrast(x_trn[sample_ind_trn, :], x_trn[sample_ind_query, :], K) 48 | dist_rand.append(dist_rand_) 49 | dist_knn.append(dist_knn_) 50 | contrast.append(contrast_) 51 | 52 | print('monte carlo iteration%s ' % mc_i) 53 | elapsed_time = time.time() - start 54 | print('elapsed time is %s' % elapsed_time) 55 | dist_knn = np.array(dist_knn) 56 | contrast = np.array(contrast) 57 | dist_rand = np.array(dist_rand) 58 | np.save(save_dir + 'dist_rand', dist_rand) 59 | np.save(save_dir + 'dist_knn', dist_knn) 60 | np.save(save_dir + 'contrast', contrast) 61 | 62 | 63 | def f_h(x, r): 64 | y = 1 - 2 * norm.cdf(-r / x) - 2 / (np.sqrt(2 * np.pi) * r / x) * (1 - np.exp(-(r ** 2 / (2 * (x ** 2))))) 65 | return y 66 | 67 | 68 | def g_unnormalize(dist_rand, dist_knn, r): 69 | y = np.log(f_h(dist_knn, r)) / np.log(f_h(dist_rand, r)) 70 | return y 71 | 72 | 73 | def g_normalize(contrast, r): 74 | y = np.log(f_h(1 / contrast, r)) / np.log(f_h(1, r)) 75 | return y 76 | 77 | 78 | def find_best_r_normalize(search_range, contrast): 79 | y = g_normalize(contrast, search_range) 80 | min_ind = np.argmin(y) 81 | return search_range[min_ind] 82 | 83 | 84 | def find_best_r_unnormalize(search_range, dist_rand, dist_knn): 85 | y = g_unnormalize(dist_rand, dist_knn, search_range) 86 | min_ind = np.argmin(y) 87 | return search_range[min_ind] 88 | 89 | 90 | def lsh_function(t, x, w, b): 91 | # x is 1-d array 92 | h = np.floor((np.dot(w, x) + b) / t).astype(int) 93 | return h 94 | 95 | 96 | class LSH: 97 | def __init__(self, n_hash_bit, n_hash_table, x_trn, y_trn, dist_rand, equal, t=0.1): 98 | self.n_hash_bit = n_hash_bit 99 | self.n_hash_table = n_hash_table 100 | self.t = t # width of projections 101 | self.dist_rand = dist_rand 102 | self.x_trn = x_trn 103 | self.y_trn = y_trn 104 | self.N, self.dim = x_trn.shape 105 | self.equal = equal 106 | # draw w from a normal distribution (2-stable) 107 | self.w = np.random.normal(0, 1, (n_hash_table, n_hash_bit, self.dim)) 108 | # draw b from U[0,t] 109 | self.b = np.random.uniform(0, self.t, (n_hash_table, n_hash_bit)) 110 | self.x_trn_hash = [dict() for i in range(n_hash_table)] 111 | for i in tqdm(range(self.N)): 112 | hash_code_all = lsh_function(self.t, x_trn[i] / dist_rand, self.w, self.b) 113 | for l in range(n_hash_table): 114 | hash_code_trn = '.'.join(map(str, hash_code_all[l, :])) 115 | if hash_code_trn in self.x_trn_hash[l].keys(): 116 | self.x_trn_hash[l][hash_code_trn].append(i) 117 | else: 118 | self.x_trn_hash[l][hash_code_trn] = [i] 119 | 120 | def get_approx_KNN(self, x_tst, K): 121 | N_tst = x_tst.shape[0] 122 | x_tst_knn = np.ones((N_tst, K)) * (-1) 123 | nns_len = np.zeros(N_tst) 124 | for i_tst in tqdm(range(N_tst)): 125 | nns = [] 126 | for l in range(self.n_hash_table): 127 | hash_code_int = lsh_function(self.t, x_tst[i_tst] / self.dist_rand, self.w[l, :, :], self.b[l, :]) 128 | hash_code_test = '.'.join(map(str, hash_code_int)) 129 | if hash_code_test in self.x_trn_hash[l].keys(): 130 | nns += self.x_trn_hash[l][hash_code_test] 131 | nns = np.unique(nns) 132 | num_collide_elements = len(nns) 133 | if 
len(nns) > 0: 134 | dist = [np.linalg.norm(self.x_trn[i] / self.dist_rand - x_tst[i_tst] / self.dist_rand, 2) for i in nns] 135 | dist_min_ind = nns[np.argsort(dist)] 136 | if num_collide_elements < K: 137 | x_tst_knn[i_tst, :num_collide_elements] = dist_min_ind[:num_collide_elements] 138 | else: 139 | x_tst_knn[i_tst, :] = dist_min_ind[:K] 140 | # pdb.set_trace() 141 | nns_len[i_tst] = len(nns) 142 | if i_tst % 100 == 0: 143 | print('get approximate knn %s' % i_tst) 144 | return x_tst_knn.astype(int), nns_len 145 | 146 | def compute_approx_shapley(self, x_tst_knn, y_tst, K): 147 | N_tst, K_star = x_tst_knn.shape 148 | # flag_sufficient = (x_tst_knn[:,-1]>=0) 149 | sp_approx = np.zeros((N_tst, self.N)) 150 | for j in tqdm(range(N_tst)): 151 | non_nan_index = np.where(x_tst_knn[j, :] >= 0)[0] 152 | if len(non_nan_index) == 0: 153 | continue 154 | K_tot = non_nan_index[-1] 155 | if K_tot == self.N: 156 | sp_approx[j, x_tst_knn[j, self.N - 1]] = self.equal(self.y_trn[x_tst_knn[j, self.N - 1]], 157 | y_tst[j]) / self.N 158 | for i in np.arange(K_tot - 1, -1, -1): 159 | sp_approx[j, x_tst_knn[j, i]] = sp_approx[j, x_tst_knn[j, i + 1]] + ( 160 | self.equal(self.y_trn[x_tst_knn[j, i]], y_tst[j]) - self.equal( 161 | self.y_trn[x_tst_knn[j, i + 1]], y_tst[j])) / K * min([K, i + 1]) / (i + 1) 162 | 163 | return sp_approx 164 | -------------------------------------------------------------------------------- /use_case/Noisy Label, Watermarking/pytorch_fitmodule/fit_module.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from collections import OrderedDict 4 | from functools import partial 5 | from torch.autograd import Variable 6 | from torch.nn import CrossEntropyLoss, Module 7 | from torch.optim import SGD 8 | import numpy 9 | 10 | from .utils import add_metrics_to_log, get_loader, log_to_message, ProgressBar 11 | 12 | 13 | DEFAULT_LOSS = CrossEntropyLoss() 14 | DEFAULT_OPTIMIZER = partial(SGD, lr=0.001, momentum=0.9) 15 | 16 | 17 | class FitModule(Module): 18 | 19 | def eval_hessian(self, loss_grad): 20 | cnt = 0 21 | for g in loss_grad: 22 | g_vector = g.contiguous().view(-1) if cnt == 0 else torch.cat([g_vector, g.contiguous().view(-1)]) 23 | cnt = 1 24 | l = g_vector.size(0) 25 | hessian = torch.zeros(l, l) 26 | for idx in range(l): 27 | g_vector[idx].requires_grad = True 28 | grad2rd = torch.autograd.grad(g_vector[idx], self.parameters(), create_graph=True) 29 | cnt = 0 30 | for g in grad2rd: 31 | g2 = g.contiguous().view(-1) if cnt == 0 else torch.cat([g2, g.contiguous().view(-1)]) 32 | cnt = 1 33 | hessian[idx] = g2 34 | return hessian.cpu().data.numpy() 35 | 36 | def fit(self, 37 | X, 38 | y, 39 | batch_size=32, 40 | epochs=10, 41 | verbose=1, 42 | validation_split=0., 43 | validation_data=None, 44 | shuffle=True, 45 | initial_epoch=0, 46 | seed=None, 47 | loss=DEFAULT_LOSS, 48 | optimizer=DEFAULT_OPTIMIZER, 49 | metrics=None): 50 | """Trains the model similar to Keras' .fit(...) method 51 | 52 | # Arguments 53 | X: training data Tensor. 54 | y: target data Tensor.i 55 | batch_size: integer. Number of samples per gradient update. 56 | epochs: integer, the number of times to iterate 57 | over the training data arrays. 58 | verbose: 0, 1. Verbosity mode. 59 | 0 = silent, 1 = verbose. 60 | validation_split: float between 0 and 1: 61 | fraction of the training data to be used as validation data. 
62 | The model will set apart this fraction of the training data, 63 | will not train on it, and will evaluate 64 | the loss and any model metrics 65 | on this data at the end of each epoch. 66 | validation_data: (x_val, y_val) tuple on which to evaluate 67 | the loss and any model metrics 68 | at the end of each epoch. The model will not 69 | be trained on this data. 70 | shuffle: boolean, whether to shuffle the training data 71 | before each epoch. 72 | initial_epoch: epoch at which to start training 73 | (useful for resuming a previous training run) 74 | seed: random seed. 75 | optimizer: training optimizer 76 | loss: training loss 77 | metrics: list of functions with signatures `metric(y_true, y_pred)` 78 | where y_true and y_pred are both Tensors 79 | 80 | # Returns 81 | list of OrderedDicts with training metrics 82 | """ 83 | if seed and seed >= 0: 84 | torch.manual_seed(seed) 85 | # Prepare validation data 86 | if validation_data: 87 | X_val, y_val = validation_data 88 | elif validation_split and 0. < validation_split < 1.: 89 | split = int(X.size()[0] * (1. - validation_split)) 90 | X, X_val = X[:split], X[split:] 91 | y, y_val = y[:split], y[split:] 92 | else: 93 | X_val, y_val = None, None 94 | # Build DataLoaders 95 | if isinstance(X, numpy.ndarray): 96 | X = torch.from_numpy(X).float() 97 | if isinstance(y, numpy.ndarray): 98 | y = torch.from_numpy(y).float() 99 | if isinstance(X_val, numpy.ndarray): 100 | X_val = torch.from_numpy(X_val).float() 101 | if isinstance(y_val, numpy.ndarray): 102 | y_val = torch.from_numpy(y_val).float() 103 | train_data = get_loader(X, y, batch_size, shuffle) 104 | # Compile optimizer 105 | opt = optimizer(self.parameters()) 106 | # Run training loop 107 | logs = [] 108 | self.train() 109 | for t in range(initial_epoch, epochs): 110 | if verbose: 111 | print("Epoch {0} / {1}".format(t+1, epochs)) 112 | # Setup logger 113 | if verbose: 114 | pb = ProgressBar(len(train_data)) 115 | log = OrderedDict() 116 | epoch_loss = 0.0 117 | # Run batches 118 | for batch_i, batch_data in enumerate(train_data): 119 | # Get batch data 120 | X_batch = Variable(batch_data[0], requires_grad=True).float() 121 | y_batch = Variable(batch_data[1], requires_grad=True).long() 122 | # Backprop 123 | opt.zero_grad() 124 | y_batch_pred = self(X_batch).float() 125 | batch_loss = loss(y_batch_pred, y_batch) 126 | batch_loss.backward() 127 | opt.step() 128 | # Update status 129 | epoch_loss += batch_loss.item() 130 | for param in self.parameters(): 131 | param.requires_grad = True 132 | # print(y_val) 133 | # hessian = self.eval_hessian(y) 134 | # print(hessian.shape) 135 | # print(hessian) 136 | log['loss'] = float(epoch_loss) / (batch_i + 1) 137 | if verbose: 138 | pb.bar(batch_i, log_to_message(log)) 139 | # Run metrics 140 | if metrics: 141 | y_train_pred = self.predict(X, batch_size) 142 | add_metrics_to_log(log, metrics, y, y_train_pred) 143 | if X_val is not None and y_val is not None: 144 | y_val_pred = self.predict(X_val, batch_size) 145 | val_loss = loss(Variable(y_val_pred, requires_grad=True), Variable(y_val, requires_grad=True)) 146 | log['val_loss'] = val_loss.data[0] 147 | if metrics: 148 | add_metrics_to_log(log, metrics, y_val, y_val_pred, 'val_') 149 | logs.append(log) 150 | if verbose: 151 | pb.close(log_to_message(log)) 152 | return logs 153 | 154 | def predict(self, X, batch_size=32): 155 | """Generates output predictions for the input samples. 156 | 157 | Computation is done in batches. 158 | 159 | # Arguments 160 | X: input data Tensor. 
161 | batch_size: integer. 162 | 163 | # Returns 164 | prediction Tensor. 165 | """ 166 | # Build DataLoader 167 | data = get_loader(X, batch_size=batch_size) 168 | # Batch prediction 169 | self.eval() 170 | r, n = 0, X.size()[0] 171 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 172 | for batch_data in data: 173 | # Predict on batch 174 | X_batch = Variable(batch_data[0].type('torch.FloatTensor').to(device), requires_grad=True).type('torch.FloatTensor').to(device) 175 | y_batch_pred = self(X_batch).data.type('torch.FloatTensor') 176 | # Infer prediction shape 177 | if r == 0: 178 | y_pred = (torch.zeros((n,) + y_batch_pred.size()[1:])).data.type('torch.FloatTensor') 179 | # Add to prediction tensor 180 | y_pred[r : min(n, r + batch_size)] = y_batch_pred 181 | r += batch_size 182 | return y_pred 183 | -------------------------------------------------------------------------------- /use_case/Noisy Label, Watermarking/models/resnet.py: -------------------------------------------------------------------------------- 1 | '''ResNet in PyTorch. 2 | 3 | BasicBlock and Bottleneck module is from the original ResNet paper: 4 | [1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun 5 | Deep Residual Learning for Image Recognition. arXiv:1512.03385 6 | 7 | PreActBlock and PreActBottleneck module is from the later paper: 8 | [2] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun 9 | Identity Mappings in Deep Residual Networks. arXiv:1603.05027 10 | ''' 11 | import torch 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | from pytorch_fitmodule import FitModule 15 | from torch.autograd import Variable 16 | import numpy as np 17 | 18 | 19 | def conv3x3(in_planes, out_planes, stride=1): 20 | return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=False) 21 | 22 | 23 | class BasicBlock(FitModule): 24 | expansion = 1 25 | 26 | def __init__(self, in_planes, planes, stride=1): 27 | super(BasicBlock, self).__init__() 28 | self.conv1 = conv3x3(in_planes, planes, stride) 29 | self.bn1 = nn.BatchNorm2d(planes) 30 | self.conv2 = conv3x3(planes, planes) 31 | self.bn2 = nn.BatchNorm2d(planes) 32 | 33 | self.shortcut = nn.Sequential() 34 | if stride != 1 or in_planes != self.expansion * planes: 35 | self.shortcut = nn.Sequential( 36 | nn.Conv2d(in_planes, self.expansion * planes, 37 | kernel_size=1, stride=stride, bias=False), 38 | nn.BatchNorm2d(self.expansion * planes) 39 | ) 40 | 41 | def forward(self, x): 42 | out = F.relu(self.bn1(self.conv1(x))) 43 | out = self.bn2(self.conv2(out)) 44 | out += self.shortcut(x) 45 | out = F.relu(out) 46 | return out 47 | 48 | 49 | class PreActBlock(FitModule): 50 | '''Pre-activation version of the BasicBlock.''' 51 | expansion = 1 52 | 53 | def __init__(self, in_planes, planes, stride=1): 54 | super(PreActBlock, self).__init__() 55 | self.bn1 = nn.BatchNorm2d(in_planes) 56 | self.conv1 = conv3x3(in_planes, planes, stride) 57 | self.bn2 = nn.BatchNorm2d(planes) 58 | self.conv2 = conv3x3(planes, planes) 59 | 60 | if stride != 1 or in_planes != self.expansion * planes: 61 | self.shortcut = nn.Sequential( 62 | nn.Conv2d(in_planes, self.expansion * planes, 63 | kernel_size=1, stride=stride, bias=False) 64 | ) 65 | 66 | def forward(self, x): 67 | out = F.relu(self.bn1(x)) 68 | shortcut = self.shortcut(out) if hasattr(self, 'shortcut') else x 69 | out = self.conv1(out) 70 | out = self.conv2(F.relu(self.bn2(out))) 71 | out += shortcut 72 | return out 73 | 74 | 75 | class Bottleneck(FitModule): 76 | expansion = 4 77 
| 78 | def __init__(self, in_planes, planes, stride=1): 79 | super(Bottleneck, self).__init__() 80 | self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False) 81 | self.bn1 = nn.BatchNorm2d(planes) 82 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) 83 | self.bn2 = nn.BatchNorm2d(planes) 84 | self.conv3 = nn.Conv2d(planes, self.expansion * planes, kernel_size=1, bias=False) 85 | self.bn3 = nn.BatchNorm2d(self.expansion * planes) 86 | 87 | self.shortcut = nn.Sequential() 88 | if stride != 1 or in_planes != self.expansion * planes: 89 | self.shortcut = nn.Sequential( 90 | nn.Conv2d(in_planes, self.expansion * planes, 91 | kernel_size=1, stride=stride, bias=False), 92 | nn.BatchNorm2d(self.expansion * planes) 93 | ) 94 | 95 | def forward(self, x): 96 | out = F.relu(self.bn1(self.conv1(x))) 97 | out = F.relu(self.bn2(self.conv2(out))) 98 | out = self.bn3(self.conv3(out)) 99 | out += self.shortcut(x) 100 | out = F.relu(out) 101 | return out 102 | 103 | 104 | class PreActBottleneck(FitModule): 105 | '''Pre-activation version of the original Bottleneck module.''' 106 | expansion = 4 107 | 108 | def __init__(self, in_planes, planes, stride=1): 109 | super(PreActBottleneck, self).__init__() 110 | self.bn1 = nn.BatchNorm2d(in_planes) 111 | self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False) 112 | self.bn2 = nn.BatchNorm2d(planes) 113 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) 114 | self.bn3 = nn.BatchNorm2d(planes) 115 | self.conv3 = nn.Conv2d(planes, self.expansion * planes, kernel_size=1, bias=False) 116 | 117 | if stride != 1 or in_planes != self.expansion * planes: 118 | self.shortcut = nn.Sequential( 119 | nn.Conv2d(in_planes, self.expansion * planes, 120 | kernel_size=1, stride=stride, bias=False) 121 | ) 122 | 123 | def forward(self, x): 124 | out = F.relu(self.bn1(x)) 125 | shortcut = self.shortcut(out) if hasattr(self, 'shortcut') else x 126 | out = self.conv1(out) 127 | out = self.conv2(F.relu(self.bn2(out))) 128 | out = self.conv3(F.relu(self.bn3(out))) 129 | out += shortcut 130 | return out 131 | 132 | 133 | class ResNet(FitModule): 134 | def __init__(self, block, num_blocks, num_classes=10): 135 | super(ResNet, self).__init__() 136 | self.in_planes = 64 137 | 138 | self.conv1 = conv3x3(3, 64) 139 | self.bn1 = nn.BatchNorm2d(64) 140 | self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1) 141 | self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2) 142 | self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2) 143 | self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2) 144 | self.linear = nn.Linear(512 * block.expansion, num_classes) 145 | 146 | def _make_layer(self, block, planes, num_blocks, stride): 147 | strides = [stride] + [1] * (num_blocks - 1) 148 | layers = [] 149 | for stride in strides: 150 | layers.append(block(self.in_planes, planes, stride)) 151 | self.in_planes = planes * block.expansion 152 | return nn.Sequential(*layers) 153 | 154 | def freeze_hidden_layers(self): 155 | self._freeze_layer(self.conv1) 156 | self._freeze_layer(self.bn1) 157 | self._freeze_layer(self.layer1) 158 | self._freeze_layer(self.layer2) 159 | self._freeze_layer(self.layer3) 160 | self._freeze_layer(self.layer4) 161 | 162 | def unfreeze_model(self): 163 | self._freeze_layer(self.conv1, freeze=False) 164 | self._freeze_layer(self.bn1, freeze=False) 165 | self._freeze_layer(self.layer1, freeze=False) 166 | 
self._freeze_layer(self.layer2, freeze=False) 167 | self._freeze_layer(self.layer3, freeze=False) 168 | self._freeze_layer(self.layer4, freeze=False) 169 | self._freeze_layer(self.linear, freeze=False) 170 | 171 | def embed_in_n_layer(self, n): 172 | self._freeze_layer(self.conv1) 173 | self._freeze_layer(self.bn1) 174 | if n == 1: 175 | self._freeze_layer(self.layer1) 176 | elif n == 2: 177 | self._freeze_layer(self.layer2) 178 | elif n == 3: 179 | self._freeze_layer(self.layer3) 180 | elif n == 4: 181 | self._freeze_layer(self.layer4) 182 | else: 183 | self._freeze_layer(self.linear) 184 | 185 | def _freeze_layer(self, layer, freeze=True): 186 | if freeze: 187 | for p in layer.parameters(): 188 | p.requires_grad = False 189 | else: 190 | for p in layer.parameters(): 191 | p.requires_grad = True 192 | 193 | def forward(self, x): 194 | x = x.float() 195 | out = F.relu(self.bn1(self.conv1(x).float()).float()) 196 | out = self.layer1(out) 197 | out = self.layer2(out) 198 | out = self.layer3(out) 199 | out = self.layer4(out) 200 | out = F.avg_pool2d(out, 4) 201 | out = out.view(out.size(0), -1) 202 | out = self.linear(out) 203 | return out 204 | 205 | def score(self, X, y): 206 | if isinstance(X, list): 207 | X = np.array(X) 208 | if isinstance(y, list): 209 | y = np.array(y) 210 | if isinstance(X, np.ndarray): 211 | X = torch.from_numpy(X) 212 | if isinstance(y, np.ndarray): 213 | y = torch.from_numpy(y) 214 | y_pred = self.predict(X) 215 | return np.mean(y.numpy() == np.argmax(y_pred.numpy(), axis=1)) 216 | 217 | 218 | def ResNet18(num_classes=10): 219 | return ResNet(PreActBlock, [2, 2, 2, 2], num_classes) 220 | 221 | 222 | def ResNet34(): 223 | return ResNet(BasicBlock, [3, 4, 6, 3]) 224 | 225 | 226 | def ResNet50(): 227 | return ResNet(Bottleneck, [3, 4, 6, 3]) 228 | 229 | 230 | def ResNet101(): 231 | return ResNet(Bottleneck, [3, 4, 23, 3]) 232 | 233 | 234 | def ResNet152(): 235 | return ResNet(Bottleneck, [3, 8, 36, 3]) 236 | 237 | 238 | def test(): 239 | net = ResNet18() 240 | y = net(Variable(torch.randn(1, 3, 32, 32), requires_grad=True)) 241 | print(y.size()) 242 | 243 | # test() 244 | -------------------------------------------------------------------------------- /use_case/Noisy Label, Watermarking/PlotPoisoning.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import os 5 | from sklearn.decomposition import PCA 6 | from sklearn.neighbors import KernelDensity 7 | from scipy.stats import multivariate_normal 8 | from sklearn.manifold import TSNE 9 | import seaborn as sns 10 | 11 | sns.set() 12 | 13 | watermarked = pickle.load(open("watermarked.pkl", "rb"), encoding = "iso-8859-1") 14 | # tmc_v = pickle.load(open('tmc.pkl', 'rb'), encoding = "iso-8859-1") 15 | # tmc_i = np.argsort(-tmc_v)[::-1] 16 | # cnt = 0 17 | # f = [] 18 | # total = 0 19 | # cnt = 0 20 | # for i in range(len(tmc_i)): 21 | # if watermarked[int(tmc_i[i])] == 1: 22 | # total += 1 23 | # for i in range(len(tmc_i)): 24 | # if watermarked[int(tmc_i[i])] == 1: 25 | # cnt += 1 26 | # f.append(1.0 * cnt / total) 27 | # x = np.array(range(1, len(tmc_i) + 1)) / len(tmc_i) * 100 28 | # x = np.append(x[0:-1:200], x[-1]) 29 | # f = np.append(f[0:-1:200], f[-1]) 30 | # plt.plot(x, np.array(f) * 100, 's-', color = 'blue', label = "TMC-Shapley") 31 | 32 | # g_v = pickle.load(open('g.pkl', 'rb'), encoding = "iso-8859-1") 33 | # g_i = np.argsort(-g_v)[::-1] 34 | # cnt = 0 35 | # f = [] 36 | # total = 0 37 | # cnt = 
0 38 | # for i in range(len(g_i)): 39 | # if watermarked[int(g_i[i])] == 1: 40 | # total += 1 41 | # for i in range(len(g_i)): 42 | # if watermarked[int(g_i[i])] == 1: 43 | # cnt += 1 44 | # f.append(1.0 * cnt / total) 45 | # x = np.array(range(1, len(g_i) + 1)) / len(g_i) * 100 46 | # x = np.append(x[0:-1:200], x[-1]) 47 | # f = np.append(f[0:-1:200], f[-1]) 48 | # plt.plot(x, np.array(f) * 100, 's-', color = 'orange', label = "G-Shapley", zorder=5) 49 | 50 | 51 | # loo_v = pickle.load(open('loo.pkl', 'rb'), encoding = "iso-8859-1")["loo"] 52 | # loo_i = np.argsort(-loo_v)[::-1] 53 | # cnt = 0 54 | # f = [] 55 | # total = 0 56 | # cnt = 0 57 | # for i in range(len(loo_i)): 58 | # if watermarked[int(loo_i[i])] == 1: 59 | # total += 1 60 | # for i in range(len(loo_i)): 61 | # if watermarked[int(loo_i[i])] == 1: 62 | # cnt += 1 63 | # f.append(1.0 * cnt / total) 64 | # x = np.array(range(1, len(loo_i) + 1)) / len(loo_i) * 100 65 | # x = np.append(x[0:-1:200], x[-1]) 66 | # f = np.append(f[0:-1:200], f[-1]) 67 | # plt.plot(x, np.array(f) * 100, '^-', color = 'olive', label = "Leave-One-Out", zorder=4, alpha=0.8) 68 | 69 | # for K in range(10, 11): 70 | # knn_v = pickle.load(open('knn_{}.pkl'.format(K), 'rb'), encoding = "iso-8859-1") 71 | # knn1_v = pickle.load(open('knn_layer1_{}.pkl'.format(K), 'rb'), encoding = "iso-8859-1") 72 | # knn2_v = pickle.load(open('knn_layer2_{}.pkl'.format(K), 'rb'), encoding = "iso-8859-1") 73 | # knn3_v = pickle.load(open('knn_layer3_{}.pkl'.format(K), 'rb'), encoding = "iso-8859-1") 74 | # knn_v = (knn1_v + knn2_v + knn3_v + knn_v) / 4 75 | # knn_i = np.argsort(-knn_v)[::-1] 76 | # cnt = 0 77 | # f = [] 78 | # total = 0 79 | # cnt = 0 80 | # for i in range(len(knn_i)): 81 | # if watermarked[int(knn_i[i])] == 1: 82 | # total += 1 83 | # for i in range(len(knn_i)): 84 | # if watermarked[int(knn_i[i])] == 1: 85 | # cnt += 1 86 | # f.append(1.0 * cnt / total) 87 | # x = np.array(range(1, len(knn_i) + 1)) / len(knn_i) * 100 88 | # plt.plot(x, np.array(f) * 100, color = 'violet', label = 'average-KNN-Shapley (k={})'.format(K)) 89 | 90 | # for K in range(10, 11): 91 | # knn_v = pickle.load(open('looknn_{}.pkl'.format(K), 'rb'), encoding = "iso-8859-1") 92 | # knn_i = np.argsort(-knn_v)[::-1] 93 | # cnt = 0 94 | # f = [] 95 | # total = 0 96 | # cnt = 0 97 | # a1 = [] 98 | # a0 = [] 99 | # for i in range(len(knn_i)): 100 | # if watermarked[int(knn_i[i])] == 1: 101 | # total += 1 102 | # for i in range(len(knn_i)): 103 | # if watermarked[int(knn_i[i])] == 1: 104 | # cnt += 1 105 | # f.append(1.0 * cnt / total) 106 | # x = np.array(range(1, len(knn_i) + 1)) / len(knn_i) * 100 107 | # x = np.append(x[0:-1:200], x[-1]) 108 | # f = np.append(f[0:-1:200], f[-1]) 109 | # plt.plot(x, np.array(f) * 100, 'o-', color='violet', label = 'KNN-LOO-Shapley'.format(K), zorder=6, alpha=0.8) 110 | 111 | 112 | for K in range(10, 11): 113 | knn_v = pickle.load(open('knn_{}.pkl'.format(K), 'rb'), encoding = "iso-8859-1") 114 | knn_v = np.mean(knn_v, axis=1) 115 | knn_i = np.argsort(-knn_v)[::-1] 116 | cnt = 0 117 | f = [] 118 | total = 0 119 | cnt = 0 120 | for i in range(len(knn_i)): 121 | if watermarked[int(knn_i[i])] == 1: 122 | total += 1 123 | for i in range(len(knn_i)): 124 | if watermarked[int(knn_i[i])] == 1: 125 | cnt += 1 126 | f.append(1.0 * cnt / total) 127 | x = np.array(range(1, len(knn_i) + 1)) / len(knn_i) * 100 128 | x = np.append(x[0:-1:200], x[-1]) 129 | f = np.append(f[0:-1:200], f[-1]) 130 | plt.plot(x, np.array(f) * 100, 'o-', color='purple', label = 
'KNN-Shapley'.format(K), linewidth=3) 131 | # for i in range(len(knn_i)): 132 | # if watermarked[int(knn_i[i])] == 1: 133 | # print(knn_v[knn_i[i]]) 134 | # a1.append(knn_v[knn_i[i]]) 135 | # else: 136 | # a0.append(knn_v[knn_i[i]]) 137 | # plt.hist(a0, bins=30, color='blue', histtype='stepfilled', label = 'benign data') 138 | # plt.hist(a1, bins=30, color='red', histtype='stepfilled', label = 'poisoned data') 139 | # 140 | for K in range(10, 11): 141 | knn_v = pickle.load(open('knn_{}.pkl'.format(K), 'rb'), encoding = "iso-8859-1") 142 | knn_v = np.max(knn_v, axis=1) 143 | knn_i = np.argsort(-knn_v)[::-1] 144 | cnt = 0 145 | f = [] 146 | total = 0 147 | cnt = 0 148 | for i in range(len(knn_i)): 149 | if watermarked[int(knn_i[i])] == 1: 150 | total += 1 151 | for i in range(len(knn_i)): 152 | if watermarked[int(knn_i[i])] == 1: 153 | cnt += 1 154 | f.append(1.0 * cnt / total) 155 | x = np.array(range(1, len(knn_i) + 1)) / len(knn_i) * 100 156 | x = np.append(x[0:-1:200], x[-1]) 157 | f = np.append(f[0:-1:200], f[-1]) 158 | plt.plot(x, np.array(f) * 100, 'o-', color='green', label = 'max-KNN-Shapley'.format(K), linewidth=3) 159 | # # a = [] 160 | # # plt.figure(figsize=(12, 6)) 161 | # # plt.subplot(121) 162 | # # for j in range(len(knn_v)): 163 | # # if not watermarked[j]: 164 | # # a.append(np.mean(knn_v[j])) 165 | # # plt.hist(a, bins=100, range=(0, 0.002), color='blue', histtype='stepfilled', label = 'benign (mean value)') 166 | # # a = [] 167 | # # plt.subplot(122) 168 | # # for j in range(len(knn_v)): 169 | # # if watermarked[j]: 170 | # # a.append(np.mean(knn_v[j])) 171 | # # plt.hist(a, bins=100, range=(0, 0.002), color='red', histtype='stepfilled', label = 'watermarked (mean value)') 172 | 173 | # knn_v = pickle.load(open('knn_{}.pkl'.format(K), 'rb'), encoding = "iso-8859-1") 174 | # knn_v = np.sort(knn_v, axis=1) 175 | # pca = PCA(n_components=2) 176 | # pca.fit(knn_v) 177 | # knn_v_pca = pca.fit_transform(knn_v) 178 | # # the bandwidth can be tunable 179 | # kde = KernelDensity(kernel='exponential', bandwidth=0.02).fit(knn_v_pca) 180 | # score = kde.score_samples(knn_v_pca) 181 | # knn_i = np.argsort(-score)[::-1] 182 | # cnt = 0 183 | # f = [] 184 | # total = 0 185 | # cnt = 0 186 | # for i in range(len(knn_i)): 187 | # if watermarked[int(knn_i[i])] == 1: 188 | # total += 1 189 | # for i in range(len(knn_i)): 190 | # if watermarked[int(knn_i[i])] == 1: 191 | # cnt += 1 192 | # f.append(1.0 * cnt / total) 193 | # x = np.array(range(1, len(knn_i) + 1)) / len(knn_i) * 100 194 | # plt.plot(x, np.array(f) * 100, color = 'blue', label = 'KDE-KNN-Shapley (k={})'.format(K)) 195 | 196 | 197 | # tsne = TSNE(n_components=2,perplexity=50) 198 | # knn_v_tsne = tsne.fit_transform(knn_v) 199 | # knn_mean = np.mean(knn_v_tsne, axis=0) 200 | # knn_cov = np.cov(knn_v_tsne, rowvar=0) 201 | # score = multivariate_normal.pdf(knn_v_tsne, mean=knn_mean, cov=knn_cov) 202 | # knn_i = np.argsort(-score)[::-1] 203 | # cnt = 0 204 | # f = [] 205 | # total = 0 206 | # cnt = 0 207 | # for i in range(len(knn_i)): 208 | # if watermarked[int(knn_i[i])] == 1: 209 | # total += 1 210 | # for i in range(len(knn_i)): 211 | # if watermarked[int(knn_i[i])] == 1: 212 | # cnt += 1 213 | # f.append(1.0 * cnt / total) 214 | # x = np.array(range(1, len(knn_i) + 1)) / len(knn_i) * 100 215 | # plt.plot(x, np.array(f) * 100, color = 'darkblue', label = 'TSNE-KNN-Shapley (k={})'.format(K)) 216 | 217 | # pca = PCA(n_components=2) 218 | # pca.fit(knn_v) 219 | # knn_v_pca = pca.fit_transform(knn_v) 220 | # knn_mean = 
np.mean(knn_v_pca, axis=0) 221 | # knn_cov = np.cov(knn_v_pca, rowvar=0) 222 | # score = multivariate_normal.pdf(knn_v_pca, mean=knn_mean, cov=knn_cov) 223 | # knn_i = np.argsort(-score)[::-1] 224 | # cnt = 0 225 | # f = [] 226 | # total = 0 227 | # cnt = 0 228 | # for i in range(len(knn_i)): 229 | # if watermarked[int(knn_i[i])] == 1: 230 | # total += 1 231 | # for i in range(len(knn_i)): 232 | # if watermarked[int(knn_i[i])] == 1: 233 | # cnt += 1 234 | # f.append(1.0 * cnt / total) 235 | # x = np.array(range(1, len(knn_i) + 1)) / len(knn_i) * 100 236 | # plt.plot(x, np.array(f) * 100, color = 'lightblue', label = 'Gaussian-KNN-Shapley (k={})'.format(K)) 237 | 238 | ran_v = np.random.rand(len(knn_v, )) 239 | ran_i = np.argsort(-ran_v)[::-1] 240 | cnt = 0 241 | f = [] 242 | total = 0 243 | cnt = 0 244 | for i in range(len(ran_i)): 245 | if watermarked[int(ran_i[i])] == 1: 246 | total += 1 247 | for i in range(len(ran_i)): 248 | if watermarked[int(ran_i[i])] == 1: 249 | cnt += 1 250 | f.append(1.0 * cnt / total) 251 | x = np.array(range(1, len(ran_i) + 1)) / len(ran_i) * 100 252 | f = x / 100 253 | plt.plot(x, np.array(f) * 100, '--', color='red', label = "Random", zorder=7) 254 | 255 | plt.xlabel('Fraction of data inspected (%)') 256 | plt.ylabel('Fraction of backdoor images detected (%)') 257 | plt.legend(loc='lower right') 258 | plt.show() -------------------------------------------------------------------------------- /reproduction/Cifar10/accuracy.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import bz2\n", 10 | "import numpy as np\n", 11 | "from tqdm import tqdm_notebook as tqdm\n", 12 | "import gzip\n", 13 | "from heapq import heappushpop\n", 14 | "from joblib import Parallel, delayed\n", 15 | "import time" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 2, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "data = np.load('CIFAR10_resnet50-keras_features.npz')" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 3, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "x_trn = np.vstack((data['features_training'], data['features_testing']))\n", 34 | "y_trn = np.hstack((data['labels_training'], data['labels_testing']))" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 4, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "from sklearn.utils import shuffle\n", 44 | "x_trn, y_trn = shuffle(x_trn, y_trn, random_state=0)" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 5, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "x_trn = np.reshape(x_trn, (-1, 2048))\n", 54 | "x_tst, y_tst = x_trn[:100], y_trn[:100]\n", 55 | "x_val, y_val = x_trn[100:1100], y_trn[100:1100]\n", 56 | "x_trn, y_trn = x_trn[1100:], y_trn[1100:]" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 7, 62 | "metadata": {}, 63 | "outputs": [ 64 | { 65 | "data": { 66 | "text/plain": [ 67 | "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n", 68 | " metric_params=None, n_jobs=None, n_neighbors=1, p=2,\n", 69 | " weights='uniform')" 70 | ] 71 | }, 72 | "execution_count": 7, 73 | "metadata": {}, 74 | "output_type": "execute_result" 75 | } 76 | ], 77 | "source": [ 78 | "from sklearn.neighbors import KNeighborsClassifier\n", 79 | "neigh = 
KNeighborsClassifier(n_neighbors=1)\n", 80 | "neigh.fit(x_trn, y_trn)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 8, 86 | "metadata": {}, 87 | "outputs": [ 88 | { 89 | "data": { 90 | "text/plain": [ 91 | "0.852" 92 | ] 93 | }, 94 | "execution_count": 8, 95 | "metadata": {}, 96 | "output_type": "execute_result" 97 | } 98 | ], 99 | "source": [ 100 | "neigh.score(x_val, y_val)" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 9, 106 | "metadata": {}, 107 | "outputs": [ 108 | { 109 | "data": { 110 | "text/plain": [ 111 | "0.81" 112 | ] 113 | }, 114 | "execution_count": 9, 115 | "metadata": {}, 116 | "output_type": "execute_result" 117 | } 118 | ], 119 | "source": [ 120 | "neigh.score(x_tst, y_tst)" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 6, 126 | "metadata": {}, 127 | "outputs": [ 128 | { 129 | "name": "stderr", 130 | "output_type": "stream", 131 | "text": [ 132 | "/home/wbx/tensorflow/lib/python3.5/site-packages/sklearn/linear_model/logistic.py:758: ConvergenceWarning: lbfgs failed to converge. Increase the number of iterations.\n", 133 | " \"of iterations.\", ConvergenceWarning)\n" 134 | ] 135 | } 136 | ], 137 | "source": [ 138 | "from sklearn.linear_model import LogisticRegression\n", 139 | "clf = LogisticRegression(random_state=0, solver='lbfgs',\n", 140 | " multi_class='multinomial').fit(x_trn, y_trn)" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 7, 146 | "metadata": {}, 147 | "outputs": [ 148 | { 149 | "data": { 150 | "text/plain": [ 151 | "0.91" 152 | ] 153 | }, 154 | "execution_count": 7, 155 | "metadata": {}, 156 | "output_type": "execute_result" 157 | } 158 | ], 159 | "source": [ 160 | "clf.score(x_val, y_val)" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 8, 166 | "metadata": {}, 167 | "outputs": [ 168 | { 169 | "data": { 170 | "text/plain": [ 171 | "0.87" 172 | ] 173 | }, 174 | "execution_count": 8, 175 | "metadata": {}, 176 | "output_type": "execute_result" 177 | } 178 | ], 179 | "source": [ 180 | "clf.score(x_tst, y_tst)" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 9, 186 | "metadata": {}, 187 | "outputs": [ 188 | { 189 | "data": { 190 | "text/plain": [ 191 | "0.9502886247877759" 192 | ] 193 | }, 194 | "execution_count": 9, 195 | "metadata": {}, 196 | "output_type": "execute_result" 197 | } 198 | ], 199 | "source": [ 200 | "clf.score(x_trn, y_trn)" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 7, 206 | "metadata": {}, 207 | "outputs": [ 208 | { 209 | "name": "stderr", 210 | "output_type": "stream", 211 | "text": [ 212 | "Using TensorFlow backend.\n" 213 | ] 214 | }, 215 | { 216 | "name": "stdout", 217 | "output_type": "stream", 218 | "text": [ 219 | "Epoch 1/5\n", 220 | "58900/58900 [==============================] - 11s 184us/step - loss: 0.4147 - acc: 0.8590\n", 221 | "Epoch 2/5\n", 222 | "58900/58900 [==============================] - 7s 113us/step - loss: 0.2936 - acc: 0.8993\n", 223 | "Epoch 3/5\n", 224 | "58900/58900 [==============================] - 7s 112us/step - loss: 0.2668 - acc: 0.9075\n", 225 | "Epoch 4/5\n", 226 | "58900/58900 [==============================] - 7s 114us/step - loss: 0.2493 - acc: 0.9139\n", 227 | "Epoch 5/5\n", 228 | "58900/58900 [==============================] - 7s 112us/step - loss: 0.2363 - acc: 0.9176\n" 229 | ] 230 | }, 231 | { 232 | "data": { 233 | "text/plain": [ 234 | "" 235 | ] 236 | }, 237 | "execution_count": 7, 238 | 
"metadata": {}, 239 | "output_type": "execute_result" 240 | } 241 | ], 242 | "source": [ 243 | "import os\n", 244 | "os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\" # see issue #152\n", 245 | "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"5\"\n", 246 | "\n", 247 | "from keras.models import Sequential\n", 248 | "from keras.layers import Dense, Activation\n", 249 | "\n", 250 | "model = Sequential()\n", 251 | "model.add(Dense(10, input_dim=2048))\n", 252 | "model.add(Activation('softmax'))\n", 253 | "\n", 254 | "model.compile(loss='sparse_categorical_crossentropy',\n", 255 | " optimizer='sgd',\n", 256 | " metrics=['accuracy'])\n", 257 | "\n", 258 | "model.fit(x_trn, y_trn, epochs=5, batch_size=32)" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": 8, 264 | "metadata": {}, 265 | "outputs": [ 266 | { 267 | "name": "stdout", 268 | "output_type": "stream", 269 | "text": [ 270 | "\r", 271 | "100/100 [==============================] - 0s 505us/step\n" 272 | ] 273 | }, 274 | { 275 | "data": { 276 | "text/plain": [ 277 | "[0.39001956582069397, 0.8700000047683716]" 278 | ] 279 | }, 280 | "execution_count": 8, 281 | "metadata": {}, 282 | "output_type": "execute_result" 283 | } 284 | ], 285 | "source": [ 286 | "model.evaluate(x_tst, y_tst, batch_size=128)" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": 9, 292 | "metadata": {}, 293 | "outputs": [ 294 | { 295 | "name": "stdout", 296 | "output_type": "stream", 297 | "text": [ 298 | "1000/1000 [==============================] - 0s 35us/step\n" 299 | ] 300 | }, 301 | { 302 | "data": { 303 | "text/plain": [ 304 | "[0.26140422391891477, 0.911]" 305 | ] 306 | }, 307 | "execution_count": 9, 308 | "metadata": {}, 309 | "output_type": "execute_result" 310 | } 311 | ], 312 | "source": [ 313 | "model.evaluate(x_val, y_val, batch_size=128)" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": 10, 319 | "metadata": {}, 320 | "outputs": [ 321 | { 322 | "name": "stdout", 323 | "output_type": "stream", 324 | "text": [ 325 | "Epoch 1/5\n", 326 | "58900/58900 [==============================] - 7s 115us/step - loss: 0.2279 - acc: 0.9210\n", 327 | "Epoch 2/5\n", 328 | "58900/58900 [==============================] - 6s 109us/step - loss: 0.2187 - acc: 0.9253\n", 329 | "Epoch 3/5\n", 330 | "58900/58900 [==============================] - 7s 121us/step - loss: 0.2126 - acc: 0.9269\n", 331 | "Epoch 4/5\n", 332 | "58900/58900 [==============================] - 7s 113us/step - loss: 0.2066 - acc: 0.9289\n", 333 | "Epoch 5/5\n", 334 | "58900/58900 [==============================] - 6s 110us/step - loss: 0.2015 - acc: 0.9309\n" 335 | ] 336 | }, 337 | { 338 | "data": { 339 | "text/plain": [ 340 | "" 341 | ] 342 | }, 343 | "execution_count": 10, 344 | "metadata": {}, 345 | "output_type": "execute_result" 346 | } 347 | ], 348 | "source": [ 349 | "model.fit(x_trn, y_trn, epochs=5, batch_size=32)" 350 | ] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "execution_count": 11, 355 | "metadata": {}, 356 | "outputs": [ 357 | { 358 | "name": "stdout", 359 | "output_type": "stream", 360 | "text": [ 361 | "\r", 362 | "100/100 [==============================] - 0s 47us/step\n" 363 | ] 364 | }, 365 | { 366 | "data": { 367 | "text/plain": [ 368 | "[0.36326342821121216, 0.8700000047683716]" 369 | ] 370 | }, 371 | "execution_count": 11, 372 | "metadata": {}, 373 | "output_type": "execute_result" 374 | } 375 | ], 376 | "source": [ 377 | "model.evaluate(x_tst, y_tst, batch_size=128)" 378 | ] 379 | }, 380 | { 381 | "cell_type": 
"code", 382 | "execution_count": 12, 383 | "metadata": {}, 384 | "outputs": [ 385 | { 386 | "name": "stdout", 387 | "output_type": "stream", 388 | "text": [ 389 | "1000/1000 [==============================] - 0s 30us/step\n" 390 | ] 391 | }, 392 | { 393 | "data": { 394 | "text/plain": [ 395 | "[0.2430015230178833, 0.9180000023841858]" 396 | ] 397 | }, 398 | "execution_count": 12, 399 | "metadata": {}, 400 | "output_type": "execute_result" 401 | } 402 | ], 403 | "source": [ 404 | "model.evaluate(x_val, y_val, batch_size=128)" 405 | ] 406 | }, 407 | { 408 | "cell_type": "code", 409 | "execution_count": 13, 410 | "metadata": {}, 411 | "outputs": [ 412 | { 413 | "name": "stdout", 414 | "output_type": "stream", 415 | "text": [ 416 | "Epoch 1/5\n", 417 | "58900/58900 [==============================] - 7s 117us/step - loss: 0.1968 - acc: 0.9324\n", 418 | "Epoch 2/5\n", 419 | "58900/58900 [==============================] - 7s 112us/step - loss: 0.1929 - acc: 0.9346\n", 420 | "Epoch 3/5\n", 421 | "58900/58900 [==============================] - 6s 96us/step - loss: 0.1887 - acc: 0.9349\n", 422 | "Epoch 4/5\n", 423 | "58900/58900 [==============================] - 6s 96us/step - loss: 0.1857 - acc: 0.9359\n", 424 | "Epoch 5/5\n", 425 | "58900/58900 [==============================] - 7s 113us/step - loss: 0.1823 - acc: 0.9382\n" 426 | ] 427 | }, 428 | { 429 | "data": { 430 | "text/plain": [ 431 | "" 432 | ] 433 | }, 434 | "execution_count": 13, 435 | "metadata": {}, 436 | "output_type": "execute_result" 437 | } 438 | ], 439 | "source": [ 440 | "model.fit(x_trn, y_trn, epochs=5, batch_size=32)" 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "execution_count": 14, 446 | "metadata": {}, 447 | "outputs": [ 448 | { 449 | "name": "stdout", 450 | "output_type": "stream", 451 | "text": [ 452 | "\r", 453 | "100/100 [==============================] - 0s 210us/step\n" 454 | ] 455 | }, 456 | { 457 | "data": { 458 | "text/plain": [ 459 | "[0.34444886445999146, 0.8600000143051147]" 460 | ] 461 | }, 462 | "execution_count": 14, 463 | "metadata": {}, 464 | "output_type": "execute_result" 465 | } 466 | ], 467 | "source": [ 468 | "model.evaluate(x_tst, y_tst, batch_size=128)" 469 | ] 470 | }, 471 | { 472 | "cell_type": "code", 473 | "execution_count": 16, 474 | "metadata": {}, 475 | "outputs": [ 476 | { 477 | "name": "stdout", 478 | "output_type": "stream", 479 | "text": [ 480 | "1000/1000 [==============================] - 0s 30us/step\n" 481 | ] 482 | }, 483 | { 484 | "data": { 485 | "text/plain": [ 486 | "[0.25385982036590576, 0.9090000023841858]" 487 | ] 488 | }, 489 | "execution_count": 16, 490 | "metadata": {}, 491 | "output_type": "execute_result" 492 | } 493 | ], 494 | "source": [ 495 | "model.evaluate(x_val, y_val, batch_size=128)" 496 | ] 497 | }, 498 | { 499 | "cell_type": "code", 500 | "execution_count": null, 501 | "metadata": {}, 502 | "outputs": [], 503 | "source": [] 504 | } 505 | ], 506 | "metadata": { 507 | "kernelspec": { 508 | "display_name": "Python 3", 509 | "language": "python", 510 | "name": "python3" 511 | }, 512 | "language_info": { 513 | "codemirror_mode": { 514 | "name": "ipython", 515 | "version": 3 516 | }, 517 | "file_extension": ".py", 518 | "mimetype": "text/x-python", 519 | "name": "python", 520 | "nbconvert_exporter": "python", 521 | "pygments_lexer": "ipython3", 522 | "version": "3.5.2" 523 | } 524 | }, 525 | "nbformat": 4, 526 | "nbformat_minor": 2 527 | } 528 | -------------------------------------------------------------------------------- /use_case/Noisy Label, 
Watermarking/shap_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import numpy as np 4 | from scipy.stats import logistic 5 | from scipy.stats import spearmanr 6 | from sklearn.naive_bayes import MultinomialNB 7 | from sklearn.linear_model import LogisticRegression 8 | from sklearn.linear_model import LinearRegression, Ridge 9 | from sklearn.metrics import r2_score 10 | from sklearn.neural_network import MLPRegressor, MLPClassifier 11 | from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier 12 | from sklearn.neighbors import KNeighborsClassifier 13 | from sklearn.tree import DecisionTreeClassifier 14 | from sklearn.naive_bayes import MultinomialNB, GaussianNB 15 | from sklearn.gaussian_process import GaussianProcessClassifier 16 | from sklearn.svm import SVC, LinearSVC 17 | from sklearn.base import clone 18 | import inspect 19 | from Shapley import ShapNN, CShapNN 20 | from multiprocessing import dummy as multiprocessing 21 | from sklearn.metrics import roc_auc_score, f1_score 22 | import warnings 23 | import tensorflow as tf 24 | import matplotlib.pyplot as plt 25 | import torch 26 | import torch.nn as nn 27 | import torch.nn.functional as F 28 | from torch.autograd import Variable 29 | from models import ResNet18 30 | 31 | def convergence_plots(marginals): 32 | 33 | plt.rcParams['figure.figsize'] = 15,15 34 | for i, idx in enumerate(np.arange(min(25, marginals.shape[-1]))): 35 | plt.subplot(5,5,i+1) 36 | plt.plot(np.cumsum(marginals[:, idx])/np.arange(1, len(marginals)+1)) 37 | 38 | 39 | def is_integer(array): 40 | return (np.equal(np.mod(array, 1), 0).mean()==1) 41 | 42 | 43 | def is_fitted(model): 44 | """Checks if model object has any attributes ending with an underscore""" 45 | return 0 < len( [k for k,v in inspect.getmembers(model) if k.endswith('_') and not k.startswith('__')] ) 46 | 47 | 48 | def return_model(mode, **kwargs): 49 | 50 | if mode=='logistic': 51 | solver = kwargs.get('solver', 'liblinear') 52 | n_jobs = kwargs.get('n_jobs', None) 53 | max_iter = kwargs.get('max_iter', 5000) 54 | model = LogisticRegression(solver=solver, n_jobs=n_jobs, 55 | max_iter=max_iter, random_state=666, 56 | multi_class='auto') 57 | elif mode=='Tree': 58 | model = DecisionTreeClassifier(random_state=666) 59 | elif mode=='RandomForest': 60 | n_estimators = kwargs.get('n_estimators', 50) 61 | model = RandomForestClassifier(n_estimators=n_estimators, random_state=666) 62 | elif mode=='GB': 63 | n_estimators = kwargs.get('n_estimators', 50) 64 | model = GradientBoostingClassifier(n_estimators=n_estimators, random_state=666) 65 | elif mode=='AdaBoost': 66 | n_estimators = kwargs.get('n_estimators', 50) 67 | model = AdaBoostClassifier(n_estimators=n_estimators, random_state=666) 68 | elif mode=='SVC': 69 | kernel = kwargs.get('kernel', 'rbf') 70 | model = SVC(kernel=kernel, random_state=666) 71 | elif mode=='LinearSVC': 72 | model = LinearSVC(loss='hinge', random_state=666) 73 | elif mode=='GP': 74 | model = GaussianProcessClassifier(random_state=666) 75 | elif mode=='KNN': 76 | n_neighbors = kwargs.get('n_neighbors', 5) 77 | model = KNeighborsClassifier(n_neighbors=n_neighbors) 78 | elif mode=='NB': 79 | model = MultinomialNB() 80 | elif mode=='linear': 81 | model = LinearRegression(random_state=666) 82 | elif mode=='ridge': 83 | alpha = kwargs.get('alpha', 1.0) 84 | model = Ridge(alpha=alpha, random_state=666) 85 | elif mode=='ResNet': 86 | model = 
ResNet18(num_classes=kwargs.get('num_classes', 10)) 87 | elif 'conv' in mode: 88 | tf.reset_default_graph() 89 | address = kwargs.get('address', 'weights/conv') 90 | hidden_units = kwargs.get('hidden_layer_sizes', [20]) 91 | activation = kwargs.get('activation', 'relu') 92 | weight_decay = kwargs.get('weight_decay', 1e-4) 93 | learning_rate = kwargs.get('learning_rate', 0.001) 94 | max_iter = kwargs.get('max_iter', 1000) 95 | early_stopping= kwargs.get('early_stopping', 10) 96 | warm_start = kwargs.get('warm_start', False) 97 | batch_size = kwargs.get('batch_size', 256) 98 | kernel_sizes = kwargs.get('kernel_sizes', [5]) 99 | strides = kwargs.get('strides', [5]) 100 | channels = kwargs.get('channels', [1]) 101 | validation_fraction = kwargs.get('validation_fraction', 0.) 102 | global_averaging = kwargs.get('global_averaging', 0.) 103 | optimizer = kwargs.get('optimizer', 'sgd') 104 | if mode=='conv': 105 | model = CShapNN(mode='classification', batch_size=batch_size, max_epochs=max_iter, 106 | learning_rate=learning_rate, 107 | weight_decay=weight_decay, validation_fraction=validation_fraction, 108 | early_stopping=early_stopping, 109 | optimizer=optimizer, warm_start=warm_start, address=address, 110 | hidden_units=hidden_units, 111 | strides=strides, global_averaging=global_averaging, 112 | kernel_sizes=kernel_sizes, channels=channels, random_seed=666) 113 | elif mode=='conv_reg': 114 | model = CShapNN(mode='regression', batch_size=batch_size, max_epochs=max_iter, 115 | learning_rate=learning_rate, 116 | weight_decay=weight_decay, validation_fraction=validation_fraction, 117 | early_stopping=early_stopping, 118 | optimizer=optimizer, warm_start=warm_start, address=address, 119 | hidden_units=hidden_units, 120 | strides=strides, global_averaging=global_averaging, 121 | kernel_sizes=kernel_sizes, channels=channels, random_seed=666) 122 | elif 'NN' in mode: 123 | solver = kwargs.get('solver', 'sgd') 124 | hidden_layer_sizes = kwargs.get('hidden_layer_sizes', (20,)) 125 | if isinstance(hidden_layer_sizes, list): 126 | hidden_layer_sizes = list(hidden_layer_sizes) 127 | activation = kwargs.get('activation', 'relu') 128 | learning_rate_init = kwargs.get('learning_rate', 0.001) 129 | max_iter = kwargs.get('max_iter', 5000) 130 | early_stopping= kwargs.get('early_stopping', False) 131 | warm_start = kwargs.get('warm_start', False) 132 | if mode=='NN': 133 | model = MLPClassifier(solver=solver, hidden_layer_sizes=hidden_layer_sizes, 134 | activation=activation, learning_rate_init=learning_rate_init, 135 | warm_start = warm_start, max_iter=max_iter, 136 | early_stopping=early_stopping) 137 | if mode=='NN_reg': 138 | model = MLPRegressor(solver=solver, hidden_layer_sizes=hidden_layer_sizes, 139 | activation=activation, learning_rate_init=learning_rate_init, 140 | warm_start = warm_start, max_iter=max_iter, early_stopping=early_stopping) 141 | else: 142 | raise ValueError("Invalid mode!") 143 | return model 144 | 145 | 146 | 147 | def generate_features(latent, dependency): 148 | 149 | features = [] 150 | n = latent.shape[0] 151 | exp = latent 152 | holder = latent 153 | for order in range(1,dependency+1): 154 | features.append(np.reshape(holder,[n,-1])) 155 | exp = np.expand_dims(exp,-1) 156 | holder = exp * np.expand_dims(holder,1) 157 | return np.concatenate(features,axis=-1) 158 | 159 | 160 | def label_generator(problem, X, param, difficulty=1, beta=None, important=None): 161 | 162 | if important is None or important > X.shape[-1]: 163 | important = X.shape[-1] 164 | dim_latent = sum([important**i 
for i in range(1, difficulty+1)]) 165 | if beta is None: 166 | beta = np.random.normal(size=[1, dim_latent]) 167 | important_dims = np.random.choice(X.shape[-1], important, replace=False) 168 | funct_init = lambda inp: np.sum(beta * generate_features(inp[:,important_dims], difficulty), -1) 169 | batch_size = max(100, min(len(X), 10000000//dim_latent)) 170 | y_true = np.zeros(len(X)) 171 | while True: 172 | try: 173 | for itr in range(int(np.ceil(len(X)/batch_size))): 174 | y_true[itr * batch_size: (itr+1) * batch_size] = funct_init( 175 | X[itr * batch_size: (itr+1) * batch_size]) 176 | break 177 | except MemoryError: 178 | batch_size = batch_size//2 179 | mean, std = np.mean(y_true), np.std(y_true) 180 | funct = lambda x: (np.sum(beta * generate_features( 181 | x[:, important_dims], difficulty), -1) - mean) / std 182 | y_true = (y_true - mean)/std 183 | if problem == 'classification': 184 | y_true = logistic.cdf(param * y_true) 185 | y = (np.random.random(X.shape[0]) < y_true).astype(int) 186 | elif problem == 'regression': 187 | y = y_true + param * np.random.normal(size=len(y_true)) 188 | else: 189 | raise ValueError('Invalid problem specified!') 190 | return beta, y, y_true, funct 191 | 192 | 193 | def one_iteration(clf, X, y, X_test, y_test, mean_score, tol=0.0, c=None, metric='accuracy'): 194 | """Runs one iteration of TMC-Shapley.""" 195 | 196 | if metric == 'auc': 197 | def score_func(clf, a, b): 198 | return roc_auc_score(b, clf.predict_proba(a)[:,1]) 199 | elif metric == 'accuracy': 200 | def score_func(clf, a, b): 201 | return clf.score(a, b) 202 | else: 203 | raise ValueError("Wrong metric!") 204 | if c is None: 205 | c = {i:np.array([i]) for i in range(len(X))} 206 | idxs, marginal_contribs = np.random.permutation(len(c.keys())), np.zeros(len(X)) 207 | new_score = np.max(np.bincount(y)) * 1./len(y) if np.mean(y//1 == y/1)==1 else 0.
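# new_score just above is the empty-coalition baseline of TMC-Shapley: when y holds integer class labels it is the majority-class accuracy, otherwise 0.
# The loop below grows the coalition along the random permutation idxs, refits clf after adding each (group of) point(s), records new_score - old_score as that point's marginal contribution, and truncates the pass once new_score is within a relative tolerance tol of mean_score.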
208 | start = 0 209 | if start: 210 | X_batch, y_batch =\ 211 | np.concatenate([X[c[idx]] for idx in idxs[:start]]), np.concatenate([y[c[idx]] for idx in idxs[:start]]) 212 | else: 213 | X_batch, y_batch = np.zeros((0,) + tuple(X.shape[1:])), np.zeros(0).astype(int) 214 | for n, idx in enumerate(idxs[start:]): 215 | try: 216 | clf = clone(clf) 217 | except: 218 | clf.fit(np.zeros((0,) + X.shape[1:]), y) 219 | old_score = new_score 220 | X_batch, y_batch = np.concatenate([X_batch, X[c[idx]]]), np.concatenate([y_batch, y[c[idx]]]) 221 | with warnings.catch_warnings(): 222 | warnings.simplefilter("ignore") 223 | try: 224 | clf.fit(X_batch, y_batch) 225 | temp_score = score_func(clf, X_test, y_test) 226 | if temp_score>-1 and temp_score<1.: #Removing measningless r2 scores 227 | new_score = temp_score 228 | except: 229 | continue 230 | marginal_contribs[c[idx]] = (new_score - old_score)/len(c[idx]) 231 | if np.abs(new_score - mean_score)/mean_score < tol: 232 | break 233 | return marginal_contribs, idxs 234 | 235 | 236 | def marginals(clf, X, y, X_test, y_test, c=None, tol=0., trials=3000, mean_score=None, metric='accuracy'): 237 | 238 | if metric == 'auc': 239 | def score_func(clf, a, b): 240 | return roc_auc_score(b, clf.predict_proba(a)[:,1]) 241 | elif metric == 'accuracy': 242 | def score_func(clf, a, b): 243 | return clf.score(a, b) 244 | else: 245 | raise ValueError("Wrong metric!") 246 | if mean_score is None: 247 | accs = [] 248 | for _ in range(100): 249 | bag_idxs = np.random.choice(len(y_test), len(y_test)) 250 | accs.append(score_func(clf, X_test[bag_idxs], y_test[bag_idxs])) 251 | mean_score = np.mean(accs) 252 | marginals, idxs = [], [] 253 | for trial in range(trials): 254 | if 10*(trial+1)/trials % 1 == 0: 255 | print('{} out of {}'.format(trial + 1, trials)) 256 | marginal, idx = one_iteration(clf, X, y, X_test, y_test, mean_score, tol=tol, c=c, metric=metric) 257 | marginals.append(marginal) 258 | idxs.append(idx) 259 | return np.array(marginals), np.array(idxs) 260 | 261 | def shapley(mode, X, y, X_test, y_test, stop=None, tol=0., trials=3000, **kwargs): 262 | 263 | try: 264 | vals = np.zeros(len(X)) 265 | example_idxs = np.random.choice(len(X), min(25, len(X)), replace=False) 266 | example_marginals = np.zeros((trials, len(example_idxs))) 267 | for i in range(trials): 268 | print(i) 269 | output = one_pass(mode, X, y, X_test, y_test, tol=tol, stop=stop, **kwargs) 270 | example_marginals[i] = output[0][example_idxs] 271 | vals = vals/(i+1) + output[0]/(i+1) 272 | return vals, example_marginals 273 | except KeyboardInterrupt: 274 | print('Interrupted!') 275 | return vals, example_marginals 276 | 277 | def early_stopping(marginals, idxs, stopping): 278 | 279 | stopped_marginals = np.zeros_like(marginals) 280 | for i in range(len(marginals)): 281 | stopped_marginals[i][idxs[i][:stopping]] = marginals[i][idxs[i][:stopping]] 282 | return np.mean(stopped_marginals, 0) 283 | 284 | def error(mem): 285 | 286 | if len(mem) < 100: 287 | return 1.0 288 | all_vals = (np.cumsum(mem, 0)/np.reshape(np.arange(1, len(mem)+1), (-1,1)))[-100:] 289 | errors = np.mean(np.abs(all_vals[-100:] - all_vals[-1:])/(np.abs(all_vals[-1:]) + 1e-12), -1) 290 | return np.max(errors) 291 | 292 | def my_accuracy_score(clf, X, y): 293 | 294 | probs = clf.predict_proba(X) 295 | predictions = np.argmax(probs, -1) 296 | return np.mean(np.equal(predictions, y)) 297 | 298 | def my_f1_score(clf, X, y): 299 | 300 | predictions = clf.predict(x) 301 | if len(set(y)) == 2: 302 | return f1_score(y, predictions) 303 | 
return f1_score(y, predictions, average='macro') 304 | 305 | def my_auc_score(clf, X, y): 306 | 307 | probs = clf.predict_proba(X) 308 | true_probs = probs[np.arange(len(y)), y] 309 | return roc_auc_score(y, true_probs) 310 | 311 | def my_xe_score(clf, X, y): 312 | 313 | probs = clf.predict_proba(X) 314 | true_probs = probs[np.arange(len(y)), y] 315 | true_log_probs = np.log(np.clip(true_probs, 1e-12, None)) 316 | return np.mean(true_log_probs) 317 | -------------------------------------------------------------------------------- /use_case/DataAcquisition/shap_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import numpy as np 4 | from scipy.stats import logistic 5 | from scipy.stats import spearmanr 6 | from sklearn.naive_bayes import MultinomialNB 7 | from sklearn.linear_model import LogisticRegression 8 | from sklearn.linear_model import LinearRegression, Ridge 9 | from sklearn.metrics import r2_score 10 | from sklearn.neural_network import MLPRegressor, MLPClassifier 11 | from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier 12 | from sklearn.ensemble import RandomForestRegressor 13 | from sklearn.neighbors import KNeighborsClassifier 14 | from sklearn.tree import DecisionTreeClassifier 15 | from sklearn.naive_bayes import MultinomialNB, GaussianNB 16 | from sklearn.gaussian_process import GaussianProcessClassifier 17 | from sklearn.svm import SVC, LinearSVC 18 | from sklearn import svm 19 | from sklearn.base import clone 20 | from Shapley import ShapNN, CShapNN 21 | from multiprocessing import dummy as multiprocessing 22 | from sklearn.metrics import roc_auc_score, f1_score 23 | import warnings 24 | import tensorflow as tf 25 | import matplotlib.pyplot as plt 26 | 27 | def convergence_plots(marginals): 28 | 29 | plt.figure(figsize=(20,20)) 30 | # plt.rcParams['figure.figsize'] = 15,15 31 | for i, idx in enumerate(np.arange(min(25, marginals.shape[-1]))): 32 | plt.subplot(5,5,i+1) 33 | plt.plot(np.cumsum(marginals[:, idx])/np.arange(1, len(marginals)+1)) 34 | plt.savefig('temp.png') 35 | 36 | def is_integer(array): 37 | return (np.equal(np.mod(array, 1), 0).mean()==1) 38 | 39 | 40 | def is_fitted(model): 41 | """Checks if model object has any attributes ending with an underscore""" 42 | return 0 < len( [k for k,v in inspect.getmembers(model) if k.endswith('_') and not k.startswith('__')] ) 43 | 44 | 45 | def return_model(mode, **kwargs): 46 | if mode=='logistic': 47 | solver = kwargs.get('solver', 'liblinear') 48 | n_jobs = kwargs.get('n_jobs', None) 49 | max_iter = kwargs.get('max_iter', 5000) 50 | model = LogisticRegression(solver=solver, n_jobs=n_jobs, 51 | max_iter=max_iter, random_state=666, multi_class='auto') 52 | elif mode=='Tree': 53 | model = DecisionTreeClassifier(random_state=666) 54 | elif mode=='RandomForest': 55 | n_estimators = kwargs.get('n_estimators', 25) 56 | model = RandomForestClassifier(n_estimators=n_estimators, random_state=666) 57 | elif mode=='RandomForestReg': 58 | n_estimators = kwargs.get('n_estimators', 50) 59 | # model = RandomForestRegressor(max_depth=4, n_estimators=n_estimators, random_state=666) 60 | # n_estimators = kwargs.get('n_estimators', 50) 61 | model = RandomForestRegressor(max_depth=100, n_estimators=n_estimators, random_state=666)#, min_samples_split = 0.05, min_samples_leaf = 0.001) 62 | # model = RandomForestClassifier(n_estimators=100, criterion = 'gini', max_features = None, min_samples_split = 0.05, min_samples_leaf 
= 0.001) 63 | elif mode == 'mlpreg': 64 | model = MLPRegressor(hidden_layer_sizes=(10,100), activation='relu', solver='adam', alpha=0.001,batch_size='auto', 65 | learning_rate='constant', learning_rate_init=0.01, power_t=0.5, max_iter=1000, shuffle=True, 66 | random_state=None, tol=0.0001, verbose=False, warm_start=False, momentum=0.9, 67 | nesterovs_momentum=True, early_stopping=False, validation_fraction=0.1, beta_1=0.9, beta_2=0.999, 68 | epsilon=1e-08) 69 | 70 | elif mode=='GB': 71 | n_estimators = kwargs.get('n_estimators', 50) 72 | model = GradientBoostingClassifier(n_estimators=n_estimators, random_state=666) 73 | elif mode=='AdaBoost': 74 | n_estimators = kwargs.get('n_estimators', 50) 75 | model = AdaBoostClassifier(n_estimators=n_estimators, random_state=666) 76 | elif mode=='SVC': 77 | kernel = kwargs.get('kernel', 'rbf') 78 | model = SVC(kernel=kernel, random_state=666) 79 | elif mode=='LinearSVC': 80 | model = LinearSVC(loss='hinge', random_state=666) 81 | elif mode=='GP': 82 | model = GaussianProcessClassifier(random_state=666) 83 | elif mode=='KNN': 84 | n_neighbors = kwargs.get('n_neighbors', 5) 85 | model = KNeighborsClassifier(n_neighbors=n_neighbors) 86 | elif mode=='NB': 87 | model = MultinomialNB() 88 | elif mode=='linear': 89 | model = LinearRegression(random_state=666) 90 | elif mode=='ridge': 91 | alpha = kwargs.get('alpha', 1.0) 92 | model = Ridge(alpha=alpha, random_state=666) 93 | elif mode=='uci': 94 | model = MLPClassifier(activation = 'logistic', solver='lbfgs', 95 | alpha=1e-4, hidden_layer_sizes=(6, 100), early_stopping=False, 96 | max_iter= 5000, 97 | random_state=666, warm_start = False) 98 | elif 'conv' in mode: 99 | tf.reset_default_graph() 100 | address = kwargs.get('address', 'weights/conv') 101 | hidden_units = kwargs.get('hidden_layer_sizes', [20]) 102 | activation = kwargs.get('activation', 'relu') 103 | weight_decay = kwargs.get('weight_decay', 1e-4) 104 | learning_rate = kwargs.get('learning_rate', 0.001) 105 | max_iter = kwargs.get('max_iter', 1000) 106 | early_stopping= kwargs.get('early_stopping', 10) 107 | warm_start = kwargs.get('warm_start', False) 108 | batch_size = kwargs.get('batch_size', 256) 109 | kernel_sizes = kwargs.get('kernel_sizes', [5]) 110 | strides = kwargs.get('strides', [5]) 111 | channels = kwargs.get('channels', [1]) 112 | validation_fraction = kwargs.get('validation_fraction', 0.) 113 | global_averaging = kwargs.get('global_averaging', 0.) 
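# The CNN hyperparameters collected above (plus the optimizer read just below) are forwarded unchanged to CShapNN: mode 'conv' builds a classification network, 'conv_reg' a regression one.
# Illustrative call only (argument values are not taken from this repo): return_model('conv', channels=[32], kernel_sizes=[5], batch_size=128)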
114 | optimizer = kwargs.get('optimizer', 'sgd') 115 | if mode=='conv': 116 | model = CShapNN(mode='classification', batch_size=batch_size, max_epochs=max_iter, 117 | learning_rate=learning_rate, 118 | weight_decay=weight_decay, validation_fraction=validation_fraction, 119 | early_stopping=early_stopping, 120 | optimizer=optimizer, warm_start=warm_start, address=address, 121 | hidden_units=hidden_units, 122 | strides=strides, global_averaging=global_averaging, 123 | kernel_sizes=kernel_sizes, channels=channels, random_seed=666) 124 | elif mode=='conv_reg': 125 | model = CShapNN(mode='regression', batch_size=batch_size, max_epochs=max_iter, 126 | learning_rate=learning_rate, 127 | weight_decay=weight_decay, validation_fraction=validation_fraction, 128 | early_stopping=early_stopping, 129 | optimizer=optimizer, warm_start=warm_start, address=address, 130 | hidden_units=hidden_units, 131 | strides=strides, global_averaging=global_averaging, 132 | kernel_sizes=kernel_sizes, channels=channels, random_seed=666) 133 | elif 'NN' in mode: 134 | solver = kwargs.get('solver', 'adam') 135 | hidden_layer_sizes = kwargs.get('hidden_layer_sizes', (20,)) 136 | if isinstance(hidden_layer_sizes, list): 137 | hidden_layer_sizes = list(hidden_layer_sizes) 138 | activation = kwargs.get('activation', 'relu') 139 | learning_rate_init = kwargs.get('learning_rate', 0.001) 140 | max_iter = kwargs.get('max_iter', 5000) 141 | early_stopping= kwargs.get('early_stopping', False) 142 | warm_start = kwargs.get('warm_start', False) 143 | if mode=='NN': 144 | model = MLPClassifier(solver=solver, hidden_layer_sizes=hidden_layer_sizes, 145 | activation=activation, learning_rate_init=learning_rate_init, 146 | warm_start = warm_start, max_iter=max_iter, verbose=False, 147 | early_stopping=early_stopping) 148 | if mode=='NN_reg': 149 | model = MLPRegressor(solver=solver, hidden_layer_sizes=hidden_layer_sizes, 150 | activation=activation, learning_rate_init=learning_rate_init, 151 | warm_start = warm_start, max_iter=max_iter, early_stopping=early_stopping) 152 | else: 153 | raise ValueError("Invalid mode!") 154 | return model 155 | 156 | 157 | 158 | def generate_features(latent, dependency): 159 | 160 | features = [] 161 | n = latent.shape[0] 162 | exp = latent 163 | holder = latent 164 | for order in range(1,dependency+1): 165 | features.append(np.reshape(holder,[n,-1])) 166 | exp = np.expand_dims(exp,-1) 167 | holder = exp * np.expand_dims(holder,1) 168 | return np.concatenate(features,axis=-1) 169 | 170 | 171 | def label_generator(problem, X, param, difficulty=1, beta=None, important=None): 172 | 173 | if important is None or important > X.shape[-1]: 174 | important = X.shape[-1] 175 | dim_latent = sum([important**i for i in range(1, difficulty+1)]) 176 | if beta is None: 177 | beta = np.random.normal(size=[1, dim_latent]) 178 | important_dims = np.random.choice(X.shape[-1], important, replace=False) 179 | funct_init = lambda inp: np.sum(beta * generate_features(inp[:,important_dims], difficulty), -1) 180 | batch_size = max(100, min(len(X), 10000000//dim_latent)) 181 | y_true = np.zeros(len(X)) 182 | while True: 183 | try: 184 | for itr in range(int(np.ceil(len(X)/batch_size))): 185 | y_true[itr * batch_size: (itr+1) * batch_size] = funct_init( 186 | X[itr * batch_size: (itr+1) * batch_size]) 187 | break 188 | except MemoryError: 189 | batch_size = batch_size//2 190 | # print(y_true[:10]) 191 | mean, std = np.mean(y_true), np.std(y_true) 192 | funct = lambda x: (np.sum(beta * generate_features( 193 | x[:, important_dims], 
difficulty), -1) - mean) / std 194 | y_true = (y_true - mean)/std 195 | if problem is 'classification': 196 | y_true = logistic.cdf(param * y_true) 197 | y = (np.random.random(X.shape[0]) < y_true).astype(int) 198 | elif problem is 'regression': 199 | y = y_true + param * np.random.normal(size=len(y_true)) 200 | else: 201 | raise ValueError('Invalid problem specified!') 202 | # print("beta\ty\t\t_true\tfunct") 203 | # print(beta,y[:10],y_true[:10],funct) 204 | return beta, y, y_true, funct 205 | 206 | 207 | def one_iteration(clf, X, y, X_test, y_test, mean_score, tol=0.0, c=None, metric='accuracy'): 208 | """Runs one iteration of TMC-Shapley.""" 209 | 210 | if metric == 'auc': 211 | def score_func(clf, a, b): 212 | return roc_auc_score(b, clf.predict_proba(a)[:,1]) 213 | elif metric == 'accuracy': 214 | def score_func(clf, a, b): 215 | return clf.score(a, b) 216 | else: 217 | raise ValueError("Wrong metric!") 218 | if c is None: 219 | c = {i:np.array([i]) for i in range(len(X))} 220 | idxs, marginal_contribs = np.random.permutation(len(c.keys())), np.zeros(len(X)) 221 | new_score = np.max(np.bincount(y)) * 1./len(y) if np.mean(y//1 == y/1)==1 else 0. 222 | start = 0 223 | if start: 224 | X_batch, y_batch =\ 225 | np.concatenate([X[c[idx]] for idx in idxs[:start]]), np.concatenate([y[c[idx]] for idx in idxs[:start]]) 226 | else: 227 | X_batch, y_batch = np.zeros((0,) + tuple(X.shape[1:])), np.zeros(0).astype(int) 228 | for n, idx in enumerate(idxs[start:]): 229 | try: 230 | clf = clone(clf) 231 | except: 232 | clf.fit(np.zeros((0,) + X.shape[1:]), y) 233 | old_score = new_score 234 | X_batch, y_batch = np.concatenate([X_batch, X[c[idx]]]), np.concatenate([y_batch, y[c[idx]]]) 235 | with warnings.catch_warnings(): 236 | warnings.simplefilter("ignore") 237 | try: 238 | clf.fit(X_batch, y_batch) 239 | temp_score = score_func(clf, X_test, y_test) 240 | if temp_score>-1 and temp_score<1.: #Removing measningless r2 scores 241 | new_score = temp_score 242 | except: 243 | continue 244 | marginal_contribs[c[idx]] = (new_score - old_score)/len(c[idx]) 245 | if np.abs(new_score - mean_score)/mean_score < tol: 246 | break 247 | return marginal_contribs, idxs 248 | 249 | 250 | def marginals(clf, X, y, X_test, y_test, c=None, tol=0., trials=3000, mean_score=None, metric='accuracy'): 251 | 252 | if metric == 'auc': 253 | def score_func(clf, a, b): 254 | return roc_auc_score(b, clf.predict_proba(a)[:,1]) 255 | elif metric == 'accuracy': 256 | def score_func(clf, a, b): 257 | return clf.score(a, b) 258 | else: 259 | raise ValueError("Wrong metric!") 260 | if mean_score is None: 261 | accs = [] 262 | for _ in range(100): 263 | bag_idxs = np.random.choice(len(y_test), len(y_test)) 264 | accs.append(score_func(clf, X_test[bag_idxs], y_test[bag_idxs])) 265 | mean_score = np.mean(accs) 266 | marginals, idxs = [], [] 267 | for trial in range(trials): 268 | if 10*(trial+1)/trials % 1 == 0: 269 | print('{} out of {}'.format(trial + 1, trials)) 270 | marginal, idx = one_iteration(clf, X, y, X_test, y_test, mean_score, tol=tol, c=c, metric=metric) 271 | marginals.append(marginal) 272 | idxs.append(idx) 273 | return np.array(marginals), np.array(idxs) 274 | 275 | def shapley(mode, X, y, X_test, y_test, stop=None, tol=0., trials=3000, **kwargs): 276 | 277 | try: 278 | vals = np.zeros(len(X)) 279 | example_idxs = np.random.choice(len(X), min(25, len(X)), replace=False) 280 | example_marginals = np.zeros((trials, len(example_idxs))) 281 | for i in range(trials): 282 | print(i) 283 | output = one_pass(mode, X, y, X_test, 
y_test, tol=tol, stop=stop, **kwargs) 284 | example_marginals[i] = output[0][example_idxs] 285 | vals = vals * i/(i+1) + output[0]/(i+1) # running average of the marginal contributions 286 | return vals, example_marginals 287 | except KeyboardInterrupt: 288 | print('Interrupted!') 289 | return vals, example_marginals 290 | 291 | def early_stopping(marginals, idxs, stopping): 292 | 293 | stopped_marginals = np.zeros_like(marginals) 294 | for i in range(len(marginals)): 295 | stopped_marginals[i][idxs[i][:stopping]] = marginals[i][idxs[i][:stopping]] 296 | return np.mean(stopped_marginals, 0) 297 | 298 | def error(mem): 299 | 300 | if len(mem) < 10: # too few iterations so far to estimate convergence 301 | return 1.0 302 | all_vals = (np.cumsum(mem, 0)/np.reshape(np.arange(1, len(mem)+1), (-1,1)))[-100:] 303 | errors = np.mean(np.abs(all_vals[-100:] - all_vals[-1:])/(np.abs(all_vals[-1:]) + 1e-12), -1) # relative change of the running Shapley estimates over the last 100 iterations; small values indicate convergence 304 | return np.max(errors) 305 | 306 | def my_accuracy_score(clf, X, y): 307 | 308 | probs = clf.predict_proba(X) 309 | predictions = np.argmax(probs, -1) 310 | return np.mean(np.equal(predictions, y)) 311 | 312 | def my_f1_score(clf, X, y): 313 | 314 | predictions = clf.predict(X) 315 | if len(set(y)) == 2: 316 | return f1_score(y, predictions) 317 | return f1_score(y, predictions, average='macro') 318 | 319 | def my_auc_score(clf, X, y): 320 | 321 | probs = clf.predict_proba(X) 322 | true_probs = probs[np.arange(len(y)), y] 323 | return roc_auc_score(y, true_probs) 324 | 325 | def my_xe_score(clf, X, y): 326 | 327 | probs = clf.predict_proba(X) 328 | true_probs = probs[np.arange(len(y)), y] 329 | true_log_probs = np.log(np.clip(true_probs, 1e-12, None)) 330 | return np.mean(true_log_probs) 331 | -------------------------------------------------------------------------------- /reproduction/YFCC100M/testlsh.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import bz2\n", 10 | "import numpy as np\n", 11 | "from tqdm import tqdm_notebook as tqdm\n", 12 | "import gzip\n", 13 | "from heapq import heappushpop\n", 14 | "from joblib import Parallel, delayed\n", 15 | "import time" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 2, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import joblib\n", 25 | "x_trn_hash = joblib.load('10M/eps0.1/x_trn_hash.pkl')\n", 26 | "w = joblib.load('10M/eps0.1/w.pkl')\n", 27 | "b = joblib.load('10M/eps0.1/b.pkl')" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 3, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "y_trn = joblib.load('y_trn.pkl')" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 5, 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "name": "stdout", 46 | "output_type": "stream", 47 | "text": [ 48 | "346.42120146751404\n", 49 | "336.7830514907837\n", 50 | "335.7016701698303\n", 51 | "307.22366166114807\n", 52 | "290.6147196292877\n", 53 | "315.5828514099121\n", 54 | "344.08966970443726\n", 55 | "337.0721056461334\n", 56 | "360.9251072406769\n", 57 | "421.92287850379944\n" 58 | ] 59 | } 60 | ], 61 | "source": [ 62 | "dataset_ids = []\n", 63 | "dataset_vals = []\n", 64 | "for data_id in range(10):\n", 65 | " st = time.time()\n", 66 | " dataset_val = np.load('x_trn_' + str(data_id) + '.npy')\n", 67 | " dataset_vals.append(dataset_val)\n",
68 | " print(time.time() - st)" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 6, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "class X:\n", 78 | " def __init__(self, data, offset):\n", 79 | " self.data = data\n", 80 | " self.offset = offset\n", 81 | " \n", 82 | " def __getitem__(self, key):\n", 83 | " index1 = (key + self.offset) // 1000000\n", 84 | " index2 = (key + self.offset) % 1000000\n", 85 | " return self.data[index1][index2]\n", 86 | " \n", 87 | " def __len__(self):\n", 88 | " l = 0\n", 89 | " for x in self.data:\n", 90 | " l += len(x)\n", 91 | " return l - self.offset" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 8, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "x_trn = X(dataset_vals, 1100)" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 11, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "y_tst = np.load('y_tst.npy')\n", 110 | "y_val = np.load('y_val.npy')\n", 111 | "x_tst = np.load('x_tst.npy')\n", 112 | "x_val = np.load('x_val.npy')" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 12, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "x_val2 = x_val[:100]\n", 122 | "y_val2 = y_val[:100]" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 13, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "def val_error(K, sp_gt):\n", 132 | " K_star = 10\n", 133 | " start = time.time()\n", 134 | " x_val_knn_approx, nns_vec = lsh.get_approx_KNN(x_val2, K_star)\n", 135 | " runtime_query = time.time() - start\n", 136 | " print(runtime_query)\n", 137 | " \n", 138 | " start = time.time()\n", 139 | " sp_approx = lsh.compute_approx_shapley(x_val_knn_approx, y_val2, K)\n", 140 | " runtime_approx_value = time.time() - start\n", 141 | " print('it takes %s to get appox knn value' % runtime_approx_value)\n", 142 | " \n", 143 | " sp_err_inf_val= np.linalg.norm(sp_gt - sp_approx,ord=np.inf, axis=1)\n", 144 | " print('max error %s'% np.percentile(sp_err_inf_val,90))\n", 145 | " return sp_approx" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 57, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "def test_error(K, sp_gt):\n", 155 | " K_star = 10\n", 156 | " start = time.time()\n", 157 | " x_tst_knn_approx, nns_vec = lsh.get_approx_KNN(x_tst, K_star)\n", 158 | " runtime_query = time.time() - start\n", 159 | " print(runtime_query)\n", 160 | " \n", 161 | " start = time.time()\n", 162 | " sp_approx = lsh.compute_approx_shapley(x_tst_knn_approx, y_tst, K)\n", 163 | " runtime_approx_value = time.time() - start\n", 164 | " print('it takes %s to get appox knn value' % runtime_approx_value)\n", 165 | " \n", 166 | " sp_err_inf_val= np.linalg.norm(sp_gt - sp_approx,ord=np.inf, axis=1)\n", 167 | " print('max error %s'% np.percentile(sp_err_inf_val,90))\n", 168 | " return sp_approx" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 15, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "dist_rand = np.load('10M/eps0.1/dist_rand.npy')\n", 178 | "dist_rand = np.mean(dist_rand, axis=0)" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 16, 184 | "metadata": {}, 185 | "outputs": [], 186 | "source": [ 187 | "sp_gt2 = np.load('10M/eps0.1/sp_gt2.npy')" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 56, 193 | "metadata": {}, 194 | "outputs": [], 195 | "source": 
[ 196 | "def equal(a, b):\n", 197 | " try:\n", 198 | " return not set.isdisjoint(a, b)\n", 199 | " except KeyError:\n", 200 | " return 0" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 52, 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "import numpy as np\n", 210 | "import pdb\n", 211 | "\n", 212 | "\n", 213 | "def lsh_function(t,x,w,b):\n", 214 | " # x is 1-d array\n", 215 | " h = np.floor((np.dot(w,x)+b)/t).astype(int)\n", 216 | " return h\n", 217 | "\n", 218 | "\n", 219 | "class LSH:\n", 220 | " def __init__(self,n_hash_bit,n_hash_table,x_trn,y_trn,t=0.1):\n", 221 | " self.n_hash_bit = n_hash_bit\n", 222 | " self.n_hash_table = n_hash_table\n", 223 | " self.t = t # width of projections\n", 224 | " self.x_trn = x_trn\n", 225 | " self.y_trn = y_trn\n", 226 | " self.N = len(x_trn)\n", 227 | " self.dim = 4096\n", 228 | " # draw w from a normal distribution (2-stable)\n", 229 | " self.w = np.random.normal(0, 1, (n_hash_table, n_hash_bit, self.dim))\n", 230 | " # draw b from U[0,t]\n", 231 | " self.b = np.random.uniform(0, self.t, (n_hash_table, n_hash_bit))\n", 232 | " self.x_trn_hash = [dict() for i in range(n_hash_table)]\n", 233 | "# for i in tqdm(range(self.N)):\n", 234 | "# hash_code_all = lsh_function(self.t, x_trn[i] / dist_rand, self.w, self.b)\n", 235 | "# for l in range(n_hash_table):\n", 236 | "# hash_code_trn = '.'.join(map(str, hash_code_all[l, :]))\n", 237 | "# if hash_code_trn in self.x_trn_hash[l].keys():\n", 238 | "# self.x_trn_hash[l][hash_code_trn].append(i)\n", 239 | "# else:\n", 240 | "# self.x_trn_hash[l][hash_code_trn] = [i]\n", 241 | "# if i % 1000 == 0:\n", 242 | "# print('build hash %s'%i)\n", 243 | "\n", 244 | " def get_approx_KNN(self,x_tst,K):\n", 245 | " N_tst = x_tst.shape[0]\n", 246 | " x_tst_knn = np.ones((N_tst, K)) * (-1)\n", 247 | " nns_len = np.zeros(N_tst)\n", 248 | " for i_tst in tqdm(range(N_tst)):\n", 249 | " nns = np.array([])\n", 250 | " for l in range(self.n_hash_table):\n", 251 | " hash_code_int = lsh_function(self.t, x_tst[i_tst] / dist_rand, self.w[l, :, :], self.b[l, :])\n", 252 | " hash_code_test = '.'.join(map(str, hash_code_int))\n", 253 | " if hash_code_test in self.x_trn_hash[l].keys():\n", 254 | " nns = np.append(nns, self.x_trn_hash[l][hash_code_test])\n", 255 | " nns = np.unique(nns)\n", 256 | " nns = nns.astype(int)\n", 257 | " num_collide_elements = len(nns)\n", 258 | " if len(nns) > 0:\n", 259 | " dist = [np.linalg.norm(self.x_trn[i] / dist_rand - x_tst[i_tst] / dist_rand, 2) for i in nns]\n", 260 | " dist_min_ind = nns[np.argsort(dist)]\n", 261 | " if num_collide_elements < K:\n", 262 | " x_tst_knn[i_tst, :num_collide_elements] = dist_min_ind[:num_collide_elements]\n", 263 | " else:\n", 264 | " x_tst_knn[i_tst, :] = dist_min_ind[:K]\n", 265 | " # pdb.set_trace()\n", 266 | " nns_len[i_tst] = len(nns)\n", 267 | " if i_tst % 100 == 0:\n", 268 | " print('get approximate knn %s'%i_tst)\n", 269 | " return x_tst_knn.astype(int),nns_len\n", 270 | "\n", 271 | "\n", 272 | " def compute_approx_shapley(self,x_tst_knn,y_tst,K):\n", 273 | " N_tst,K_star = x_tst_knn.shape\n", 274 | " # flag_sufficient = (x_tst_knn[:,-1]>=0)\n", 275 | " sp_approx = np.zeros((N_tst,self.N))\n", 276 | " for j in tqdm(range(N_tst)):\n", 277 | " non_nan_index = np.where(x_tst_knn[j,:]>=0)[0]\n", 278 | " if len(non_nan_index)== 0:\n", 279 | " continue\n", 280 | " K_tot = non_nan_index[-1]\n", 281 | " if K_tot == self.N:\n", 282 | " sp_approx[j, x_tst_knn[j, self.N - 1]] = equal(self.y_trn[x_tst_knn[j, self.N - 1]], 
y_tst[j]) / self.N\n", 283 | " for i in np.arange(K_tot - 1, -1, -1):\n", 284 | " sp_approx[j, x_tst_knn[j, i]] = sp_approx[j, x_tst_knn[j, i+1]] + (\n", 285 | " equal(self.y_trn[x_tst_knn[j, i]], y_tst[j]) - equal(\n", 286 | " self.y_trn[x_tst_knn[j, i + 1]], y_tst[j])) / K * min([K, i + 1]) / (i + 1)\n", 287 | "\n", 288 | "\n", 289 | "\n", 290 | " return sp_approx" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": 53, 296 | "metadata": {}, 297 | "outputs": [], 298 | "source": [ 299 | "lsh = LSH(14,75,x_trn,y_trn,t=2.203)" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": 61, 305 | "metadata": {}, 306 | "outputs": [], 307 | "source": [ 308 | "lsh.x_trn_hash = x_trn_hash\n", 309 | "lsh.w = w\n", 310 | "lsh.b = b" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": 63, 316 | "metadata": {}, 317 | "outputs": [ 318 | { 319 | "data": { 320 | "application/vnd.jupyter.widget-view+json": { 321 | "model_id": "e141ac778f1544ab8cc30572bf85e94d", 322 | "version_major": 2, 323 | "version_minor": 0 324 | }, 325 | "text/plain": [ 326 | "HBox(children=(IntProgress(value=0), HTML(value='')))" 327 | ] 328 | }, 329 | "metadata": {}, 330 | "output_type": "display_data" 331 | }, 332 | { 333 | "name": "stdout", 334 | "output_type": "stream", 335 | "text": [ 336 | "get approximate knn 0\n", 337 | "\n", 338 | "3972.0105855464935\n" 339 | ] 340 | }, 341 | { 342 | "data": { 343 | "application/vnd.jupyter.widget-view+json": { 344 | "model_id": "84515c129b3047b0bc94a3610e579a00", 345 | "version_major": 2, 346 | "version_minor": 0 347 | }, 348 | "text/plain": [ 349 | "HBox(children=(IntProgress(value=0), HTML(value='')))" 350 | ] 351 | }, 352 | "metadata": {}, 353 | "output_type": "display_data" 354 | }, 355 | { 356 | "name": "stdout", 357 | "output_type": "stream", 358 | "text": [ 359 | "\n", 360 | "it takes 0.09952259063720703 to get appox knn value\n", 361 | "max error 0.09141423452978795\n" 362 | ] 363 | } 364 | ], 365 | "source": [ 366 | "sp_gt2_approx = test_error(2, sp_gt2)" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": 59, 372 | "metadata": {}, 373 | "outputs": [], 374 | "source": [ 375 | "sp_gt = np.load('10M/eps0.1/sp_gt.npy')" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": 62, 381 | "metadata": {}, 382 | "outputs": [ 383 | { 384 | "data": { 385 | "application/vnd.jupyter.widget-view+json": { 386 | "model_id": "a2c40f5e121742eea136f2c9206e6784", 387 | "version_major": 2, 388 | "version_minor": 0 389 | }, 390 | "text/plain": [ 391 | "HBox(children=(IntProgress(value=0), HTML(value='')))" 392 | ] 393 | }, 394 | "metadata": {}, 395 | "output_type": "display_data" 396 | }, 397 | { 398 | "name": "stdout", 399 | "output_type": "stream", 400 | "text": [ 401 | "get approximate knn 0\n", 402 | "\n", 403 | "4183.600741863251\n" 404 | ] 405 | }, 406 | { 407 | "data": { 408 | "application/vnd.jupyter.widget-view+json": { 409 | "model_id": "d3c8e49679f1428a9e754b1c87846b58", 410 | "version_major": 2, 411 | "version_minor": 0 412 | }, 413 | "text/plain": [ 414 | "HBox(children=(IntProgress(value=0), HTML(value='')))" 415 | ] 416 | }, 417 | "metadata": {}, 418 | "output_type": "display_data" 419 | }, 420 | { 421 | "name": "stdout", 422 | "output_type": "stream", 423 | "text": [ 424 | "\n", 425 | "it takes 0.07110762596130371 to get appox knn value\n", 426 | "max error 0.09141423452978795\n" 427 | ] 428 | }, 429 | { 430 | "data": { 431 | "text/plain": [ 432 | "array([[0., 0., 0., ..., 
0., 0., 0.],\n", 433 | " [0., 0., 0., ..., 0., 0., 0.],\n", 434 | " [0., 0., 0., ..., 0., 0., 0.],\n", 435 | " ...,\n", 436 | " [0., 0., 0., ..., 0., 0., 0.],\n", 437 | " [0., 0., 0., ..., 0., 0., 0.],\n", 438 | " [0., 0., 0., ..., 0., 0., 0.]])" 439 | ] 440 | }, 441 | "execution_count": 62, 442 | "metadata": {}, 443 | "output_type": "execute_result" 444 | } 445 | ], 446 | "source": [ 447 | "test_error(1, sp_gt)" 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": 29, 453 | "metadata": {}, 454 | "outputs": [], 455 | "source": [ 456 | "for i in range(75):\n", 457 | " assert sum([len(v) for k, v in lsh.x_trn_hash[i].items()]) == 9998900 " 458 | ] 459 | }, 460 | { 461 | "cell_type": "code", 462 | "execution_count": 30, 463 | "metadata": {}, 464 | "outputs": [ 465 | { 466 | "name": "stdout", 467 | "output_type": "stream", 468 | "text": [ 469 | "-1.0.0.0.0.0.0.-1.-1.-2.0.0.0.1 [ 325606 3817062 8955573 9021150]\n" 470 | ] 471 | } 472 | ], 473 | "source": [ 474 | "for k, v in lsh.x_trn_hash[0].items():\n", 475 | " print(k, v)\n", 476 | " break" 477 | ] 478 | }, 479 | { 480 | "cell_type": "code", 481 | "execution_count": 64, 482 | "metadata": {}, 483 | "outputs": [], 484 | "source": [ 485 | "del sp_gt" 486 | ] 487 | }, 488 | { 489 | "cell_type": "code", 490 | "execution_count": 65, 491 | "metadata": {}, 492 | "outputs": [], 493 | "source": [ 494 | "sp_gt5 = np.load('10M/eps0.1/sp_gt5.npy')" 495 | ] 496 | }, 497 | { 498 | "cell_type": "code", 499 | "execution_count": 66, 500 | "metadata": {}, 501 | "outputs": [ 502 | { 503 | "data": { 504 | "application/vnd.jupyter.widget-view+json": { 505 | "model_id": "1a540a12f8104bf3a2736d83ddb6f911", 506 | "version_major": 2, 507 | "version_minor": 0 508 | }, 509 | "text/plain": [ 510 | "HBox(children=(IntProgress(value=0), HTML(value='')))" 511 | ] 512 | }, 513 | "metadata": {}, 514 | "output_type": "display_data" 515 | }, 516 | { 517 | "name": "stdout", 518 | "output_type": "stream", 519 | "text": [ 520 | "get approximate knn 0\n", 521 | "\n", 522 | "3920.30832695961\n" 523 | ] 524 | }, 525 | { 526 | "data": { 527 | "application/vnd.jupyter.widget-view+json": { 528 | "model_id": "00e25da22dca48f1979c248d99fb9249", 529 | "version_major": 2, 530 | "version_minor": 0 531 | }, 532 | "text/plain": [ 533 | "HBox(children=(IntProgress(value=0), HTML(value='')))" 534 | ] 535 | }, 536 | "metadata": {}, 537 | "output_type": "display_data" 538 | }, 539 | { 540 | "name": "stdout", 541 | "output_type": "stream", 542 | "text": [ 543 | "\n", 544 | "it takes 0.06524181365966797 to get appox knn value\n", 545 | "max error 0.09057708217212376\n" 546 | ] 547 | }, 548 | { 549 | "data": { 550 | "text/plain": [ 551 | "array([[0., 0., 0., ..., 0., 0., 0.],\n", 552 | " [0., 0., 0., ..., 0., 0., 0.],\n", 553 | " [0., 0., 0., ..., 0., 0., 0.],\n", 554 | " ...,\n", 555 | " [0., 0., 0., ..., 0., 0., 0.],\n", 556 | " [0., 0., 0., ..., 0., 0., 0.],\n", 557 | " [0., 0., 0., ..., 0., 0., 0.]])" 558 | ] 559 | }, 560 | "execution_count": 66, 561 | "metadata": {}, 562 | "output_type": "execute_result" 563 | } 564 | ], 565 | "source": [ 566 | "test_error(5, sp_gt5)" 567 | ] 568 | }, 569 | { 570 | "cell_type": "code", 571 | "execution_count": 1, 572 | "metadata": {}, 573 | "outputs": [ 574 | { 575 | "data": { 576 | "text/plain": [ 577 | "2" 578 | ] 579 | }, 580 | "execution_count": 1, 581 | "metadata": {}, 582 | "output_type": "execute_result" 583 | } 584 | ], 585 | "source": [] 586 | }, 587 | { 588 | "cell_type": "code", 589 | "execution_count": null, 590 | "metadata": 
{}, 591 | "outputs": [], 592 | "source": [] 593 | } 594 | ], 595 | "metadata": { 596 | "kernelspec": { 597 | "display_name": "Python 3", 598 | "language": "python", 599 | "name": "python3" 600 | }, 601 | "language_info": { 602 | "codemirror_mode": { 603 | "name": "ipython", 604 | "version": 3 605 | }, 606 | "file_extension": ".py", 607 | "mimetype": "text/x-python", 608 | "name": "python", 609 | "nbconvert_exporter": "python", 610 | "pygments_lexer": "ipython3", 611 | "version": "3.7.0" 612 | } 613 | }, 614 | "nbformat": 4, 615 | "nbformat_minor": 2 616 | } 617 | -------------------------------------------------------------------------------- /reproduction/ImageNet/accuracy.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import bz2\n", 10 | "import numpy as np\n", 11 | "from tqdm import tqdm_notebook as tqdm\n", 12 | "import gzip\n", 13 | "from heapq import heappushpop\n", 14 | "from joblib import Parallel, delayed\n", 15 | "import time" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 4, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import numpy as np\n", 25 | "import os\n", 26 | "\n", 27 | "classes = []\n", 28 | "\n", 29 | "for root, dirs, files in os.walk(\".\"): \n", 30 | " for filename in files:\n", 31 | " classes.append(filename)" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 5, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "from re import compile\n", 41 | "\n", 42 | "rex = compile('n[0-9]+')\n", 43 | "classes = [x for x in classes if rex.match(x)]" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 6, 49 | "metadata": {}, 50 | "outputs": [ 51 | { 52 | "data": { 53 | "application/vnd.jupyter.widget-view+json": { 54 | "model_id": "e6a4ec53f8b846c8b1f2f1363560bd8e", 55 | "version_major": 2, 56 | "version_minor": 0 57 | }, 58 | "text/plain": [ 59 | "HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))" 60 | ] 61 | }, 62 | "metadata": {}, 63 | "output_type": "display_data" 64 | }, 65 | { 66 | "name": "stdout", 67 | "output_type": "stream", 68 | "text": [ 69 | "\n" 70 | ] 71 | } 72 | ], 73 | "source": [ 74 | "x_trn = []\n", 75 | "y_trn = []\n", 76 | "for c in tqdm(classes):\n", 77 | " x_trn.append(np.load(c))\n", 78 | " y_trn += [c] * len(x_trn[-1])" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 7, 84 | "metadata": {}, 85 | "outputs": [ 86 | { 87 | "name": "stdout", 88 | "output_type": "stream", 89 | "text": [ 90 | "997659\n", 91 | "997659\n" 92 | ] 93 | } 94 | ], 95 | "source": [ 96 | "x_trn = np.vstack(x_trn)\n", 97 | "print(len(x_trn))\n", 98 | "print(len(y_trn))" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 8, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "from sklearn.utils import shuffle\n", 108 | "x_trn, y_trn = shuffle(x_trn, y_trn, random_state=0)" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 9, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "x_trn = np.reshape(x_trn, (-1, 2048))\n", 118 | "x_tst, y_tst = x_trn[:100], y_trn[:100]\n", 119 | "x_val, y_val = x_trn[100:1100], y_trn[100:1100]\n", 120 | "x_trn, y_trn = x_trn[1100:], y_trn[1100:]" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 9, 126 | "metadata": {}, 127 | "outputs": [ 128 
| { 129 | "data": { 130 | "text/plain": [ 131 | "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n", 132 | " metric_params=None, n_jobs=None, n_neighbors=1, p=2,\n", 133 | " weights='uniform')" 134 | ] 135 | }, 136 | "execution_count": 9, 137 | "metadata": {}, 138 | "output_type": "execute_result" 139 | } 140 | ], 141 | "source": [ 142 | "from sklearn.neighbors import KNeighborsClassifier\n", 143 | "neigh = KNeighborsClassifier(n_neighbors=1)\n", 144 | "neigh.fit(x_trn, y_trn)" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 10, 150 | "metadata": {}, 151 | "outputs": [ 152 | { 153 | "data": { 154 | "text/plain": [ 155 | "0.8" 156 | ] 157 | }, 158 | "execution_count": 10, 159 | "metadata": {}, 160 | "output_type": "execute_result" 161 | } 162 | ], 163 | "source": [ 164 | "neigh.score(x_tst, y_tst)" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 11, 170 | "metadata": {}, 171 | "outputs": [ 172 | { 173 | "data": { 174 | "text/plain": [ 175 | "0.681" 176 | ] 177 | }, 178 | "execution_count": 11, 179 | "metadata": {}, 180 | "output_type": "execute_result" 181 | } 182 | ], 183 | "source": [ 184 | "neigh.score(x_val, y_val)" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 8, 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "from sklearn.linear_model import LogisticRegression\n", 194 | "clf = LogisticRegression(random_state=0, solver='lbfgs',\n", 195 | " multi_class='multinomial', n_jobs=-1).fit(x_trn, y_trn)" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 9, 201 | "metadata": {}, 202 | "outputs": [ 203 | { 204 | "data": { 205 | "text/plain": [ 206 | "0.9" 207 | ] 208 | }, 209 | "execution_count": 9, 210 | "metadata": {}, 211 | "output_type": "execute_result" 212 | } 213 | ], 214 | "source": [ 215 | "clf.score(x_tst, y_tst)" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 10, 221 | "metadata": {}, 222 | "outputs": [ 223 | { 224 | "data": { 225 | "text/plain": [ 226 | "0.81" 227 | ] 228 | }, 229 | "execution_count": 10, 230 | "metadata": {}, 231 | "output_type": "execute_result" 232 | } 233 | ], 234 | "source": [ 235 | "clf.score(x_val, y_val)" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 11, 241 | "metadata": {}, 242 | "outputs": [ 243 | { 244 | "data": { 245 | "text/plain": [ 246 | "0.9549760726660439" 247 | ] 248 | }, 249 | "execution_count": 11, 250 | "metadata": {}, 251 | "output_type": "execute_result" 252 | } 253 | ], 254 | "source": [ 255 | "clf.score(x_trn, y_trn)" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": 12, 261 | "metadata": {}, 262 | "outputs": [ 263 | { 264 | "data": { 265 | "text/plain": [ 266 | "1000" 267 | ] 268 | }, 269 | "execution_count": 12, 270 | "metadata": {}, 271 | "output_type": "execute_result" 272 | } 273 | ], 274 | "source": [ 275 | "len(classes)" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": 14, 281 | "metadata": {}, 282 | "outputs": [], 283 | "source": [ 284 | "classes2idx = {}\n", 285 | "for i, x in enumerate(classes):\n", 286 | " classes2idx[x] = i" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": 15, 292 | "metadata": {}, 293 | "outputs": [], 294 | "source": [ 295 | "y_trn_idx = [classes2idx[x] for x in y_trn]" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": 17, 301 | "metadata": {}, 302 | "outputs": [ 303 | { 304 | "name": 
"stdout", 305 | "output_type": "stream", 306 | "text": [ 307 | "Epoch 1/5\n", 308 | "996559/996559 [==============================] - 134s 135us/step - loss: 1.2208 - acc: 0.7336\n", 309 | "Epoch 2/5\n", 310 | "996559/996559 [==============================] - 123s 123us/step - loss: 0.6757 - acc: 0.8191\n", 311 | "Epoch 3/5\n", 312 | "996559/996559 [==============================] - 122s 122us/step - loss: 0.5928 - acc: 0.8383\n", 313 | "Epoch 4/5\n", 314 | "996559/996559 [==============================] - 125s 125us/step - loss: 0.5440 - acc: 0.8506\n", 315 | "Epoch 5/5\n", 316 | "996559/996559 [==============================] - 122s 122us/step - loss: 0.5092 - acc: 0.8597\n" 317 | ] 318 | }, 319 | { 320 | "data": { 321 | "text/plain": [ 322 | "" 323 | ] 324 | }, 325 | "execution_count": 17, 326 | "metadata": {}, 327 | "output_type": "execute_result" 328 | } 329 | ], 330 | "source": [ 331 | "import os\n", 332 | "os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\" # see issue #152\n", 333 | "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"6\"\n", 334 | "\n", 335 | "from keras.models import Sequential\n", 336 | "from keras.layers import Dense, Activation\n", 337 | "\n", 338 | "model = Sequential()\n", 339 | "model.add(Dense(len(classes), input_dim=2048))\n", 340 | "model.add(Activation('softmax'))\n", 341 | "\n", 342 | "model.compile(loss='sparse_categorical_crossentropy',\n", 343 | " optimizer='sgd',\n", 344 | " metrics=['accuracy'])\n", 345 | "\n", 346 | "model.fit(x_trn, y_trn_idx, epochs=5, batch_size=32)" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": 18, 352 | "metadata": {}, 353 | "outputs": [], 354 | "source": [ 355 | "y_tst_idx = [classes2idx[x] for x in y_tst]\n", 356 | "y_val_idx = [classes2idx[x] for x in y_val]" 357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": 19, 362 | "metadata": {}, 363 | "outputs": [ 364 | { 365 | "name": "stdout", 366 | "output_type": "stream", 367 | "text": [ 368 | "\r", 369 | "100/100 [==============================] - 0s 384us/step\n" 370 | ] 371 | }, 372 | { 373 | "data": { 374 | "text/plain": [ 375 | "[0.3620190918445587, 0.9100000262260437]" 376 | ] 377 | }, 378 | "execution_count": 19, 379 | "metadata": {}, 380 | "output_type": "execute_result" 381 | } 382 | ], 383 | "source": [ 384 | "model.evaluate(x_tst, y_tst_idx, batch_size=128)" 385 | ] 386 | }, 387 | { 388 | "cell_type": "code", 389 | "execution_count": 20, 390 | "metadata": {}, 391 | "outputs": [ 392 | { 393 | "name": "stdout", 394 | "output_type": "stream", 395 | "text": [ 396 | "1000/1000 [==============================] - 0s 35us/step\n" 397 | ] 398 | }, 399 | { 400 | "data": { 401 | "text/plain": [ 402 | "[0.6134206886291504, 0.8280000019073487]" 403 | ] 404 | }, 405 | "execution_count": 20, 406 | "metadata": {}, 407 | "output_type": "execute_result" 408 | } 409 | ], 410 | "source": [ 411 | "model.evaluate(x_val, y_val_idx, batch_size=128)" 412 | ] 413 | }, 414 | { 415 | "cell_type": "code", 416 | "execution_count": 22, 417 | "metadata": {}, 418 | "outputs": [ 419 | { 420 | "name": "stdout", 421 | "output_type": "stream", 422 | "text": [ 423 | "Epoch 1/5\n", 424 | "996559/996559 [==============================] - 122s 123us/step - loss: 0.4698 - acc: 0.8701\n", 425 | "Epoch 2/5\n", 426 | "996559/996559 [==============================] - 123s 124us/step - loss: 0.4491 - acc: 0.8758\n", 427 | "Epoch 3/5\n", 428 | "996559/996559 [==============================] - 121s 122us/step - loss: 0.4316 - acc: 0.8806\n", 429 | "Epoch 4/5\n", 430 | 
"996559/996559 [==============================] - 121s 121us/step - loss: 0.4163 - acc: 0.8849\n", 431 | "Epoch 5/5\n", 432 | "996559/996559 [==============================] - 121s 122us/step - loss: 0.4026 - acc: 0.8889\n" 433 | ] 434 | }, 435 | { 436 | "data": { 437 | "text/plain": [ 438 | "" 439 | ] 440 | }, 441 | "execution_count": 22, 442 | "metadata": {}, 443 | "output_type": "execute_result" 444 | } 445 | ], 446 | "source": [ 447 | "model.fit(x_trn, y_trn_idx, epochs=5, batch_size=32)" 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": 23, 453 | "metadata": {}, 454 | "outputs": [ 455 | { 456 | "name": "stdout", 457 | "output_type": "stream", 458 | "text": [ 459 | "\r", 460 | "100/100 [==============================] - 0s 33us/step\n" 461 | ] 462 | }, 463 | { 464 | "data": { 465 | "text/plain": [ 466 | "[0.33740347623825073, 0.9100000262260437]" 467 | ] 468 | }, 469 | "execution_count": 23, 470 | "metadata": {}, 471 | "output_type": "execute_result" 472 | } 473 | ], 474 | "source": [ 475 | "model.evaluate(x_tst, y_tst_idx, batch_size=128)" 476 | ] 477 | }, 478 | { 479 | "cell_type": "code", 480 | "execution_count": 24, 481 | "metadata": {}, 482 | "outputs": [ 483 | { 484 | "name": "stdout", 485 | "output_type": "stream", 486 | "text": [ 487 | "1000/1000 [==============================] - 0s 42us/step\n" 488 | ] 489 | }, 490 | { 491 | "data": { 492 | "text/plain": [ 493 | "[0.5951442904472352, 0.8239999980926513]" 494 | ] 495 | }, 496 | "execution_count": 24, 497 | "metadata": {}, 498 | "output_type": "execute_result" 499 | } 500 | ], 501 | "source": [ 502 | "model.evaluate(x_val, y_val_idx, batch_size=128)" 503 | ] 504 | }, 505 | { 506 | "cell_type": "code", 507 | "execution_count": 25, 508 | "metadata": {}, 509 | "outputs": [ 510 | { 511 | "name": "stdout", 512 | "output_type": "stream", 513 | "text": [ 514 | "Epoch 1/5\n", 515 | "996559/996559 [==============================] - 126s 126us/step - loss: 0.3904 - acc: 0.8924\n", 516 | "Epoch 2/5\n", 517 | "996559/996559 [==============================] - 123s 123us/step - loss: 0.3790 - acc: 0.8960\n", 518 | "Epoch 3/5\n", 519 | "996559/996559 [==============================] - 115s 116us/step - loss: 0.3687 - acc: 0.8989\n", 520 | "Epoch 4/5\n", 521 | "996559/996559 [==============================] - 115s 116us/step - loss: 0.3591 - acc: 0.9015\n", 522 | "Epoch 5/5\n", 523 | "996559/996559 [==============================] - 124s 124us/step - loss: 0.3503 - acc: 0.9043\n" 524 | ] 525 | }, 526 | { 527 | "data": { 528 | "text/plain": [ 529 | "" 530 | ] 531 | }, 532 | "execution_count": 25, 533 | "metadata": {}, 534 | "output_type": "execute_result" 535 | } 536 | ], 537 | "source": [ 538 | "model.fit(x_trn, y_trn_idx, epochs=5, batch_size=32)" 539 | ] 540 | }, 541 | { 542 | "cell_type": "code", 543 | "execution_count": 26, 544 | "metadata": {}, 545 | "outputs": [ 546 | { 547 | "name": "stdout", 548 | "output_type": "stream", 549 | "text": [ 550 | "\r", 551 | "100/100 [==============================] - 0s 49us/step\n" 552 | ] 553 | }, 554 | { 555 | "data": { 556 | "text/plain": [ 557 | "[0.34398719668388367, 0.8999999761581421]" 558 | ] 559 | }, 560 | "execution_count": 26, 561 | "metadata": {}, 562 | "output_type": "execute_result" 563 | } 564 | ], 565 | "source": [ 566 | "model.evaluate(x_tst, y_tst_idx, batch_size=128)" 567 | ] 568 | }, 569 | { 570 | "cell_type": "code", 571 | "execution_count": 27, 572 | "metadata": {}, 573 | "outputs": [ 574 | { 575 | "name": "stdout", 576 | "output_type": "stream", 577 | 
"text": [ 578 | "1000/1000 [==============================] - 0s 30us/step\n" 579 | ] 580 | }, 581 | { 582 | "data": { 583 | "text/plain": [ 584 | "[0.5886992530822754, 0.8349999971389771]" 585 | ] 586 | }, 587 | "execution_count": 27, 588 | "metadata": {}, 589 | "output_type": "execute_result" 590 | } 591 | ], 592 | "source": [ 593 | "model.evaluate(x_val, y_val_idx, batch_size=128)" 594 | ] 595 | }, 596 | { 597 | "cell_type": "code", 598 | "execution_count": 10, 599 | "metadata": {}, 600 | "outputs": [ 601 | { 602 | "data": { 603 | "text/plain": [ 604 | "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n", 605 | " metric_params=None, n_jobs=None, n_neighbors=2, p=2,\n", 606 | " weights='uniform')" 607 | ] 608 | }, 609 | "execution_count": 10, 610 | "metadata": {}, 611 | "output_type": "execute_result" 612 | } 613 | ], 614 | "source": [ 615 | "from sklearn.neighbors import KNeighborsClassifier\n", 616 | "neigh2 = KNeighborsClassifier(n_neighbors=2)\n", 617 | "neigh2.fit(x_trn, y_trn)" 618 | ] 619 | }, 620 | { 621 | "cell_type": "code", 622 | "execution_count": 11, 623 | "metadata": {}, 624 | "outputs": [ 625 | { 626 | "data": { 627 | "text/plain": [ 628 | "0.81" 629 | ] 630 | }, 631 | "execution_count": 11, 632 | "metadata": {}, 633 | "output_type": "execute_result" 634 | } 635 | ], 636 | "source": [ 637 | "neigh2.score(x_tst, y_tst)" 638 | ] 639 | }, 640 | { 641 | "cell_type": "code", 642 | "execution_count": 12, 643 | "metadata": {}, 644 | "outputs": [ 645 | { 646 | "data": { 647 | "text/plain": [ 648 | "0.67" 649 | ] 650 | }, 651 | "execution_count": 12, 652 | "metadata": {}, 653 | "output_type": "execute_result" 654 | } 655 | ], 656 | "source": [ 657 | "neigh2.score(x_val, y_val)" 658 | ] 659 | }, 660 | { 661 | "cell_type": "code", 662 | "execution_count": 13, 663 | "metadata": {}, 664 | "outputs": [ 665 | { 666 | "data": { 667 | "text/plain": [ 668 | "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n", 669 | " metric_params=None, n_jobs=None, n_neighbors=5, p=2,\n", 670 | " weights='uniform')" 671 | ] 672 | }, 673 | "execution_count": 13, 674 | "metadata": {}, 675 | "output_type": "execute_result" 676 | } 677 | ], 678 | "source": [ 679 | "from sklearn.neighbors import KNeighborsClassifier\n", 680 | "neigh5 = KNeighborsClassifier(n_neighbors=5)\n", 681 | "neigh5.fit(x_trn, y_trn)" 682 | ] 683 | }, 684 | { 685 | "cell_type": "code", 686 | "execution_count": 14, 687 | "metadata": {}, 688 | "outputs": [ 689 | { 690 | "data": { 691 | "text/plain": [ 692 | "0.83" 693 | ] 694 | }, 695 | "execution_count": 14, 696 | "metadata": {}, 697 | "output_type": "execute_result" 698 | } 699 | ], 700 | "source": [ 701 | "neigh5.score(x_tst, y_tst)" 702 | ] 703 | }, 704 | { 705 | "cell_type": "code", 706 | "execution_count": 15, 707 | "metadata": {}, 708 | "outputs": [ 709 | { 710 | "data": { 711 | "text/plain": [ 712 | "0.735" 713 | ] 714 | }, 715 | "execution_count": 15, 716 | "metadata": {}, 717 | "output_type": "execute_result" 718 | } 719 | ], 720 | "source": [ 721 | "neigh5.score(x_val, y_val)" 722 | ] 723 | }, 724 | { 725 | "cell_type": "code", 726 | "execution_count": null, 727 | "metadata": {}, 728 | "outputs": [], 729 | "source": [] 730 | } 731 | ], 732 | "metadata": { 733 | "kernelspec": { 734 | "display_name": "Python 3", 735 | "language": "python", 736 | "name": "python3" 737 | }, 738 | "language_info": { 739 | "codemirror_mode": { 740 | "name": "ipython", 741 | "version": 3 742 | }, 743 | "file_extension": ".py", 744 | "mimetype": 
"text/x-python", 745 | "name": "python", 746 | "nbconvert_exporter": "python", 747 | "pygments_lexer": "ipython3", 748 | "version": "3.5.2" 749 | } 750 | }, 751 | "nbformat": 4, 752 | "nbformat_minor": 2 753 | } 754 | -------------------------------------------------------------------------------- /use_case/DataAcquisition/Shapley.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import tensorflow as tf 4 | from sklearn.model_selection import train_test_split 5 | from sklearn.metrics import roc_auc_score, f1_score 6 | 7 | class ShapNN(object): 8 | 9 | def __init__(self, mode, hidden_units=[100], learning_rate=0.001, 10 | dropout = 0., activation=None, initializer=None, 11 | weight_decay=0.0001, optimizer='adam', batch_size=128, 12 | warm_start=False, max_epochs=100, validation_fraction=0.1, 13 | early_stopping=0, address=None, test_batch_size=1000, 14 | random_seed=666): 15 | 16 | self.mode = mode 17 | self.batch_size = batch_size 18 | self.test_batch_size = test_batch_size 19 | self.hidden_units = hidden_units 20 | self.initializer = initializer 21 | self.activation = activation 22 | self.dropout = dropout 23 | self.weight_decay = weight_decay 24 | self.optimizer = optimizer 25 | self.learning_rate = learning_rate 26 | self.warm_start = warm_start 27 | self.max_epochs = max_epochs 28 | self.early_stopping = early_stopping 29 | self.validation_fraction = validation_fraction 30 | self.address = address 31 | self._extra_train_ops = [] 32 | self.random_seed = random_seed 33 | self.is_built = False 34 | 35 | def prediction_cost(self, X_test, y_test, batch_size=None): 36 | 37 | if batch_size is None: 38 | batch_size = self.test_batch_size 39 | assert len(set(y_test)) == self.num_classes, 'Number of classes does not match!' 40 | with self.graph.as_default(): 41 | losses = [] 42 | idxs = np.arange(len(X_test)) 43 | batches = [idxs[k * batch_size: (k+1) * batch_size] 44 | for k in range(int(np.ceil(len(idxs)/batch_size)))] 45 | for batch in batches: 46 | losses.append(self.sess.run(self.prediction_loss, {self.input_ph:X_test[batch], 47 | self.labels:y_test[batch]})) 48 | return np.mean(losses) 49 | 50 | def score(self, X_test, y_test, batch_size=None): 51 | 52 | if batch_size is None: 53 | batch_size = self.test_batch_size 54 | assert len(set(y_test)) == self.num_classes, 'Number of classes does not match!' 
55 | with self.graph.as_default(): 56 | scores = [] 57 | idxs = np.arange(len(X_test)) 58 | batches = [idxs[k * batch_size: (k+1) * batch_size] 59 | for k in range(int(np.ceil(len(idxs)/batch_size)))] 60 | for batch in batches: 61 | scores.append(self.sess.run(self.prediction_score, {self.input_ph:X_test[batch], 62 | self.labels:y_test[batch]})) 63 | return np.mean(scores) 64 | 65 | def predict_proba(self, X_test, batch_size=None): 66 | 67 | if batch_size is None: 68 | batch_size = self.test_batch_size 69 | with self.graph.as_default(): 70 | probs = [] 71 | idxs = np.arange(len(X_test)) 72 | batches = [idxs[k * batch_size: (k+1) * batch_size] 73 | for k in range(int(np.ceil(len(idxs)/batch_size)))] 74 | for batch in batches: 75 | probs.append(self.sess.run(self.probs, {self.input_ph:X_test[batch]})) 76 | return np.concatenate(probs, axis=0) 77 | 78 | def predict_log_proba(self, X_test, batch_size=None): 79 | 80 | if batch_size is None: 81 | batch_size = self.test_batch_size 82 | with self.graph.as_default(): 83 | probs = [] 84 | idxs = np.arange(len(X_test)) 85 | batches = [idxs[k * batch_size: (k+1) * batch_size] 86 | for k in range(int(np.ceil(len(idxs)/batch_size)))] 87 | for batch in batches: 88 | probs.append(self.sess.run(self.probs, {self.input_ph:X_test[batch]})) 89 | return np.log(np.clip(np.concatenate(probs), 1e-12, None)) 90 | 91 | def cost(self, X_test, y_test, batch_size=None): 92 | 93 | if batch_size is None: 94 | batch_size = self.batch_size 95 | with self.graph.as_default(): 96 | losss = [] 97 | idxs = np.arange(len(X_test)) 98 | batches = [idxs[k * batch_size: (k+1) * batch_size] 99 | for k in range(int(np.ceil(len(idxs)/batch_size)))] 100 | for batch in batches: 101 | losss.append(self.sess.run(self.prediction_loss, {self.input_ph:X_test[batch], 102 | self.labels:y_test[batch]})) 103 | return np.mean(losss) 104 | 105 | def predict(self, X_test, batch_size=None): 106 | 107 | if batch_size is None: 108 | batch_size = self.batch_size 109 | with self.graph.as_default(): 110 | predictions = [] 111 | idxs = np.arange(len(X_test)) 112 | batches = [idxs[k * batch_size: (k+1) * batch_size] 113 | for k in range(int(np.ceil(len(idxs)/batch_size)))] 114 | for batch in batches: 115 | predictions.append(self.sess.run(self.predictions, {self.input_ph:X_test[batch]})) 116 | return np.concatenate(predictions) 117 | 118 | def fit(self, X, y, X_val=None, y_val=None, sources=None, max_epochs=None, 119 | batch_size=None, save=False, load=False, sample_weight=None, 120 | metric='accuracy'): 121 | 122 | self.num_classes = len(set(y)) 123 | self.metric = metric 124 | if max_epochs is None: 125 | max_epochs = self.max_epochs 126 | if batch_size is None: 127 | batch_size = self.batch_size 128 | if not self.is_built: 129 | self.graph = tf.Graph() 130 | with self.graph.as_default(): 131 | config = tf.ConfigProto() 132 | config.gpu_options.allow_growth=True 133 | self.sess = tf.Session(config=config) 134 | with self.graph.as_default(): 135 | tf.set_random_seed(self.random_seed) 136 | try: 137 | self.global_step = tf.train.create_global_step() 138 | except ValueError: 139 | self.global_step = tf.train.get_global_step() 140 | if not self.is_built: 141 | self._build_model(X, y) 142 | self.saver = tf.train.Saver() 143 | self._initialize() 144 | if len(X): 145 | if X_val is None and self.validation_fraction * len(X) > 2: 146 | X_train, X_val, y_train, y_val, sample_weight, _ = train_test_split( 147 | X, y, sample_weight, test_size=self.validation_fraction) 148 | else: 149 | X_train, y_train = X, y 150 | 
self._train_model(X_train, y_train, X_val=X_val, y_val=y_val, 151 | max_epochs=max_epochs, batch_size=batch_size, 152 | sources=sources, sample_weight=sample_weight) 153 | if save and self.address is not None: 154 | self.saver.save(self.sess, self.address) 155 | 156 | def _train_model(self, X, y, X_val, y_val, max_epochs, batch_size, 157 | sources=None, sample_weight=None): 158 | 159 | 160 | assert len(X)==len(y), 'Input and labels not the same size' 161 | self.history = {'metrics':[], 'idxs':[]} 162 | stop_counter = 0 163 | best_performance = None 164 | for epoch in range(max_epochs): 165 | vals_metrics, idxs = self._one_epoch( 166 | X, y, X_val, y_val, batch_size, sources=sources, sample_weight=sample_weight) 167 | self.history['idxs'].append(idxs) 168 | self.history['metrics'].append(vals_metrics) 169 | if self.early_stopping and X_val is not None: 170 | current_performance = np.mean(val_acc) 171 | if best_performance is None: 172 | best_performance = current_performance 173 | if current_performance > best_performance: 174 | best_performance = current_performance 175 | stop_counter = 0 176 | else: 177 | stop_counter += 1 178 | if stop_counter > self.early_stopping: 179 | break 180 | 181 | def _one_epoch(self, X, y, X_val, y_val, batch_size, sources=None, sample_weight=None): 182 | 183 | vals = [] 184 | if sources is None: 185 | if sample_weight is None: 186 | idxs = np.random.permutation(len(X)) 187 | else: 188 | idxs = np.random.choice(len(X), len(X), p=sample_weight/np.sum(sample_weight)) 189 | batches = [idxs[k*batch_size:(k+1) * batch_size] 190 | for k in range(int(np.ceil(len(idxs)/batch_size)))] 191 | idxs = batches 192 | else: 193 | idxs = np.random.permutation(len(sources.keys())) 194 | batches = [sources[i] for i in idxs] 195 | for batch_counter, batch in enumerate(batches): 196 | self.sess.run(self.train_op, 197 | {self.input_ph:X[batch], self.labels:y[batch], 198 | self.dropout_ph:self.dropout}) 199 | if X_val is not None: 200 | if self.metric=='accuracy': 201 | vals.append(self.score(X_val, y_val)) 202 | elif self.metric=='f1': 203 | vals.append(f1_score(y_val, self.predict(X_val))) 204 | elif self.metric=='auc': 205 | vals.append(roc_auc_score(y_val, self.predict_proba(X_val)[:,1])) 206 | elif self.metric=='xe': 207 | vals.append(-self.prediction_cost(X_val, y_val)) 208 | return np.array(vals), np.array(idxs) 209 | 210 | def _initialize(self): 211 | 212 | uninitialized_vars = [] 213 | if self.warm_start: 214 | for var in tf.global_variables(): 215 | try: 216 | self.sess.run(var) 217 | except tf.errors.FailedPreconditionError: 218 | uninitialized_vars.append(var) 219 | else: 220 | uninitialized_vars = tf.global_variables() 221 | self.sess.run(tf.initializers.variables(uninitialized_vars)) 222 | 223 | def _build_model(self, X, y): 224 | 225 | self.num_classes = len(set(y)) 226 | if self.initializer is None: 227 | initializer = tf.initializers.variance_scaling(distribution='uniform') 228 | if self.activation is None: 229 | activation = lambda x: tf.nn.relu(x) 230 | self.input_ph = tf.placeholder(dtype=tf.float32, shape=(None,) + X.shape[1:], name='input') 231 | self.dropout_ph = tf.placeholder_with_default( 232 | tf.constant(0., dtype=tf.float32), shape=(), name='dropout') 233 | if self.mode=='regression': 234 | self.labels = tf.placeholder(dtype=tf.float32, shape=(None, ), name='label') 235 | else: 236 | self.labels = tf.placeholder(dtype=tf.int32, shape=(None, ), name='label') 237 | x = tf.reshape(self.input_ph, shape=(-1, np.prod(X.shape[1:]))) 238 | for layer, hidden_unit 
in enumerate(self.hidden_units): 239 | with tf.variable_scope('dense_{}'.format(layer)): 240 | x = self._dense(x, hidden_unit, dropout=self.dropout_ph, 241 | initializer=self.initializer, activation=activation) 242 | with tf.variable_scope('final'): 243 | self.prelogits = x 244 | self._final_layer(self.prelogits, self.num_classes, self.mode) 245 | self._build_train_op() 246 | 247 | def _build_train_op(self): 248 | 249 | """Build taining specific ops for the graph.""" 250 | learning_rate = tf.constant(self.learning_rate, tf.float32) ##fixit 251 | trainable_variables = tf.trainable_variables() 252 | grads = tf.gradients(self.loss, trainable_variables) 253 | self.grad_flat = tf.concat([tf.reshape(grad, (-1, 1)) for grad in grads], axis=0) 254 | if self.optimizer == 'sgd': 255 | optimizer = tf.train.GradientDescentOptimizer(learning_rate) 256 | elif self.optimizer == 'mom': 257 | optimizer = tf.train.MomentumOptimizer(learning_rate, 0.9) 258 | elif self.optimizer == 'adam': 259 | optimizer = tf.train.AdamOptimizer(learning_rate) 260 | apply_op = optimizer.apply_gradients( 261 | zip(grads, trainable_variables), 262 | global_step=self.global_step, name='train_step') 263 | train_ops = [apply_op] + self._extra_train_ops + tf.get_collection(tf.GraphKeys.UPDATE_OPS) 264 | previous_ops = [tf.group(*train_ops)] 265 | with tf.control_dependencies(previous_ops): 266 | self.train_op = tf.no_op(name='train') 267 | self.is_built = True 268 | 269 | def _final_layer(self, x, num_classes, mode): 270 | 271 | if mode=='regression': 272 | self.logits = self._dense(x, 1, dropout=self.dropout_ph) 273 | self.predictions = tf.reduce_sum(self.logits, axis=-1) 274 | regression_loss = tf.nn.l2_loss(self.predictions - self.labels) ##FIXIT 275 | self.prediction_loss = tf.reduce_mean(regression_loss, name='l2') 276 | residuals = self.predictions - self.labels 277 | var_predicted = tf.reduce_mean(residuals**2) - tf.reduce_mean(residuals)**2 278 | var_labels = tf.reduce_mean(self.labels**2) - tf.reduce_mean(self.labels)**2 279 | self.prediction_score = 1 - var_predicted/(var_labels + 1e-12) 280 | else: 281 | self.logits = self._dense(x, num_classes, dropout=self.dropout_ph) 282 | self.probs = tf.nn.softmax(self.logits) 283 | xent_loss = tf.nn.sparse_softmax_cross_entropy_with_logits( 284 | logits=self.logits, labels=tf.cast(self.labels, tf.int32)) 285 | self.prediction_loss = tf.reduce_mean(xent_loss, name='xent') 286 | self.predictions = tf.argmax(self.probs, axis=-1, output_type=tf.int32) 287 | correct_predictions = tf.equal(self.predictions, self.labels) 288 | self.prediction_score = tf.reduce_mean(tf.cast(correct_predictions, tf.float32)) 289 | self.loss = self.prediction_loss + self._reg_loss() 290 | 291 | def _dense(self, x, out_dim, dropout=tf.constant(0.), initializer=None, activation=None): 292 | 293 | if initializer is None: 294 | initializer = tf.initializers.variance_scaling(distribution='uniform') 295 | w = tf.get_variable('DW', [x.get_shape()[1], out_dim], initializer=initializer) 296 | b = tf.get_variable('Db', [out_dim], initializer=tf.constant_initializer()) 297 | x = tf.nn.dropout(x, 1. 
- dropout) 298 | if activation: 299 | x = activation(x) 300 | return tf.nn.xw_plus_b(x, w, b) 301 | 302 | def _reg_loss(self, order=2): 303 | """Regularization loss for weight decay.""" 304 | losss = [] 305 | for var in tf.trainable_variables(): 306 | if var.op.name.find(r'DW') > 0 or var.op.name.find(r'CW') > 0: ##FIXIT 307 | if order==2: 308 | losss.append(tf.nn.l2_loss(var)) 309 | elif order==1: 310 | losss.append(tf.abs(var)) 311 | else: 312 | raise ValueError("Invalid regularization order!") 313 | return tf.multiply(self.weight_decay, tf.add_n(losss)) 314 | 315 | 316 | class CShapNN(ShapNN): 317 | 318 | def __init__(self, mode, hidden_units=[100], kernel_sizes=[], 319 | strides=None, channels=[], learning_rate=0.001, 320 | dropout = 0., activation=None, initializer=None, global_averaging=False, 321 | weight_decay=0.0001, optimizer='adam', batch_size=128, 322 | warm_start=False, max_epochs=100, validation_fraction=0.1, 323 | early_stopping=0, address=None, test_batch_size=1000, random_seed=666): 324 | 325 | self.mode = mode 326 | self.test_batch_size = test_batch_size 327 | self.kernels = []#FIXIT 328 | self.kernel_sizes = kernel_sizes 329 | self.channels = channels 330 | self.global_averaging = global_averaging 331 | assert len(channels)==len(kernel_sizes), 'Invalid channels or kernel_sizes' 332 | if strides is None: 333 | self.strides = [1] * len(kernel_sizes) 334 | else: 335 | self.strides = strides 336 | self.batch_size = batch_size 337 | self.hidden_units = hidden_units 338 | self.initializer = initializer 339 | self.activation = activation 340 | self.dropout = dropout 341 | self.weight_decay = weight_decay 342 | self.optimizer = optimizer 343 | self.learning_rate = learning_rate 344 | self.warm_start = warm_start 345 | self.max_epochs = max_epochs 346 | self.early_stopping = early_stopping 347 | self.validation_fraction = validation_fraction 348 | self.address = address 349 | self._extra_train_ops = [] 350 | self.random_seed = random_seed 351 | self.graph = tf.Graph() 352 | self.is_built = False 353 | with self.graph.as_default(): 354 | config = tf.ConfigProto() 355 | config.gpu_options.allow_growth=True 356 | self.sess = tf.Session(config=config) 357 | 358 | def _conv(self, x, filter_size, out_filters, strides, activation=None): 359 | 360 | in_filters = int(x.get_shape()[-1]) 361 | n = filter_size * filter_size * out_filters 362 | kernel = tf.get_variable( 363 | 'DW', [filter_size, filter_size, in_filters, out_filters], 364 | tf.float32, initializer=tf.random_normal_initializer( 365 | stddev=np.sqrt(2.0/n))) 366 | self.kernels.append(kernel) 367 | x = tf.nn.conv2d(x, kernel, strides, padding='SAME') 368 | if activation: 369 | x = activation(x) 370 | return x 371 | 372 | def _stride_arr(self, stride): 373 | 374 | if isinstance(stride, int): 375 | return [1, stride, stride, 1] 376 | if len(stride)==2: 377 | return [1, stride[0], stride[1], 1] 378 | if len(stride)==4: 379 | return stride 380 | raise ValueError('Invalid value!') 381 | 382 | def _build_model(self, X, y): 383 | 384 | 385 | if self.initializer is None: 386 | initializer = tf.initializers.variance_scaling(distribution='uniform') 387 | if self.activation is None: 388 | activation = lambda x: tf.nn.relu(x) 389 | self.input_ph = tf.placeholder(dtype=tf.float32, shape=(None,) + X.shape[1:], name='input') 390 | self.dropout_ph = tf.placeholder_with_default( 391 | tf.constant(0., dtype=tf.float32), shape=(), name='dropout') 392 | if self.mode=='regression': 393 | self.labels = tf.placeholder(dtype=tf.float32, shape=(None, ), 
name='label') 394 | else: 395 | self.labels = tf.placeholder(dtype=tf.int32, shape=(None, ), name='label') 396 | if len(X.shape[1:]) == 2: 397 | x = tf.reshape(self.input_ph, [-1, X.shape[0], X.shape[1], 1]) 398 | else: 399 | x = self.input_ph 400 | for layer, (kernel_size, channels, stride) in enumerate(zip( 401 | self.kernel_sizes, self.channels, self.strides)): 402 | with tf.variable_scope('conv_{}'.format(layer)): 403 | x = self._conv(x, kernel_size, channels, self._stride_arr(stride), activation=activation) 404 | if self.global_averaging: 405 | x = tf.reduce_mean(x, axis=(1,2)) 406 | else: 407 | x = tf.reshape(x, shape=(-1, np.prod(x.get_shape()[1:]))) 408 | for layer, hidden_unit in enumerate(self.hidden_units): 409 | with tf.variable_scope('dense_{}'.format(layer)): 410 | x = self._dense(x, hidden_unit, dropout=self.dropout_ph, 411 | initializer=self.initializer, activation=activation) 412 | 413 | with tf.variable_scope('final'): 414 | self.prelogits = x 415 | self._final_layer(self.prelogits, len(set(y)), self.mode) 416 | self._build_train_op() 417 | -------------------------------------------------------------------------------- /use_case/Noisy Label, Watermarking/Shapley.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import tensorflow as tf 4 | from sklearn.model_selection import train_test_split 5 | from sklearn.metrics import roc_auc_score, f1_score 6 | 7 | class ShapNN(object): 8 | 9 | def __init__(self, mode, hidden_units=[100], learning_rate=0.001, 10 | dropout = 0., activation=None, initializer=None, 11 | weight_decay=0.0001, optimizer='adam', batch_size=128, 12 | warm_start=False, max_epochs=100, validation_fraction=0.1, 13 | early_stopping=0, address=None, test_batch_size=1000, 14 | random_seed=666, num_classes=10): 15 | 16 | self.mode = mode 17 | self.batch_size = batch_size 18 | self.test_batch_size = test_batch_size 19 | self.hidden_units = hidden_units 20 | self.initializer = initializer 21 | self.activation = activation 22 | self.dropout = dropout 23 | self.weight_decay = weight_decay 24 | self.optimizer = optimizer 25 | self.learning_rate = learning_rate 26 | self.warm_start = warm_start 27 | self.max_epochs = max_epochs 28 | self.early_stopping = early_stopping 29 | self.validation_fraction = validation_fraction 30 | self.address = address 31 | self._extra_train_ops = [] 32 | self.random_seed = random_seed 33 | self.is_built = False 34 | self.num_classes = num_classes 35 | 36 | def prediction_cost(self, X_test, y_test, batch_size=None): 37 | 38 | if batch_size is None: 39 | batch_size = self.test_batch_size 40 | # assert len(set(y_test)) == self.num_classes, 'Number of classes does not match!' 41 | with self.graph.as_default(): 42 | losses = [] 43 | idxs = np.arange(len(X_test)) 44 | batches = [idxs[k * batch_size: (k+1) * batch_size] 45 | for k in range(int(np.ceil(len(idxs)/batch_size)))] 46 | for batch in batches: 47 | losses.append(self.sess.run(self.prediction_loss, {self.input_ph:X_test[batch], 48 | self.labels:y_test[batch]})) 49 | return np.mean(losses) 50 | 51 | def score(self, X_test, y_test, batch_size=None): 52 | 53 | if batch_size is None: 54 | batch_size = self.test_batch_size 55 | # assert len(set(y_test)) == self.num_classes, 'Number of classes does not match!' 
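        # Note: compared with the DataAcquisition version of this file, the class-count
        # assert above is commented out and num_classes is instead passed to __init__
        # (default 10), presumably so that small evaluation batches in the noisy-label /
        # watermarking experiments do not have to contain every class.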
56 | with self.graph.as_default(): 57 | scores = [] 58 | idxs = np.arange(len(X_test)) 59 | batches = [idxs[k * batch_size: (k+1) * batch_size] 60 | for k in range(int(np.ceil(1.0 * len(idxs)/batch_size)))] 61 | for batch in batches: 62 | scores.append(self.sess.run(self.prediction_score, {self.input_ph:X_test[batch], 63 | self.labels:y_test[batch]})) 64 | return np.mean(scores) 65 | 66 | def predict_proba(self, X_test, batch_size=None): 67 | 68 | if batch_size is None: 69 | batch_size = self.test_batch_size 70 | with self.graph.as_default(): 71 | probs = [] 72 | idxs = np.arange(len(X_test)) 73 | batches = [idxs[k * batch_size: (k+1) * batch_size] 74 | for k in range(int(np.ceil(len(idxs)/batch_size)))] 75 | for batch in batches: 76 | probs.append(self.sess.run(self.probs, {self.input_ph:X_test[batch]})) 77 | return np.concatenate(probs, axis=0) 78 | 79 | def predict_log_proba(self, X_test, batch_size=None): 80 | 81 | if batch_size is None: 82 | batch_size = self.test_batch_size 83 | with self.graph.as_default(): 84 | probs = [] 85 | idxs = np.arange(len(X_test)) 86 | batches = [idxs[k * batch_size: (k+1) * batch_size] 87 | for k in range(int(np.ceil(len(idxs)/batch_size)))] 88 | for batch in batches: 89 | probs.append(self.sess.run(self.probs, {self.input_ph:X_test[batch]})) 90 | return np.log(np.clip(np.concatenate(probs), 1e-12, None)) 91 | 92 | def cost(self, X_test, y_test, batch_size=None): 93 | 94 | if batch_size is None: 95 | batch_size = self.batch_size 96 | with self.graph.as_default(): 97 | losss = [] 98 | idxs = np.arange(len(X_test)) 99 | batches = [idxs[k * batch_size: (k+1) * batch_size] 100 | for k in range(int(np.ceil(len(idxs)/batch_size)))] 101 | for batch in batches: 102 | losss.append(self.sess.run(self.prediction_loss, {self.input_ph:X_test[batch], 103 | self.labels:y_test[batch]})) 104 | return np.mean(losss) 105 | 106 | def predict(self, X_test, batch_size=None): 107 | 108 | if batch_size is None: 109 | batch_size = self.batch_size 110 | with self.graph.as_default(): 111 | predictions = [] 112 | idxs = np.arange(len(X_test)) 113 | batches = [idxs[k * batch_size: (k+1) * batch_size] 114 | for k in range(int(np.ceil(len(idxs)/batch_size)))] 115 | for batch in batches: 116 | predictions.append(self.sess.run(self.predictions, {self.input_ph:X_test[batch]})) 117 | return np.concatenate(predictions) 118 | 119 | def fit(self, X, y, X_val=None, y_val=None, sources=None, max_epochs=None, 120 | batch_size=None, save=False, load=False, sample_weight=None, 121 | metric='accuracy'): 122 | 123 | # self.num_classes = len(set(y_val)) 124 | self.metric = metric 125 | if max_epochs is None: 126 | max_epochs = self.max_epochs 127 | if batch_size is None: 128 | batch_size = self.batch_size 129 | if not self.is_built: 130 | self.graph = tf.Graph() 131 | with self.graph.as_default(): 132 | config = tf.ConfigProto() 133 | config.gpu_options.allow_growth=True 134 | self.sess = tf.Session(config=config) 135 | with self.graph.as_default(): 136 | tf.set_random_seed(self.random_seed) 137 | try: 138 | self.global_step = tf.train.create_global_step() 139 | except ValueError: 140 | self.global_step = tf.train.get_global_step() 141 | if not self.is_built: 142 | self._build_model(X, y) 143 | self.saver = tf.train.Saver() 144 | self._initialize() 145 | if len(X): 146 | if X_val is None and self.validation_fraction * len(X) > 2: 147 | X_train, X_val, y_train, y_val, sample_weight, _ = train_test_split( 148 | X, y, sample_weight, test_size=self.validation_fraction) 149 | else: 150 | X_train, y_train 
= X, y 151 | self._train_model(X_train, y_train, X_val=X_val, y_val=y_val, 152 | max_epochs=max_epochs, batch_size=batch_size, 153 | sources=sources, sample_weight=sample_weight) 154 | if save and self.address is not None: 155 | self.saver.save(self.sess, self.address) 156 | 157 | def _train_model(self, X, y, X_val, y_val, max_epochs, batch_size, 158 | sources=None, sample_weight=None): 159 | 160 | 161 | assert len(X)==len(y), 'Input and labels not the same size' 162 | self.history = {'metrics':[], 'idxs':[]} 163 | stop_counter = 0 164 | best_performance = None 165 | for epoch in range(max_epochs): 166 | vals_metrics, idxs = self._one_epoch( 167 | X, y, X_val, y_val, batch_size, sources=sources, sample_weight=sample_weight) 168 | self.history['idxs'].append(idxs) 169 | self.history['metrics'].append(vals_metrics) 170 | if self.early_stopping and X_val is not None: 171 | current_performance = np.mean(vals_metrics) 172 | if best_performance is None: 173 | best_performance = current_performance 174 | if current_performance > best_performance: 175 | best_performance = current_performance 176 | stop_counter = 0 177 | else: 178 | stop_counter += 1 179 | if stop_counter > self.early_stopping: 180 | break 181 | 182 | def _one_epoch(self, X, y, X_val, y_val, batch_size, sources=None, sample_weight=None): 183 | 184 | vals = [] 185 | if sources is None: 186 | if sample_weight is None: 187 | idxs = np.random.permutation(len(X)) 188 | else: 189 | idxs = np.random.choice(len(X), len(X), p=sample_weight/np.sum(sample_weight)) 190 | batches = [idxs[k*batch_size:(k+1) * batch_size] 191 | for k in range(int(np.ceil(len(idxs)/batch_size)))] 192 | idxs = batches 193 | else: 194 | idxs = np.random.permutation(len(sources.keys())) 195 | batches = [sources[i] for i in idxs] 196 | for batch_counter, batch in enumerate(batches): 197 | self.sess.run(self.train_op, 198 | {self.input_ph:X[batch], self.labels:y[batch], 199 | self.dropout_ph:self.dropout}) 200 | if X_val is not None: 201 | if self.metric=='accuracy': 202 | vals.append(self.score(X_val, y_val)) 203 | elif self.metric=='f1': 204 | vals.append(f1_score(y_val, self.predict(X_val))) 205 | elif self.metric=='auc': 206 | vals.append(roc_auc_score(y_val, self.predict_proba(X_val)[:,1])) 207 | elif self.metric=='xe': 208 | vals.append(-self.cost(X_val, y_val)) 209 | return np.array(vals), np.array(idxs) 210 | 211 | def _initialize(self): 212 | 213 | uninitialized_vars = [] 214 | if self.warm_start: 215 | for var in tf.global_variables(): 216 | try: 217 | self.sess.run(var) 218 | except tf.errors.FailedPreconditionError: 219 | uninitialized_vars.append(var) 220 | else: 221 | uninitialized_vars = tf.global_variables() 222 | self.sess.run(tf.initializers.variables(uninitialized_vars)) 223 | 224 | def _build_model(self, X, y): 225 | 226 | # self.num_classes = len(set(y)) 227 | if self.initializer is None: 228 | initializer = tf.initializers.variance_scaling(distribution='uniform') 229 | if self.activation is None: 230 | activation = lambda x: tf.nn.relu(x) 231 | self.input_ph = tf.placeholder(dtype=tf.float32, shape=(None,) + X.shape[1:], name='input') 232 | self.dropout_ph = tf.placeholder_with_default( 233 | tf.constant(0., dtype=tf.float32), shape=(), name='dropout') 234 | if self.mode=='regression': 235 | self.labels = tf.placeholder(dtype=tf.float32, shape=(None, ), name='label') 236 | else: 237 | self.labels = tf.placeholder(dtype=tf.int32, shape=(None, ), name='label') 238 | x = tf.reshape(self.input_ph, shape=(-1, np.prod(X.shape[1:]))) 239 | for
layer, hidden_unit in enumerate(self.hidden_units): 240 | with tf.variable_scope('dense_{}'.format(layer)): 241 | x = self._dense(x, hidden_unit, dropout=self.dropout_ph, 242 | initializer=self.initializer, activation=activation) 243 | with tf.variable_scope('final'): 244 | self.prelogits = x 245 | self._final_layer(self.prelogits, self.num_classes, self.mode) 246 | self._build_train_op() 247 | 248 | def _build_train_op(self): 249 | 250 | """Build training-specific ops for the graph.""" 251 | learning_rate = tf.constant(self.learning_rate, tf.float32) ##fixit 252 | trainable_variables = tf.trainable_variables() 253 | grads = tf.gradients(self.loss, trainable_variables) 254 | self.grad_flat = tf.concat([tf.reshape(grad, (-1, 1)) for grad in grads], axis=0) 255 | if self.optimizer == 'sgd': 256 | optimizer = tf.train.GradientDescentOptimizer(learning_rate) 257 | elif self.optimizer == 'mom': 258 | optimizer = tf.train.MomentumOptimizer(learning_rate, 0.9) 259 | elif self.optimizer == 'adam': 260 | optimizer = tf.train.AdamOptimizer(learning_rate) 261 | apply_op = optimizer.apply_gradients( 262 | zip(grads, trainable_variables), 263 | global_step=self.global_step, name='train_step') 264 | train_ops = [apply_op] + self._extra_train_ops + tf.get_collection(tf.GraphKeys.UPDATE_OPS) 265 | previous_ops = [tf.group(*train_ops)] 266 | with tf.control_dependencies(previous_ops): 267 | self.train_op = tf.no_op(name='train') 268 | self.is_built = True 269 | 270 | def _final_layer(self, x, num_classes, mode): 271 | 272 | if mode=='regression': 273 | self.logits = self._dense(x, 1, dropout=self.dropout_ph) 274 | self.predictions = tf.reduce_sum(self.logits, axis=-1) 275 | regression_loss = tf.nn.l2_loss(self.predictions - self.labels) ##FIXIT 276 | self.prediction_loss = tf.reduce_mean(regression_loss, name='l2') 277 | residuals = self.predictions - self.labels 278 | var_predicted = tf.reduce_mean(residuals**2) - tf.reduce_mean(residuals)**2 279 | var_labels = tf.reduce_mean(self.labels**2) - tf.reduce_mean(self.labels)**2 280 | self.prediction_score = 1 - var_predicted/(var_labels + 1e-12) 281 | else: 282 | self.logits = self._dense(x, num_classes, dropout=self.dropout_ph) 283 | self.probs = tf.nn.softmax(self.logits) 284 | xent_loss = tf.nn.sparse_softmax_cross_entropy_with_logits( 285 | logits=self.logits, labels=tf.cast(self.labels, tf.int32)) 286 | self.prediction_loss = tf.reduce_mean(xent_loss, name='xent') 287 | self.predictions = tf.argmax(self.probs, axis=-1, output_type=tf.int32) 288 | correct_predictions = tf.equal(self.predictions, self.labels) 289 | self.prediction_score = tf.reduce_mean(tf.cast(correct_predictions, tf.float32)) 290 | self.loss = self.prediction_loss + self._reg_loss() 291 | 292 | def _dense(self, x, out_dim, dropout=tf.constant(0.), initializer=None, activation=None): 293 | 294 | if initializer is None: 295 | initializer = tf.initializers.variance_scaling(distribution='uniform') 296 | w = tf.get_variable('DW', [x.get_shape()[1], out_dim], initializer=initializer) 297 | b = tf.get_variable('Db', [out_dim], initializer=tf.constant_initializer()) 298 | x = tf.nn.dropout(x, 1.
- dropout) 299 | if activation: 300 | x = activation(x) 301 | return tf.nn.xw_plus_b(x, w, b) 302 | 303 | def _reg_loss(self, order=2): 304 | """Regularization loss for weight decay.""" 305 | losss = [] 306 | for var in tf.trainable_variables(): 307 | if var.op.name.find(r'DW') > 0 or var.op.name.find(r'CW') > 0: ##FIXIT 308 | if order==2: 309 | losss.append(tf.nn.l2_loss(var)) 310 | elif order==1: 311 | losss.append(tf.abs(var)) 312 | else: 313 | raise ValueError("Invalid regularization order!") 314 | return tf.multiply(self.weight_decay, tf.add_n(losss)) 315 | 316 | 317 | class CShapNN(ShapNN): 318 | 319 | def __init__(self, mode, hidden_units=[100], kernel_sizes=[], 320 | strides=None, channels=[], learning_rate=0.001, 321 | dropout = 0., activation=None, initializer=None, global_averaging=False, 322 | weight_decay=0.0001, optimizer='adam', batch_size=128, 323 | warm_start=False, max_epochs=100, validation_fraction=0.1, 324 | early_stopping=0, address=None, test_batch_size=1000, random_seed=666): 325 | 326 | self.mode = mode 327 | self.test_batch_size = test_batch_size 328 | self.kernels = []#FIXIT 329 | self.kernel_sizes = kernel_sizes 330 | self.channels = channels 331 | self.global_averaging = global_averaging 332 | assert len(channels)==len(kernel_sizes), 'Invalid channels or kernel_sizes' 333 | if strides is None: 334 | self.strides = [1] * len(kernel_sizes) 335 | else: 336 | self.strides = strides 337 | self.batch_size = batch_size 338 | self.hidden_units = hidden_units 339 | self.initializer = initializer 340 | self.activation = activation 341 | self.dropout = dropout 342 | self.weight_decay = weight_decay 343 | self.optimizer = optimizer 344 | self.learning_rate = learning_rate 345 | self.warm_start = warm_start 346 | self.max_epochs = max_epochs 347 | self.early_stopping = early_stopping 348 | self.validation_fraction = validation_fraction 349 | self.address = address 350 | self._extra_train_ops = [] 351 | self.random_seed = random_seed 352 | self.graph = tf.Graph() 353 | self.is_built = False 354 | with self.graph.as_default(): 355 | config = tf.ConfigProto() 356 | config.gpu_options.allow_growth=True 357 | self.sess = tf.Session(config=config) 358 | 359 | def _conv(self, x, filter_size, out_filters, strides, activation=None): 360 | 361 | in_filters = int(x.get_shape()[-1]) 362 | n = filter_size * filter_size * out_filters 363 | kernel = tf.get_variable( 364 | 'DW', [filter_size, filter_size, in_filters, out_filters], 365 | tf.float32, initializer=tf.random_normal_initializer( 366 | stddev=np.sqrt(2.0/n))) 367 | self.kernels.append(kernel) 368 | x = tf.nn.conv2d(x, kernel, strides, padding='SAME') 369 | if activation: 370 | x = activation(x) 371 | return x 372 | 373 | def _stride_arr(self, stride): 374 | 375 | if isinstance(stride, int): 376 | return [1, stride, stride, 1] 377 | if len(stride)==2: 378 | return [1, stride[0], stride[1], 1] 379 | if len(stride)==4: 380 | return stride 381 | raise ValueError('Invalid value!') 382 | 383 | def _build_model(self, X, y): 384 | 385 | 386 | if self.initializer is None: 387 | initializer = tf.initializers.variance_scaling(distribution='uniform') 388 | if self.activation is None: 389 | activation = lambda x: tf.nn.relu(x) 390 | self.input_ph = tf.placeholder(dtype=tf.float32, shape=(None,) + X.shape[1:], name='input') 391 | self.dropout_ph = tf.placeholder_with_default( 392 | tf.constant(0., dtype=tf.float32), shape=(), name='dropout') 393 | if self.mode=='regression': 394 | self.labels = tf.placeholder(dtype=tf.float32, shape=(None, ), 
name='label') 395 | else: 396 | self.labels = tf.placeholder(dtype=tf.int32, shape=(None, ), name='label') 397 | if len(X.shape[1:]) == 2: 398 | x = tf.reshape(self.input_ph, [-1, X.shape[0], X.shape[1], 1]) 399 | else: 400 | x = self.input_ph 401 | for layer, (kernel_size, channels, stride) in enumerate(zip( 402 | self.kernel_sizes, self.channels, self.strides)): 403 | with tf.variable_scope('conv_{}'.format(layer)): 404 | x = self._conv(x, kernel_size, channels, self._stride_arr(stride), activation=activation) 405 | if self.global_averaging: 406 | x = tf.reduce_mean(x, axis=(1,2)) 407 | else: 408 | x = tf.reshape(x, shape=(-1, np.prod(x.get_shape()[1:]))) 409 | for layer, hidden_unit in enumerate(self.hidden_units): 410 | with tf.variable_scope('dense_{}'.format(layer)): 411 | x = self._dense(x, hidden_unit, dropout=self.dropout_ph, 412 | initializer=self.initializer, activation=activation) 413 | 414 | with tf.variable_scope('final'): 415 | self.prelogits = x 416 | self._final_layer(self.prelogits, len(set(y)), self.mode) 417 | self._build_train_op() 418 | -------------------------------------------------------------------------------- /use_case/DataAcquisition/utils.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import random 4 | import sys 5 | import time 6 | import numpy as np 7 | import tensorflow as tf 8 | import matplotlib.pyplot as plt 9 | from tqdm import tqdm_notebook 10 | from collections import OrderedDict 11 | from scipy.misc import toimage 12 | import torch 13 | import cv2 14 | import acoustics 15 | import h5py 16 | from torch.utils.data import Dataset, DataLoader 17 | from skimage import io, transform 18 | from torchvision import transforms, utils 19 | import torch.optim as optim 20 | import torch.nn as nn 21 | import glob 22 | import re 23 | from PIL import Image 24 | 25 | 26 | class CelebaDataset(Dataset): 27 | def __init__(self, label_file, root_dir, transform=None): 28 | # root_dir = "data/celeba/img_align_celeba/" 29 | self.labels, self.image_idxs = self.load_labels(label_file) 30 | self.root_dir = root_dir 31 | self.transform = transform 32 | 33 | def __len__(self): 34 | return len(self.labels) 35 | 36 | def __getitem__(self, idx): 37 | if torch.is_tensor(idx): 38 | idx = idx.tolist() 39 | img_name = os.path.join(self.root_dir, self.image_idxs[idx]) 40 | image = io.imread(img_name) 41 | # image = cv2.resize(image, (299, 299), interpolation=cv2.INTER_CUBIC) 42 | sample = {'image': image, 'label': self.labels[idx]} 43 | if self.transform: 44 | sample = self.transform(sample) 45 | return sample 46 | 47 | def load_labels(self, label_file): 48 | # label_file="list_attr_celeba.csv" 49 | dir_anno = "data/celeba/" 50 | file = open(dir_anno + label_file, 'r') 51 | texts = file.read().split("\n") 52 | file.close() 53 | col_names = texts[0].split(",") 54 | Male_idx = col_names.index("Male") 55 | gender_list = [] 56 | image_index_list = [] 57 | for txt in texts[1:]: 58 | image_index_list.append(txt.split(',')[0]) 59 | if txt.split(',')[Male_idx] == '1': 60 | gender_list.append(np.array(1)) 61 | elif txt.split(',')[Male_idx] == '-1': 62 | gender_list.append(np.array(0)) 63 | print(gender_list[:5], len(gender_list)) 64 | gener_list = np.array(gender_list) 65 | return gender_list, image_index_list 66 | 67 | class Rescale(object): 68 | """Rescale the image in a sample to a given size. 69 | 70 | Args: 71 | output_size (tuple or int): Desired output size. If tuple, output is 72 | matched to output_size. 
If int, smaller of image edges is matched 73 | to output_size keeping aspect ratio the same. 74 | """ 75 | def __init__(self, output_size): 76 | assert isinstance(output_size, (int, tuple)) 77 | self.output_size = output_size 78 | def __call__(self, sample): 79 | image, labels = sample['image'], sample['label'] 80 | h, w = image.shape[:2] 81 | if isinstance(self.output_size, int): 82 | if h > w: 83 | new_h, new_w = self.output_size * h / w, self.output_size 84 | else: 85 | new_h, new_w = self.output_size, self.output_size * w / h 86 | else: 87 | new_h, new_w = self.output_size 88 | 89 | new_h, new_w = int(new_h), int(new_w) 90 | img = transform.resize(image, (new_h, new_w)) 91 | # h and w are swapped for landmarks because for images, 92 | # x and y axes are axis 1 and 0 respectively 93 | return {'image': img, 'label': labels} 94 | 95 | class ToTensor(object): 96 | """Convert ndarrays in sample to Tensors.""" 97 | def __call__(self, sample): 98 | image, labels = sample['image'], sample['label'] 99 | 100 | # swap color axis because 101 | # numpy image: H x W x C 102 | # torch image: C X H X W 103 | image = image.transpose((2, 0, 1)) 104 | return {'image': torch.from_numpy(image), 105 | 'label': torch.from_numpy(labels)} 106 | 107 | class MNIST(): 108 | def __init__(self, one_hot=True, shuffle=False, by_label=False): 109 | self.x_train, self.y_train, self.x_test, self.y_test = self.load_data(one_hot, by_label) 110 | self.num_train = self.x_train.shape[0] 111 | self.num_test = self.x_test.shape[0] 112 | if shuffle: self.shuffle_data() 113 | 114 | def load_data(self, one_hot, by_label): 115 | mnist = tf.keras.datasets.mnist 116 | (x_train, y_train), (x_test, y_test) = mnist.load_data() 117 | x_train = np.reshape(x_train, [-1, 28, 28, 1]) 118 | x_train = x_train.astype(np.float32) / 255 119 | x_test = np.reshape(x_test, [-1, 28, 28, 1]) 120 | x_test = x_test.astype(np.float32) / 255 121 | 122 | if by_label: 123 | ind_train = np.argsort(y_train) 124 | ind_test = np.argsort(y_test) 125 | x_train, y_train = x_train[ind_train], y_train[ind_train] 126 | x_test, y_test = x_test[ind_test], y_test[ind_test] 127 | 128 | 129 | if one_hot: 130 | # convert to one-hot labels 131 | y_train = tf.keras.utils.to_categorical(y_train) 132 | y_test = tf.keras.utils.to_categorical(y_test) 133 | 134 | return x_train, y_train, x_test, y_test 135 | 136 | 137 | def shuffle_data(self): 138 | ind = np.random.permutation(self.num_train) 139 | self.x_train, self.y_train = self.x_train[ind], self.y_train[ind] 140 | 141 | 142 | class CIFAR10(): 143 | def __init__(self, one_hot=True, shuffle=False): 144 | self.x_train, self.y_train, self.x_test, self.y_test = self.load_data(one_hot) 145 | self.num_train = self.x_train.shape[0] 146 | self.num_test = self.x_test.shape[0] 147 | 148 | if shuffle: self.shuffle_data() 149 | 150 | def load_data(self, one_hot): 151 | cifar = tf.keras.datasets.cifar10 152 | (x_train, y_train), (x_test, y_test) = cifar.load_data() 153 | # x_train.shape = (50000, 32, 32, 3), range = [0, 255] 154 | # y_train.shape = (50000, 1) 155 | 156 | y_train = np.squeeze(y_train) 157 | y_test = np.squeeze(y_test) 158 | x_train = x_train.astype(np.float32) / 255 159 | x_test = x_test.astype(np.float32) / 255 160 | 161 | if one_hot: 162 | # convert to one-hot labels 163 | y_train = tf.keras.utils.to_categorical(y_train) 164 | y_test = tf.keras.utils.to_categorical(y_test) 165 | 166 | return x_train, y_train, x_test, y_test 167 | 168 | 169 | def shuffle_data(self): 170 | ind = np.random.permutation(self.num_train) 171 | 
self.x_train, self.y_train = self.x_train[ind], self.y_train[ind] 172 | 173 | 174 | 175 | class Logger: 176 | def __init__(self, name='model', fmt=None, base="./logs"): 177 | self.handler = True 178 | self.scalar_metrics = OrderedDict() 179 | self.fmt = fmt if fmt else dict() 180 | if not os.path.exists(base): os.makedirs(base) 181 | self.path = os.path.join(base, name + "_" + str(time.time())) 182 | self.logs = self.path + '.csv' 183 | self.output = self.path + '.out' 184 | 185 | 186 | def prin(*args): 187 | str_to_write = ' '.join(map(str, args)) 188 | with open(self.output, 'a') as f: 189 | f.write(str_to_write + '\n') 190 | f.flush() 191 | 192 | print(str_to_write) 193 | sys.stdout.flush() 194 | 195 | self.print = prin 196 | 197 | def add_scalar(self, t, key, value): 198 | if key not in self.scalar_metrics: 199 | self.scalar_metrics[key] = [] 200 | self.scalar_metrics[key] += [(t, value)] 201 | 202 | def iter_info(self, order=None): 203 | names = list(self.scalar_metrics.keys()) 204 | if order: 205 | names = order 206 | values = [self.scalar_metrics[name][-1][1] for name in names] 207 | t = int(np.max([self.scalar_metrics[name][-1][0] for name in names])) 208 | fmt = ['%s'] + [self.fmt[name] if name in self.fmt else '.1f' for name in names] 209 | 210 | if self.handler: 211 | self.handler = False 212 | self.print(tabulate([[t] + values], ['epoch'] + names, floatfmt=fmt)) 213 | else: 214 | self.print(tabulate([[t] + values], ['epoch'] + names, tablefmt='plain', floatfmt=fmt).split('\n')[1]) 215 | 216 | def save(self): 217 | result = None 218 | for key in self.scalar_metrics.keys(): 219 | if result is None: 220 | result = DataFrame(self.scalar_metrics[key], columns=['t', key]).set_index('t') 221 | else: 222 | df = DataFrame(self.scalar_metrics[key], columns=['t', key]).set_index('t') 223 | result = result.join(df, how='outer') 224 | result.to_csv(self.logs) 225 | 226 | self.print('The log/output have been saved to: ' + self.path + ' + .csv/.out') 227 | 228 | class ImageNet(): 229 | def __init__(self, path, one_hot=True, shuffle=False): 230 | self.x_train, self.y_train, self.x_test, self.y_test = self.load_data(path, one_hot) 231 | self.num_train = self.x_train.shape[0] 232 | self.num_test = self.x_test.shape[0] 233 | if shuffle: self.shuffle_data() 234 | 235 | 236 | def load_data(self, path, one_hot): 237 | dog_fish = np.load(os.path.join(path, 'dataset_dog-fish_train-900_test-300.npz')) 238 | x_test = dog_fish[dog_fish.files[0]] 239 | x_train = dog_fish[dog_fish.files[1]] 240 | y_train = dog_fish[dog_fish.files[2]] 241 | y_test = dog_fish[dog_fish.files[3]] 242 | 243 | 244 | if one_hot: 245 | # convert to one-hot labels 246 | y_train = tf.keras.utils.to_categorical(y_train) 247 | y_test = tf.keras.utils.to_categorical(y_test) 248 | return x_train, y_train, x_test, y_test 249 | 250 | def shuffle_data(self): 251 | ind = np.random.permutation(self.num_train) 252 | self.x_train, self.y_train = self.x_train[ind], self.y_train[ind] 253 | 254 | def add_noise(data, bs, target_snr, noise_type): 255 | if noise_type == 'white': 256 | noise = acoustics.generator.white(bs*28*28).reshape(28, 28, bs) 257 | if noise_type == 'pink': 258 | noise = acoustics.generator.pink(bs*28*28).reshape(28, 28, bs) 259 | if noise_type == 'Violet': 260 | noise = acoustics.generator.violet(bs*28*28).reshape(28, 28, bs) 261 | 262 | 263 | 264 | print ('data shape = ', data.shape) 265 | average = np.mean(data) 266 | std = np.std(noise) 267 | current_snr = average/std 268 | noise = noise * (current_snr/ target_snr) 269 | 
data = data + noise 270 | return data 271 | 272 | def test_mnist(): 273 | print ("Testing MNIST dataloader...") 274 | data = MNIST() 275 | print (data.x_train.shape, data.y_train.shape, data.x_test.shape, data.y_test.shape) 276 | data = MNIST(one_hot=False) 277 | print (data.x_train.shape, data.y_train.shape, data.x_test.shape, data.y_test.shape) 278 | print (data.y_train[0:10]) 279 | data = MNIST(shuffle=True, one_hot=False) 280 | print (data.x_train.shape, data.y_train.shape, data.x_test.shape, data.y_test.shape) 281 | print (data.y_train[0:10]) 282 | data = MNIST(one_hot=False) 283 | fig=plt.figure(figsize=(8, 8)) 284 | for i in range(1, 6): 285 | # img = data.x_train[i].reshape(1,28,28).transpose([1, 2, 0]) 286 | 287 | img = data.x_train[i] 288 | img = add_noise(img, 1, 0.2, 'white') 289 | fig.add_subplot( 1, 5, i) 290 | plt.imshow(img.squeeze()) 291 | plt.show() 292 | 293 | def test_cifar10(): 294 | print ("Testing CIFAR10 dataloader...") 295 | data = CIFAR10() 296 | print (data.x_train.shape, data.y_train.shape, data.x_test.shape, data.y_test.shape) 297 | data = CIFAR10(one_hot=False) 298 | print (data.x_train.shape, data.y_train.shape, data.x_test.shape, data.y_test.shape) 299 | print (data.y_train[0:10]) 300 | data = CIFAR10(shuffle=True, one_hot=False) 301 | print (data.x_train.shape, data.y_train.shape, data.x_test.shape, data.y_test.shape) 302 | print (data.y_train[0:10]) 303 | fig=plt.figure(figsize=(8, 8)) 304 | for i in range(1, 6): 305 | img = data.x_train[i] * 255 306 | fig.add_subplot( 1, 5, i) 307 | plt.imshow(img.astype(np.uint8)) 308 | plt.show() 309 | 310 | 311 | def test_imagenet(): 312 | print("Testing ImageNet dataloader...") 313 | data = ImageNet('./data') 314 | print (data.x_train.shape, data.y_train.shape, data.x_test.shape, data.y_test.shape) 315 | data = ImageNet(path='./data', one_hot=False) 316 | print (data.x_train.shape, data.y_train.shape, data.x_test.shape, data.y_test.shape) 317 | print (data.y_train[0:10]) 318 | data = ImageNet(path='./data', shuffle=True, one_hot=False) 319 | print (data.x_train.shape, data.y_train.shape, data.x_test.shape, data.y_test.shape) 320 | print (data.y_train[0:10]) 321 | fig=plt.figure(figsize=(8, 8)) 322 | for i in range(1, 6): 323 | # img = data.x_train[i].reshape(3,299,299).transpose(1,2,0).astype("float") 324 | img = data.x_train[i] * -255 325 | fig.add_subplot( 1, 5, i) 326 | plt.imshow((img.squeeze()* 255).astype(np.uint8), interpolation='nearest') 327 | plt.show() 328 | 329 | 330 | def train(model, device, x_train, y_train, batch_size, optimizer, criterion, n_epochs): 331 | model.train() 332 | for epoch in tqdm_notebook(range(n_epochs), desc = 'Epochs'): 333 | # print("epoch model.fc.weight:") 334 | # print(epoch, model.fc.weight) 335 | for X, y in batch(x_train, y_train, batch_size): 336 | X, y = X.to(device).float(), y.to(device) 337 | # print(X.shape, y.shape) 338 | optimizer.zero_grad() 339 | # y_pred = model(X) 340 | *_, y_pred = model(X) 341 | loss = criterion(y_pred, y) 342 | loss.backward() 343 | # for param in model.parameters(): 344 | # print(param.grad.data.sum()) 345 | optimizer.step() 346 | # if(n_epochs > 4): 347 | # if(epoch % int(n_epochs/4) == 0): 348 | # print(f'Train epoch {epoch}: Loss: {loss.item():7.4f}') 349 | 350 | def evaluate(model, device, x_test, y_test, batch_size, criterion): 351 | model.eval() 352 | test_set_size = len(x_test) 353 | correct_answers = 0 354 | sum_loss = 0 355 | with torch.no_grad(): 356 | for X, y in batch(x_test, y_test, batch_size): 357 | X, y = X.to(device).float(), 
y.to(device) 358 | *_, y_pred = model(X) 359 | # y_pred = model(X) 360 | 361 | class_pred = y_pred.argmax(dim = 1) 362 | correct_answers += (y == class_pred).float().sum().item() 363 | sum_loss += criterion(y_pred, y).item() 364 | accuracy = correct_answers / test_set_size 365 | average_loss = sum_loss / len(x_test) 366 | 367 | return accuracy, average_loss 368 | 369 | def evaluate_adv(model, device, x_test, y_test, batch_size, criterion): 370 | model.eval() 371 | test_set_size = len(x_test) 372 | correct_answers = 0 373 | sum_loss = 0 374 | idx = 0 375 | idxs = [] 376 | falses = [] 377 | ground_truths = [] 378 | with torch.no_grad(): 379 | for X, y in tqdm_notebook(batch(x_test, y_test, batch_size), total = int(len(x_test)/batch_size)): 380 | X, y = X.to(device).float(), y.to(device) 381 | *_, y_pred = model(X) 382 | class_pred = y_pred.argmax(dim = 1) 383 | correct_answers += (y == class_pred).float().sum().item() 384 | # print(y) 385 | # print(class_pred) 386 | if( y != class_pred): 387 | idxs.append(idx) 388 | falses.append(class_pred) 389 | ground_truths.append(y) 390 | idx += 1 391 | sum_loss += criterion(y_pred, y).item() 392 | accuracy = correct_answers / test_set_size 393 | average_loss = sum_loss / len(x_test) 394 | 395 | return accuracy, average_loss, falses, ground_truths, idxs 396 | 397 | def knn_shapley(K, trainX, valX, trainy, valy): 398 | N = trainX.shape[0] 399 | M = valX.shape[0] 400 | c = 1 401 | # value = np.zeros(N) 402 | value = [[] for i in range(N) ] 403 | scores = [] 404 | false_result_idxs = [] 405 | for i in tqdm_notebook(range(M), total=M, leave=False): 406 | X = valX[i] 407 | y = valy[i] 408 | 409 | s = np.zeros(N) 410 | diff = (trainX - X).reshape(N, -1) # calculate the distances between valX and every trainX data point 411 | dist = np.einsum('ij, ij->i', diff, diff) # output the sum distance 412 | idx = np.argsort(dist) # ascend the distance 413 | ans = trainy[idx] 414 | 415 | # calculate test performance 416 | score = 0.0 417 | 418 | for j in range(min(K, N)): 419 | score += float(ans[j] == y) 420 | if(score > min(K, N)/2): 421 | scores.append(1) 422 | else: 423 | scores.append(0) 424 | false_result_idxs.append(i) 425 | 426 | s[idx[N - 1]] = float(ans[N - 1] == y)*c / N 427 | cur = N - 2 428 | for j in range(N - 1): 429 | s[idx[cur]] = s[idx[cur + 1]] + float(int(ans[cur] == y) - int(ans[cur + 1] == y))*c / K * (min(cur, K - 1) + 1) / (cur + 1) 430 | cur -= 1 431 | 432 | for j in range(N): 433 | value[j].append(s[j]) 434 | # for i in range(N): 435 | # value[i] /= M 436 | return value, np.mean(scores), false_result_idxs 437 | 438 | def old_knn_shapley(K, trainX, valX, trainy, valy): 439 | N = trainX.shape[0] 440 | M = valX.shape[0] 441 | c = 1 442 | value = np.zeros(N) 443 | # value = [[] for i in range(N) ] 444 | scores = [] 445 | false_result_idxs = [] 446 | for i in tqdm_notebook(range(M), total=M, leave=False): 447 | X = valX[i] 448 | y = valy[i] 449 | 450 | s = np.zeros(N) 451 | diff = (trainX - X).reshape(N, -1) # calculate the distances between valX and every trainX data point 452 | dist = np.einsum('ij, ij->i', diff, diff) # output the sum distance 453 | idx = np.argsort(dist) # ascend the distance 454 | ans = trainy[idx] 455 | 456 | # calculate test performance 457 | score = 0.0 458 | 459 | for j in range(min(K, N)): 460 | score += float(ans[j] == y) 461 | if(score > min(K, N)/2): 462 | scores.append(1) 463 | else: 464 | scores.append(0) 465 | false_result_idxs.append(i) 466 | 467 | s[idx[N - 1]] = float(ans[N - 1] == y)*c / N 468 | cur = N - 2 469 | 
for j in range(N - 1): 470 | s[idx[cur]] = s[idx[cur + 1]] + float(int(ans[cur] == y) - int(ans[cur + 1] == y))*c / K * (min(cur, K - 1) + 1) / (cur + 1) 471 | cur -= 1 472 | 473 | for j in range(N): 474 | value[j] += s[j] 475 | for i in range(N): 476 | value[i] /= M 477 | return value, np.mean(scores), false_result_idxs 478 | 479 | 480 | 481 | def loo_knn_shapley(K, trainX, valX, trainy, valy): 482 | N = trainX.shape[0] 483 | M = valX.shape[0] 484 | value = np.zeros(N) 485 | scores = [] 486 | false_result_idxs = [] 487 | for i in tqdm_notebook(range(M), total=M, leave=False): 488 | X = valX[i] 489 | y = valy[i] 490 | 491 | s = np.zeros(N) 492 | diff = (trainX - X).reshape(N, -1) # calculate the distances between valX and every trainX data point 493 | dist = np.einsum('ij, ij->i', diff, diff) # output the sum distance 494 | idx = np.argsort(dist) # ascend the distance 495 | ans = trainy[idx] 496 | # print(y, ans[:10]) 497 | 498 | # calculate test performance 499 | score = 0.0 500 | 501 | for j in range(min(K, N)): 502 | score += float(ans[j] == y) 503 | if(score > min(K, N)/2): 504 | scores.append(1) 505 | else: 506 | scores.append(0) 507 | false_result_idxs.append(i) 508 | 509 | ### calculate LOO KNN values and do not concern the situation that K > N 510 | for j in range(N): 511 | if j in idx[:K]: 512 | # print(int(ans[j] == y), int(ans[K] == y)) 513 | # print(y, j, ans[j], K, ans[K]) 514 | s[j] = float(int(trainy[j] == y) - int(trainy[K] == y)) / K 515 | else: 516 | s[j] = 0 517 | 518 | 519 | for j in range(N): 520 | value[j] += s[j] 521 | for i in range(N): 522 | value[i] /= M 523 | return value, np.mean(scores), false_result_idxs 524 | 525 | 526 | 527 | def batch(x_batch, y_batch, batch_size=1): 528 | l = len(x_batch) 529 | for ndx in range(0, l, batch_size): 530 | yield x_batch[ndx:min(ndx + batch_size, l)], y_batch[ndx:min(ndx + batch_size, l)] 531 | 532 | def print_img(img): 533 | plt.imshow(img.squeeze()) 534 | plt.show() 535 | 536 | def resize_and_scale(img, size, scale): 537 | img = cv2.resize(img, size) 538 | return 1 - np.array(img, "float32")/scale 539 | 540 | def h5load(path): 541 | # data means x, target means y 542 | if(os.path.exists(path)): 543 | with h5py.File(path, 'r') as hf: 544 | X_tr = hf.get('data')[:] 545 | y_tr = hf.get('target')[:] 546 | return X_tr, y_tr 547 | 548 | def h5save(path, x, y): 549 | if(os.path.exists(path)): 550 | print("Already existed") 551 | return 552 | else: 553 | with h5py.File(path, 'w') as hf: 554 | hf.create_dataset("data", data=x, compression="gzip", compression_opts=9) 555 | print("Data saved!") 556 | hf.create_dataset("target", data=y, compression="gzip", compression_opts=9) 557 | print("Target saved!") 558 | 559 | return 560 | 561 | def cw_l2_attack(model, images, labels, device, targeted=False, c=1e-4, kappa=1, max_iter=1000, learning_rate=0.01) : 562 | images = images.to(device) 563 | labels = labels.to(device) 564 | # Define f-function 565 | def f(x) : 566 | *_, outputs = model(x) 567 | one_hot_labels = torch.eye(len(outputs[0]))[labels].to(device) 568 | i, _ = torch.max((1-one_hot_labels)*outputs, dim=1) 569 | j = torch.masked_select(outputs, one_hot_labels.byte()) 570 | # print(i,j) 571 | # If targeted, optimize for making the other class most likely 572 | if targeted : 573 | return torch.clamp(i-j, min=-kappa) 574 | # If untargeted, optimize for making the other class most likely 575 | else : 576 | return torch.clamp(j-i, min=-kappa) 577 | w = torch.zeros_like(images, requires_grad=True).to(device) 578 | optimizer = 
optim.Adam([w], lr=learning_rate) 579 | prev = 1e10 580 | for step in range(max_iter) : 581 | a = 1/2*(nn.Tanh()(w) + 1) 582 | loss1 = nn.MSELoss(reduction='sum')(a, images) 583 | loss2 = torch.sum(c*f(a)) 584 | cost = loss1 + loss2 585 | optimizer.zero_grad() 586 | cost.backward() 587 | # print(cost) 588 | optimizer.step() 589 | # Early Stop when loss does not converge. 590 | if step % (max_iter//10) == 0 : 591 | if cost > prev : 592 | print('Attack Stopped due to CONVERGENCE....') 593 | return a 594 | prev = cost 595 | print('- Learning Progress : %2.2f %% ' %((step+1)/max_iter*100), end='\r') 596 | attack_images = 1/2*(nn.Tanh()(w) + 1) 597 | return attack_images 598 | 599 | def load_filenames_labels(mode): 600 | """Gets filenames and labels 601 | Args: 602 | mode: 'train' or 'val' 603 | (Directory structure and file naming different for 604 | train and val datasets) 605 | Returns: 606 | list of tuples: (jpeg filename with path, label) 607 | """ 608 | label_dict, class_description = build_label_dicts() 609 | filenames_labels = [] 610 | if mode == 'train': 611 | filenames = glob.glob('data/tiny-imagenet-200/train/*/images/*.JPEG') 612 | for filename in filenames: 613 | match = re.search(r'n\d+', filename) 614 | label = str(label_dict[match.group()]) 615 | filenames_labels.append((filename, label)) 616 | elif mode == 'val': 617 | with open('data/tiny-imagenet-200/val/val_annotations.txt', 'r') as f: 618 | for line in f.readlines(): 619 | split_line = line.split('\t') 620 | filename = 'data/tiny-imagenet-200/val/images/' + split_line[0] 621 | label = str(label_dict[split_line[1]]) 622 | filenames_labels.append((filename, label)) 623 | 624 | return filenames_labels 625 | 626 | def build_label_dicts(): 627 | """Build look-up dictionaries for class label, and class description 628 | Class labels are 0 to 199 in the same order as 629 | tiny-imagenet-200/wnids.txt. Class text descriptions are from 630 | tiny-imagenet-200/words.txt 631 | Returns: 632 | tuple of dicts 633 | label_dict: 634 | keys = synset (e.g. "n01944390") 635 | values = class integer {0 .. 199} 636 | class_desc: 637 | keys = class integer {0 .. 199} 638 | values = text description from words.txt 639 | """ 640 | label_dict, class_description = {}, {} 641 | with open('data/tiny-imagenet-200/wnids.txt', 'r') as f: 642 | for i, line in enumerate(f.readlines()): 643 | synset = line[:-1] # remove \n 644 | label_dict[synset] = i 645 | with open('data/tiny-imagenet-200/words.txt', 'r') as f: 646 | for i, line in enumerate(f.readlines()): 647 | synset, desc = line.split('\t') 648 | desc = desc[:-1] # remove \n 649 | if synset in label_dict: 650 | class_description[label_dict[synset]] = desc 651 | 652 | return label_dict, class_description 653 | 654 | def load_tinyImagenet(dataset): 655 | dim = np.zeros((64,64)) 656 | imgs = [] 657 | labels = [] 658 | for path, label in dataset: 659 | img=np.array(Image.open(path)) /255.0 660 | # print(path, len(img.shape)) 661 | if(len(img.shape) != 3): 662 | img = np.stack((img, dim, dim), axis=2) 663 | imgs.append(img) 664 | labels.append(int(label)) 665 | return imgs, labels --------------------------------------------------------------------------------