├── utils.py
├── LICENSE
├── README.md
└── tmc.py

/utils.py:
--------------------------------------------------------------------------------
import numpy as np
import torch
import torch.nn.functional as F

def error(mem):
    """Convergence criterion: the maximum relative change of the running means
    of the marginal contributions over the last 100 iterations."""
    if len(mem) < 100:
        return 1.0
    # Running mean of each column after every iteration, restricted to the
    # last 100 iterations.
    all_vals = (np.cumsum(mem, 0) / np.reshape(np.arange(1, len(mem) + 1), (-1, 1)))[-100:]
    errors = np.mean(np.abs(all_vals - all_vals[-1:]) / (np.abs(all_vals[-1:]) + 1e-12), -1)
    errors = np.max(errors)
    print('Error: {}'.format(errors))
    return errors

def accuracy(logits, labels):
    """Classification accuracy of `logits` against integer class `labels`."""
    with torch.no_grad():
        preds = F.softmax(logits, dim=1)
        predicted = torch.argmax(preds, dim=1)
        total = labels.size(0)
        correct = (predicted == labels).sum().item()
    return correct / total
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2021 Ayrton San Joaquin

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Equitable Valuation of Data Using Shapley Values (PyTorch Implementation)
### Note: the implementation currently lacks a retraining step. I welcome any PRs to fix this. See [#1](https://github.com/ajsanjoaquin/Shapley_Valuation/issues/1).

This is a PyTorch reimplementation of computing Shapley values via Truncated Monte Carlo (TMC) sampling, from
[What is your data worth? Equitable Valuation of Data](https://arxiv.org/abs/1904.02868) by Amirata Ghorbani and James Zou.
The original implementation (in TensorFlow) can be found [here](https://github.com/amiratag/DataShapley).

This implementation is currently designed for neural networks, and the only available performance metric is model classification accuracy,
but contributions to expand the implementation are welcome.

- [Why Compute Shapley Values?](#why-compute-shapley-values)
- [Requirements](#requirements)
- [Usage](#usage)

## Why Compute Shapley Values?

Computing Shapley values helps when you need a principled way to rank the importance of your training data,
for example when you need to prune harmful images from your training set,
or when you need to compensate multiple data providers for the data they contribute.

It differs from valuing data with the leave-one-out (LOO) method, because Shapley values satisfy three main properties:

1. Null data: if adding a datum to any subset of the training data never changes model performance, its value is zero.
2. Equality: if two data points x and y contribute equally when added to any subset of the training data, they have the same Shapley value.
3. Additivity: if datum x has value S_x(d_1) with respect to test point 1 and S_x(d_2) with respect to test point 2, its value with respect to both points together is S_x(d_1) + S_x(d_2).

The Shapley value of a training point is its average marginal contribution to model performance over permutations of the training data; TMC-Shapley estimates this average by sampling permutations and truncating each one once adding further points no longer changes performance appreciably.

## Requirements

* Python 3.6 or later
* PyTorch 1.0 or later
* NumPy 1.12 or later
* pickle (Python standard library)
* tqdm

## Usage

```python
from tmc import DShap

# Supplied by the user: your model and your torch Datasets.
model = get_my_model()
train_set, test_set = get_my_datasets()

dshap = DShap(model, train_set, test_set, directory='your_directory')

dshap.run(save_every=100, err=0.1, tolerance=0.01)
```

This outputs a pickle file containing the sampled marginal contributions: one dictionary per Monte Carlo iteration, mapping each training index to its contribution. You can convert this into a NumPy array of shape (iterations x number of training points).
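
As a rough sketch, assuming a single run whose results were saved to `your_directory/mem_tmc_0000.pkl` (the naming scheme used by `save_results`), the saved list can be loaded and stacked into such an array like this:

```python
import pickle
import numpy as np

# Each list entry is one Monte Carlo iteration: a dict mapping a training-set
# index to that point's marginal contribution in that iteration.
with open('your_directory/mem_tmc_0000.pkl', 'rb') as f:
    raw_list = pickle.load(f)

n_train = len(raw_list[0])
mem_tmc = np.zeros((len(raw_list), n_train))
for j, row in enumerate(raw_list):
    for idx, contrib in row.items():
        mem_tmc[j, idx] = contrib

# Averaging over iterations gives one Shapley value estimate per training point.
shapley_values = mem_tmc.mean(axis=0)
```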

## LICENSE
Creative Commons Licence
--------------------------------------------------------------------------------
/tmc.py:
--------------------------------------------------------------------------------
import os
import numpy as np
import pickle as pkl
from tqdm.auto import tqdm

import torch
from torch.utils.data import RandomSampler, DataLoader

from utils import accuracy, error

device = ('cuda' if torch.cuda.is_available() else 'cpu')


class DShap(object):

    def __init__(self, model, train_dataset, test_dataset,
                 directory=None, seed=10):
        """
        Args:
            model: torch model.
            train_dataset: training dataset (torch.utils.data.Dataset).
            test_dataset: test dataset (torch.utils.data.Dataset).
            directory: directory to save results and figures.
            seed: random seed. When running parallel Monte Carlo samples,
                initialize each with a different seed so they do not draw
                the same permutations.
        """
        if seed is not None:
            np.random.seed(seed)

        self.directory = directory
        if self.directory is not None and not os.path.exists(directory):
            os.makedirs(directory)

        self.model = model
        self.train_set = train_dataset
        self.test_set = test_dataset
        self.train_len = len(self.train_set)

        # Rows are Monte Carlo iterations; columns are training points.
        self.mem_tmc = np.zeros((0, self.train_len))
        self.idxs_tmc = np.zeros((0, self.train_len), int)

        # Baseline score: accuracy of always predicting the most common test class.
        test_classes = torch.tensor([label for _, label in self.test_set])
        self.random_score = torch.max(torch.bincount(test_classes) / len(self.test_set)).item()

        self.tmc_number = self._which_parallel(self.directory)
        self._create_results_placeholder(self.directory, self.tmc_number)

    def _create_results_placeholder(self, directory, tmc_number):
        if directory is None:
            return
        tmc_dir = os.path.join(
            directory,
            'mem_tmc_{}.pkl'.format(tmc_number.zfill(4))
        )
        pkl.dump({'mem_tmc': self.mem_tmc, 'idxs_tmc': self.idxs_tmc},
                 open(tmc_dir, 'wb'))

    def run(self, save_every, err, tolerance=0.01):
        """Calculates the values of the data sources (points).

        Args:
            save_every: save marginal contributions every `save_every` iterations.
            err: stopping criterion (target convergence error).
            tolerance: truncation tolerance ratio.
        """
        tmc_run = True
        while tmc_run:
            if error(self.mem_tmc) < err:
                tmc_run = False
            else:
                self.tmc_shap(
                    save_every,
                    tolerance=tolerance,
                )
                self.vals_tmc = np.mean(self.mem_tmc, 0)
            self.save_results()

    def save_results(self):
        """Saves the results computed so far."""
        if self.directory is None:
            return
        tmc_dir = os.path.join(
            self.directory,
            'mem_tmc_{}.pkl'.format(self.tmc_number.zfill(4))
        )

        # One dict per iteration, mapping each training index to its marginal
        # contribution in that iteration.
        raw_list = []
        for j in range(self.mem_tmc.shape[0]):
            row_dict = {self.idxs_tmc[j][i]: self.mem_tmc[j][i]
                        for i in range(self.mem_tmc.shape[1])}
            raw_list.append(row_dict)

        pkl.dump(raw_list, open(tmc_dir, 'wb'))

    def _which_parallel(self, directory):
        """Prevent conflicts with parallel runs by numbering the output files."""
        if directory is None:
            return '0'
        previous_results = os.listdir(directory)
        tmc_nmbrs = [int(name.split('.')[-2].split('_')[-1])
                     for name in previous_results if 'mem_tmc' in name]
        tmc_number = str(np.max(tmc_nmbrs) + 1) if len(tmc_nmbrs) else '0'
        return tmc_number

    def tmc_shap(self, iterations, tolerance=0.01):
        """Runs the TMC-Shapley algorithm.

        Args:
            iterations: number of iterations to run.
            tolerance: truncation tolerance ratio.
        """
        self._tol_mean_score()

        for _ in tqdm(range(iterations)):
            marginals, idxs = self.one_iteration(
                tolerance=tolerance
            )
            # mem_tmc has shape (iterations so far, train_len).
            self.mem_tmc = np.concatenate([
                self.mem_tmc,
                np.reshape(marginals, (1, -1))
            ])
            self.idxs_tmc = np.concatenate([
                self.idxs_tmc,
                np.reshape(idxs, (1, -1))
            ])

    def one_iteration(self, tolerance):
        """Runs one iteration of the TMC-Shapley algorithm."""
        # Draw a random permutation of the training indices.
        # (A RandomSampler with a DataLoader could be used here instead.)
        idxs = np.random.permutation(self.train_len)
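        # Walk through the permutation, adding one training point at a time and
        # re-scoring the model on the points accumulated so far. A point's
        # marginal contribution is the resulting change in score. Once the score
        # stays within `tolerance` of the mean score from `_tol_mean_score` for
        # several consecutive points, the permutation is truncated and the
        # remaining contributions stay at zero.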
        marginal_contribs = np.zeros(self.train_len)

        truncation_counter = 0
        new_score = self.random_score
        # Evaluation only: there is no retraining step yet (see README / issue #1),
        # so keep the model in eval mode while scoring.
        self.model.eval()

        data_list = []
        label_list = []
        for i, idx in enumerate(idxs):
            old_score = new_score
            data_list.append(self.train_set[idx][0])
            label_list.append(torch.tensor(self.train_set[idx][1]))
            data = torch.stack(data_list, 0)
            labels = torch.stack(label_list, 0)

            data, labels = data.to(device), labels.to(device)
            new_score = accuracy(self.model(data), labels)

            # Marginal contribution of adding this point. (The original TF
            # implementation divides by the size of the data source, which is 1
            # for individual points.)
            marginal_contribs[idx] = new_score - old_score

            # Performance tolerance: truncate once the score stays close to the
            # mean score for more than 5 consecutive points.
            distance_to_full_score = np.abs(new_score - self.mean_score)
            if distance_to_full_score <= tolerance * self.mean_score:
                truncation_counter += 1
                if truncation_counter > 5:
                    break
            else:
                truncation_counter = 0
        return marginal_contribs, idxs

    def _tol_mean_score(self):
        """Computes the average performance and its spread over random test batches."""
        scores = []
        self.model.eval()
        for _ in range(100):
            sampler = RandomSampler(self.test_set)
            loader = DataLoader(self.test_set, batch_size=512,
                                num_workers=2, sampler=sampler)

            # Score a single random batch of up to 512 test points per repetition.
            for data, labels in loader:
                data, labels = data.to(device), labels.to(device)
                acc = accuracy(self.model(data), labels)
                scores.append(acc)
                break

        self.tol = np.std(scores)
        self.mean_score = np.mean(scores)
--------------------------------------------------------------------------------