├── utils.py
├── LICENSE
├── README.md
└── tmc.py

/utils.py:
--------------------------------------------------------------------------------
import numpy as np
import torch
import torch.nn.functional as F

def error(mem):
    """Convergence criterion: the maximum relative change of the running means
    of the marginal contributions over the last 100 iterations."""
    if len(mem) < 100:
        return 1.0
    # Running mean of each column after every iteration, restricted to the
    # last 100 iterations.
    all_vals = (np.cumsum(mem, 0) / np.reshape(np.arange(1, len(mem) + 1), (-1, 1)))[-100:]
    errors = np.mean(np.abs(all_vals - all_vals[-1:]) / (np.abs(all_vals[-1:]) + 1e-12), -1)
    errors = np.max(errors)
    print('Error: {}'.format(errors))
    return errors

def accuracy(logits, labels):
    """Classification accuracy of `logits` against integer class `labels`."""
    with torch.no_grad():
        preds = F.softmax(logits, dim=1)
        predicted = torch.argmax(preds, dim=1)
        total = labels.size(0)
        correct = (predicted == labels).sum().item()
    return correct / total
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2021 Ayrton San Joaquin

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Equitable Valuation of Data Using Shapley Values (PyTorch Implementation)
### Note: the implementation currently lacks a retraining step. I welcome any PRs to fix this. See [#1](https://github.com/ajsanjoaquin/Shapley_Valuation/issues/1).

This is a PyTorch reimplementation of computing Shapley values via Truncated Monte Carlo (TMC) sampling, from
[What is your data worth? Equitable Valuation of Data](https://arxiv.org/abs/1904.02868) by Amirata Ghorbani and James Zou.
The original implementation (in TensorFlow) can be found [here](https://github.com/amiratag/DataShapley).

This implementation is currently designed for neural networks, and the only available performance metric is model classification accuracy,
but contributions to expand the implementation are welcome.

- [Why Compute Shapley Values?](#why-compute-shapley-values)
- [Requirements](#requirements)
- [Usage](#usage)

## Why Compute Shapley Values?

Computing Shapley values helps when you need a principled way to rank the importance of your training data,
for example when you need to prune harmful images from your training set,
or when you need to compensate multiple data providers for the data they contribute.

It differs from valuing data with the leave-one-out (LOO) method, because Shapley values satisfy three main properties:

1. Null data: if adding a datum to any subset of the training data never changes model performance, its value is zero.
2. Equality: if two data points x and y contribute equally when added to any subset of the training data, they have the same Shapley value.
3. Additivity: if datum x has value S_x(d_1) with respect to test point 1 and S_x(d_2) with respect to test point 2, its value with respect to both points together is S_x(d_1) + S_x(d_2).

The Shapley value of a training point is its average marginal contribution to model performance over permutations of the training data; TMC-Shapley estimates this average by sampling permutations and truncating each one once adding further points no longer changes performance appreciably.

## Requirements

* Python 3.6 or later
* PyTorch 1.0 or later
* NumPy 1.12 or later
* pickle (Python standard library)
* tqdm

## Usage

```python
from tmc import DShap

# Supplied by the user: your model and your torch Datasets.
model = get_my_model()
train_set, test_set = get_my_datasets()

dshap = DShap(model, train_set, test_set, directory='your_directory')

dshap.run(save_every=100, err=0.1, tolerance=0.01)
```

This outputs a pickle file containing the sampled marginal contributions: one dictionary per Monte Carlo iteration, mapping each training index to its contribution. You can convert this into a NumPy array of shape (iterations x number of training points).
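
As a rough sketch, assuming a single run whose results were saved to `your_directory/mem_tmc_0000.pkl` (the naming scheme used by `save_results`), the saved list can be loaded and stacked into such an array like this:

```python
import pickle
import numpy as np

# Each list entry is one Monte Carlo iteration: a dict mapping a training-set
# index to that point's marginal contribution in that iteration.
with open('your_directory/mem_tmc_0000.pkl', 'rb') as f:
    raw_list = pickle.load(f)

n_train = len(raw_list[0])
mem_tmc = np.zeros((len(raw_list), n_train))
for j, row in enumerate(raw_list):
    for idx, contrib in row.items():
        mem_tmc[j, idx] = contrib

# Averaging over iterations gives one Shapley value estimate per training point.
shapley_values = mem_tmc.mean(axis=0)
```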

## LICENSE
Creative Commons Licence
--------------------------------------------------------------------------------
/tmc.py:
--------------------------------------------------------------------------------
import os
import numpy as np
import pickle as pkl
from tqdm.auto import tqdm

import torch
from torch.utils.data import RandomSampler, DataLoader

from utils import accuracy, error

device = ('cuda' if torch.cuda.is_available() else 'cpu')


class DShap(object):

    def __init__(self, model, train_dataset, test_dataset,
                 directory=None, seed=10):
        """
        Args:
            model: torch model.
            train_dataset: training dataset (torch.utils.data.Dataset).
            test_dataset: test dataset (torch.utils.data.Dataset).
            directory: directory to save results and figures.
            seed: random seed. When running parallel Monte Carlo samples,
                initialize each with a different seed so they do not draw
                the same permutations.
        """
        if seed is not None:
            np.random.seed(seed)

        self.directory = directory
        if self.directory is not None and not os.path.exists(directory):
            os.makedirs(directory)

        self.model = model
        self.train_set = train_dataset
        self.test_set = test_dataset
        self.train_len = len(self.train_set)

        # Rows are Monte Carlo iterations; columns are training points.
        self.mem_tmc = np.zeros((0, self.train_len))
        self.idxs_tmc = np.zeros((0, self.train_len), int)

        # Baseline score: accuracy of always predicting the most common test class.
        test_classes = torch.tensor([label for _, label in self.test_set])
        self.random_score = torch.max(torch.bincount(test_classes) / len(self.test_set)).item()

        self.tmc_number = self._which_parallel(self.directory)
        self._create_results_placeholder(self.directory, self.tmc_number)

    def _create_results_placeholder(self, directory, tmc_number):
        if directory is None:
            return
        tmc_dir = os.path.join(
            directory,
            'mem_tmc_{}.pkl'.format(tmc_number.zfill(4))
        )
        pkl.dump({'mem_tmc': self.mem_tmc, 'idxs_tmc': self.idxs_tmc},
                 open(tmc_dir, 'wb'))

    def run(self, save_every, err, tolerance=0.01):
        """Calculates the values of the data sources (points).

        Args:
            save_every: save marginal contributions every `save_every` iterations.
            err: stopping criterion (target convergence error).
            tolerance: truncation tolerance ratio.
        """
        tmc_run = True
        while tmc_run:
            if error(self.mem_tmc) < err:
                tmc_run = False
            else:
                self.tmc_shap(
                    save_every,
                    tolerance=tolerance,
                )
                self.vals_tmc = np.mean(self.mem_tmc, 0)
            self.save_results()

    def save_results(self):
        """Saves the results computed so far."""
        if self.directory is None:
            return
        tmc_dir = os.path.join(
            self.directory,
            'mem_tmc_{}.pkl'.format(self.tmc_number.zfill(4))
        )

        # One dict per iteration, mapping each training index to its marginal
        # contribution in that iteration.
        raw_list = []
        for j in range(self.mem_tmc.shape[0]):
            row_dict = {self.idxs_tmc[j][i]: self.mem_tmc[j][i]
                        for i in range(self.mem_tmc.shape[1])}
            raw_list.append(row_dict)

        pkl.dump(raw_list, open(tmc_dir, 'wb'))

    def _which_parallel(self, directory):
        """Prevent conflicts with parallel runs by numbering the output files."""
        if directory is None:
            return '0'
        previous_results = os.listdir(directory)
        tmc_nmbrs = [int(name.split('.')[-2].split('_')[-1])
                     for name in previous_results if 'mem_tmc' in name]
        tmc_number = str(np.max(tmc_nmbrs) + 1) if len(tmc_nmbrs) else '0'
        return tmc_number

    def tmc_shap(self, iterations, tolerance=0.01):
        """Runs the TMC-Shapley algorithm.

        Args:
            iterations: number of iterations to run.
            tolerance: truncation tolerance ratio.
        """
        self._tol_mean_score()

        for _ in tqdm(range(iterations)):
            marginals, idxs = self.one_iteration(
                tolerance=tolerance
            )
            # mem_tmc has shape (iterations so far, train_len).
            self.mem_tmc = np.concatenate([
                self.mem_tmc,
                np.reshape(marginals, (1, -1))
            ])
            self.idxs_tmc = np.concatenate([
                self.idxs_tmc,
                np.reshape(idxs, (1, -1))
            ])

    def one_iteration(self, tolerance):
        """Runs one iteration of the TMC-Shapley algorithm."""
        # Draw a random permutation of the training indices.
        # (A RandomSampler with a DataLoader could be used here instead.)
        idxs = np.random.permutation(self.train_len)
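        # Walk through the permutation, adding one training point at a time and
        # re-scoring the model on the points accumulated so far. A point's
        # marginal contribution is the resulting change in score. Once the score
        # stays within `tolerance` of the mean score from `_tol_mean_score` for
        # several consecutive points, the permutation is truncated and the
        # remaining contributions stay at zero.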
        marginal_contribs = np.zeros(self.train_len)

        truncation_counter = 0
        new_score = self.random_score
        # Evaluation only: there is no retraining step yet (see README / issue #1),
        # so keep the model in eval mode while scoring.
        self.model.eval()

        data_list = []
        label_list = []
        for i, idx in enumerate(idxs):
            old_score = new_score
            data_list.append(self.train_set[idx][0])
            label_list.append(torch.tensor(self.train_set[idx][1]))
            data = torch.stack(data_list, 0)
            labels = torch.stack(label_list, 0)

            data, labels = data.to(device), labels.to(device)
            new_score = accuracy(self.model(data), labels)

            # Marginal contribution of adding this point. (The original TF
            # implementation divides by the size of the data source, which is 1
            # for individual points.)
            marginal_contribs[idx] = new_score - old_score

            # Performance tolerance: truncate once the score stays close to the
            # mean score for more than 5 consecutive points.
            distance_to_full_score = np.abs(new_score - self.mean_score)
            if distance_to_full_score <= tolerance * self.mean_score:
                truncation_counter += 1
                if truncation_counter > 5:
                    break
            else:
                truncation_counter = 0
        return marginal_contribs, idxs

    def _tol_mean_score(self):
        """Computes the average performance and its spread over random test batches."""
        scores = []
        self.model.eval()
        for _ in range(100):
            sampler = RandomSampler(self.test_set)
            loader = DataLoader(self.test_set, batch_size=512,
                                num_workers=2, sampler=sampler)

            # Score a single random batch of up to 512 test points per repetition.
            for data, labels in loader:
                data, labels = data.to(device), labels.to(device)
                acc = accuracy(self.model(data), labels)
                scores.append(acc)
                break

        self.tol = np.std(scores)
        self.mean_score = np.mean(scores)
--------------------------------------------------------------------------------