├── README.md
├── calibrators.py
├── cifar10h
├── densenet-bc-L190-k40.csv
├── human_model_truth_cifar10h.csv
├── preresnet-110.csv
└── resnet-110.csv
├── combination_methods.py
├── data_utils.py
├── ensemble_ts.py
├── experiments
├── ablation_experiment_v2.py
├── calibrate_combo_experiment.py
├── calibrate_first_experiment.py
├── calibration_experiment.py
├── calibration_method_experiment.py
├── em_experiment.py
├── semisup_em_experiment.py
└── weighted_semisup_em_experiment.py
├── imax_calib
├── __init__.py
├── calibration.py
├── calibrators
│ ├── __init__.py
│ ├── binners.py
│ └── scalers_np.py
├── clustering.py
├── evaluations
│ ├── __init__.py
│ └── calibration_metrics.py
├── hb_utils.py
├── io.py
└── utils.py
├── metrics.py
└── utils.py
/README.md:
--------------------------------------------------------------------------------
1 | # Combining Human Predictions with Model Probabilities via Confusion Matrices and Calibration
2 |
3 | This repo contains the code for our NeurIPS 2021 paper, Combining Human Predictions with Model Probabilities via Confusion Matrices and Calibration [[arxiv](https://arxiv.org/abs/2109.14591)].
4 |
5 | The ImageNet-16H dataset is available on the [OSF](https://osf.io/2ntrf/?view_only=9ec9cacb806d4a1ea4e2f8acaada8f6c). Please also see our work [Bayesian Modeling of Human-AI Complementarity [Steyvers et al., 2022]](https://www.pnas.org/doi/10.1073/pnas.2111547119) describing this dataset in more detail.
6 |
7 | # Project Structure
8 |
9 | - Data for the CIFAR-10h experiments is contained in the `/cifar10h` directory.
10 | - `/experiments` contains various scripts for reproducing the experiments in our paper.
11 | - `calibrators.py` implements various calibration methods.
12 | - `combination_methods.py` implements various combination methods.
13 | - `data_utils.py` contains useful data processing methods.
14 |
15 | # References
16 | Please consider citing our paper as:
17 | ```
18 | @inproceedings{kerrigan2021combining,
19 | title={Combining Human Predictions with Model Probabilities via Confusion Matrices and Calibration},
20 | author={Kerrigan, Gavin and Smyth, Padhraic and Steyvers, Mark},
21 | booktitle={Advances in Neural Information Processing Systems},
22 | year={2021}
23 | }
24 | ```
25 |
--------------------------------------------------------------------------------
/calibrators.py:
--------------------------------------------------------------------------------
1 | import warnings
2 | import numpy as np
3 | import torch
4 | from torch import nn, optim
5 | from torch.distributions.log_normal import LogNormal
6 |
7 | import imax_calib.io as io
8 | import imax_calib.utils as imax_utils
9 | import imax_calib.calibration as imax_calibration
10 |
11 | import pyro
12 | import pyro.distributions as dist
13 | from pyro.infer import MCMC, NUTS
14 | from torch.nn.functional import softmax
15 |
16 | # This file implements various calibration methods.
17 |
18 |
class BaseCalibrator:
    """Common interface for the calibrators defined in this file.

    Both ``fit`` and ``calibrate`` raise ``NotImplementedError`` here and are
    meant to be overridden by subclasses.
    """

    def __init__(self):
        # Left as None by the base class; subclasses may fill it in.
        self.n_classes = None

    def fit(self, logits, y):
        """Learn calibration parameters (to be overridden)."""
        raise NotImplementedError

    def calibrate(self, probs):
        """Return calibrated probabilities (to be overridden)."""
        raise NotImplementedError
30 |
31 |
class IdentityCalibrator(BaseCalibrator):
    """Pass-through calibrator: performs no recalibration."""

    def fit(self, probs, y):
        # Nothing to learn.
        return None

    def calibrate(self, probs):
        # The input is handed back as-is (same object, no copy).
        return probs
41 |
42 |
class TSCalibrator(BaseCalibrator):
    """ Maximum likelihood temperature scaling (Guo et al., 2017)

    ``fit`` learns one scalar temperature from logits and hard labels;
    ``calibrate`` applies it to probabilities as ``p ** (1 / T)`` followed by
    renormalization.
    """

    def __init__(self, temperature=1.):
        super().__init__()
        self.temperature = temperature  # Initial value ; overwritten by fit()

        self.loss_trace = None  # Per-step loss values recorded by the last fit()

    def fit(self, logits, y):
        """ Fits temperature scaling using hard labels.

        Args:
            logits: numpy array ; shape (n_samples, n_classes)
            y: numpy array of integer class labels ; shape (n_samples, )
        """
        # Pre-processing
        self.n_classes = logits.shape[1]
        _model_logits = torch.from_numpy(logits)
        # CrossEntropyLoss expects int64 class indices ; cast so that label arrays of
        # other integer widths (e.g. int32) are accepted as well.
        _y = torch.from_numpy(y).long()
        # float() so that an integer initial temperature can still require gradients.
        _temperature = torch.tensor(float(self.temperature), requires_grad=True)

        # Optimization parameters
        nll = nn.CrossEntropyLoss()  # Supervised hard-label loss
        num_steps = 7500
        learning_rate = 0.05
        grad_tol = 1e-3  # Gradient tolerance for early stopping
        min_temp, max_temp = 1e-2, 1e4  # Upper / lower bounds on temperature

        optimizer = optim.Adam([_temperature], lr=learning_rate)

        loss_trace = []  # Track loss over iterations
        step = 0
        while True:
            optimizer.zero_grad()
            loss = nll(_model_logits / _temperature, _y)
            loss.backward()
            optimizer.step()
            loss_trace.append(loss.item())

            # Project the temperature back into its allowed range.
            with torch.no_grad():
                _temperature.clamp_(min=min_temp, max=max_temp)

            step += 1
            if step > num_steps:
                warnings.warn('Maximum number of steps reached -- may not have converged (TS)')
                break
            # The gradient inspected here is the one computed before the update above.
            if _temperature.grad.abs().item() < grad_tol:
                break

        self.loss_trace = loss_trace
        self.temperature = _temperature.item()

    def calibrate(self, probs):
        """ Tempers the given probabilities with the current temperature.

        Args:
            probs: numpy array ; shape (n_samples, n_classes)
        """
        calibrated_probs = probs ** (1. / self.temperature)  # Temper
        calibrated_probs /= np.sum(calibrated_probs, axis=1, keepdims=True)  # Normalize
        return calibrated_probs
97 |
98 |
class EnsembleTSCalibrator(BaseCalibrator):
    """ Ensemble Temperature Scaling (Zhang et al., 2020)

    Thin convenience wrapper around ensemble_ts.py. The calibrated output is a
    weighted mix of the tempered probabilities, the input probabilities, and
    a uniform distribution.
    """

    def __init__(self, temperature=1.):
        super().__init__()
        self.temperature = temperature
        self.weights = None  # Set by fit()

    def calibrate(self, probs):
        """ Mixes tempered, raw and uniform probabilities with the fitted weights.
        """
        # Component 0: temperature-scaled probabilities
        tempered = probs ** (1. / self.temperature)
        tempered /= np.sum(tempered, axis=1, keepdims=True)
        # Component 2: uniform distribution over the classes
        uniform = np.ones_like(tempered) / self.n_classes

        w_tempered, w_raw, w_uniform = self.weights[0], self.weights[1], self.weights[2]
        return w_tempered * tempered + w_raw * probs + w_uniform * uniform

    def fit(self, logits, y):
        """ Fits the temperature and mixture weights via ``ets_calibrate``.
        """
        from ensemble_ts import ets_calibrate

        self.n_classes = logits.shape[1]

        # ETS works on one-hot labels
        onehot_labels = np.eye(self.n_classes)[y]

        fitted_temperature, fitted_weights = ets_calibrate(logits, onehot_labels, self.n_classes, loss='mse')
        self.temperature = fitted_temperature
        self.weights = fitted_weights
130 |
131 |
class TSCalibratorMAP(BaseCalibrator):
    """ MAP Temperature Scaling

    Same optimization as maximum likelihood temperature scaling, with the
    negative log-density of a LogNormal prior on the temperature added to the loss.
    """

    def __init__(self, temperature=1., prior_mu=0.5, prior_sigma=0.5):
        super().__init__()
        self.temperature = temperature  # Initial value ; overwritten by fit()
        self.loss_trace = None  # Per-step loss values recorded by the last fit()

        # Parameters of the LogNormal prior over the temperature
        self.prior_mu = torch.tensor(prior_mu)
        self.prior_sigma = torch.tensor(prior_sigma)

    def fit(self, model_logits, y):
        """ Fits temperature scaling using hard labels.

        Args:
            model_logits: numpy array ; shape (n_samples, n_classes)
            y: numpy array of integer class labels ; shape (n_samples, )
        """
        # Pre-processing
        # Record the number of classes, as the other calibrators in this file do.
        self.n_classes = model_logits.shape[1]
        _model_logits = torch.from_numpy(model_logits)
        # CrossEntropyLoss expects int64 class indices ; cast so that label arrays of
        # other integer widths (e.g. int32) are accepted as well.
        _y = torch.from_numpy(y).long()
        # float() so that an integer initial temperature can still require gradients.
        _temperature = torch.tensor(float(self.temperature), requires_grad=True)

        prior = LogNormal(self.prior_mu, self.prior_sigma)
        # Optimization parameters
        nll = nn.CrossEntropyLoss()  # Supervised hard-label loss
        num_steps = 7500
        learning_rate = 0.05
        grad_tol = 1e-3  # Gradient tolerance for early stopping
        min_temp, max_temp = 1e-2, 1e4  # Upper / lower bounds on temperature

        optimizer = optim.Adam([_temperature], lr=learning_rate)

        loss_trace = []  # Track loss over iterations
        step = 0
        while True:
            optimizer.zero_grad()
            loss = nll(_model_logits / _temperature, _y)
            loss = loss - prior.log_prob(_temperature)  # This step adds the prior
            loss.backward()
            optimizer.step()
            loss_trace.append(loss.item())

            # Project the temperature back into its allowed range.
            with torch.no_grad():
                _temperature.clamp_(min=min_temp, max=max_temp)

            step += 1
            if step > num_steps:
                warnings.warn('Maximum number of steps reached -- may not have converged (TS)')
                break
            # The gradient inspected here is the one computed before the update above.
            if _temperature.grad.abs().item() < grad_tol:
                break

        self.loss_trace = loss_trace
        self.temperature = _temperature.item()

    def calibrate(self, probs):
        """ Tempers the given probabilities with the current temperature.

        Args:
            probs: numpy array ; shape (n_samples, n_classes)
        """
        calibrated_probs = probs ** (1. / self.temperature)  # Temper
        calibrated_probs /= np.sum(calibrated_probs, axis=1, keepdims=True)  # Normalize
        return calibrated_probs
189 |
190 |
class IMaxCalibrator(BaseCalibrator):
    """ I-Max Binning calibration (Patel et al., 2021)
    https://arxiv.org/pdf/2006.13092.pdf

    Wraps the ``imax_calib`` package: ``fit`` learns a calibrator object and
    ``calibrate`` applies it to probabilities.
    """

    def __init__(self, mode='CW', num_bins=15):
        super().__init__()
        # mode is one of CW, sCW or top1 (CW seems to be much better than sCW)
        settings = {
            # All
            'cal_setting': mode,
            'num_bins': num_bins,
            # Binning
            'Q_method': "imax",
            'Q_binning_stage': "raw",  # bin the raw logodds or the 'scaled' logodds
            'Q_binning_repr_scheme': "sample_based",
            'Q_bin_repr_during_optim': "pred_prob_based",
            'Q_rnd_seed': 928163,
            'Q_init_mode': "kmeans",
        }
        self.cfg = io.AttrDict(settings)
        self.calibrator = None  # Learned by fit()

    def calibrate(self, probs):
        """ Applies the learned calibrator and returns the calibrated probabilities.
        """
        # Clip before the log so that zero probabilities do not produce -inf.
        logits = np.log(np.clip(probs, 1e-50, 1))
        logodds = imax_utils.quick_logits_to_logodds(logits, probs=probs)
        _, _, cal_probs, _ = self.calibrator(logits, logodds)
        return cal_probs

    def fit(self, logits, y):
        """ Learns the I-Max calibrator from logits and labels.

        ``y`` may be given as a 1-D array of class indices or already one-hot.
        """
        _, n_classes = logits.shape
        self.n_classes = n_classes
        self.cfg['n_classes'] = n_classes

        # The learner needs one-hot labels
        y_onehot = np.eye(self.n_classes)[y] if y.ndim == 1 else y

        logodds = imax_utils.quick_logits_to_logodds(logits)
        self.calibrator = imax_calibration.learn_calibrator(self.cfg,
                                                            logits=logits,
                                                            logodds=logodds,
                                                            y=y_onehot)
234 |
235 |
class BayesianTemperingCalibrator:
    """ This class implements the fully Bayesian temperature scaling calibrator.

    A Normal prior is placed on ``beta`` (the log-temperature) ; posterior samples
    are drawn with NUTS and summarized by moment matching so that the posterior
    of one batch can serve as the prior of the next.
    """

    def __init__(self, prior_params, num_classes, **kwargs):
        """
        Args:
            prior_params: dict with keys 'mu_beta' and 'sigma_beta'
            num_classes: number of classes
            **kwargs: optional settings ; adapt_step_size, target_accept_prob,
                num_samples, num_warmup, num_chains, sigma_drift, verbose
        """
        self.num_classes = num_classes
        # Inference parameters
        self.NUTS_params = {'adapt_step_size': kwargs.pop('adapt_step_size', True),
                            'target_accept_prob': kwargs.pop('target_accept_prob', 0.8),
                            'max_plate_nesting': 1
                            }
        self.mcmc_params = {'num_samples': kwargs.pop('num_samples', 250),
                            'warmup_steps': kwargs.pop('num_warmup', 1000),
                            'num_chains': kwargs.pop('num_chains', 4)
                            }

        # Prior parameters on beta / delta ; assumes each delta is iid
        self.prior_params = {'mu_beta': prior_params['mu_beta'],
                             'sigma_beta': prior_params['sigma_beta']}

        # Posterior parameters after ADF
        self.posterior_params = {'mu_beta': None,
                                 'sigma_beta': None}

        # Drift parameters for sequential updating
        self.sigma_drift = kwargs.pop('sigma_drift', 0.0)

        # Tracking params
        # TODO: Prior/posterior trace
        self.timestep = 0
        self.mcmc = None  # Contains the most recent Pyro MCMC api object
        self.verbose = kwargs.pop('verbose', False)

        if self.verbose:
            print('\nInitializing BT model:\n'
                  '----| Prior: {} \n----| Inference Method: NUTS \n'
                  '----| MCMC parameters: {}'
                  ''.format(prior_params, self.mcmc_params))

    def fit(self, logits, labels):
        """ Performs an update given new observations.

        Args:
            logits: tensor ; shape (batch_size, num_classes)
            labels: tensor ; shape (batch_size, )

        Returns:
            The Pyro MCMC object holding the new posterior samples.
        """
        assert len(labels.shape) == 1, 'Got label tensor with shape {} -- labels must be dense'.format(labels.shape)
        assert len(logits.shape) == 2, 'Got logit tensor with shape {}'.format(logits.shape)
        assert (labels.shape[0] == logits.shape[0]), 'Shape mismatch between logits ({}) and labels ({})' \
            .format(logits.shape[0], labels.shape[0])

        # Work on private copies so the caller's tensors are left untouched.
        logits = logits.detach().clone().requires_grad_()
        labels = labels.detach().clone()

        batch_size = labels.shape[0]
        if self.verbose:
            print('----| Updating HBC model\n--------| Got a batch size of: {}'.format(batch_size))

        self._update_prior_params()
        if self.verbose:
            print('--------| Updated priors: {}'.format(self.prior_params))
            print('--------| Running inference ')
        nuts_kernel = NUTS(bt_model, **self.NUTS_params)
        self.mcmc = MCMC(nuts_kernel, **self.mcmc_params, disable_progbar=not self.verbose,
                         mp_context='spawn')  # Progbar if verbose
        self.mcmc.run(self.prior_params, logits, labels)

        self._update_posterior_params()
        self.timestep += 1

        return self.mcmc

    def _update_prior_params(self):
        """ Updates the prior parameters using the ADF posterior from the previous timestep, plus the drift.

        If this is the first batch, i.e. timestep == 0, do nothing.
        """
        if self.timestep > 0:
            self.prior_params['mu_beta'] = self.posterior_params['mu_beta']
            self.prior_params['sigma_beta'] = self.posterior_params['sigma_beta'] + self.sigma_drift

    def _update_posterior_params(self):
        """ Fits a normal distribution to the current beta samples using moment matching.

        Raises:
            RuntimeError: if no posterior samples are available yet.
        """
        beta_samples = self.get_current_posterior_samples()
        if beta_samples is None:
            raise RuntimeError('No posterior samples available -- call fit() first')
        self.posterior_params['mu_beta'] = beta_samples.mean().item()
        self.posterior_params['sigma_beta'] = beta_samples.std().item()

    def get_current_posterior_samples(self):
        """ Returns the current posterior samples for beta, or None if fit() has not been run.
        """
        if self.mcmc is None:
            return None

        posterior_samples = self.mcmc.get_samples()['beta']

        return posterior_samples

    def calibrate(self, probs):
        """ Calibrates the given batch of probabilities using the current posterior samples.

        Args:
            probs: array or tensor ; shape (batch_size, num_classes)

        Returns:
            numpy array ; shape (batch_size, num_classes) ; tempered probabilities averaged
            over the posterior samples.

        Raises:
            RuntimeError: if called before fit().
        """
        # Get beta samples
        beta_samples = self.get_current_posterior_samples()
        if beta_samples is None:
            raise RuntimeError('No posterior samples available -- call fit() before calibrate()')
        n_samples = beta_samples.size()[0]

        # Convert once so that numpy arrays and tensors are handled the same way below.
        probs = torch.as_tensor(probs)

        # Map betas to temperatures
        temperature_samples = torch.exp(beta_samples)  # Shape (num_samples, )

        # Shape (n_inputs, n_cls, n_mcmc_samples)
        tempered_probs = torch.empty((probs.shape[0], probs.shape[1], n_samples))
        for i, temperature in enumerate(temperature_samples):
            tempered_probs[:, :, i] = probs ** (1. / temperature)
            tempered_probs[:, :, i] /= torch.sum(tempered_probs[:, :, i], dim=1, keepdim=True)

        # Average over the sampled probabilities to get Monte Carlo estimate
        calibrated_probs = tempered_probs.mean(dim=-1)  # Shape (batch_size, num_classes)
        calibrated_probs = np.asarray(calibrated_probs)
        return calibrated_probs

    def get_MAP_temperature(self, logits, labels):
        """ Performs MAP estimation using the current prior and given data.
        NB: This should only be called after .fit() if used in a sequential setting, as this method
        does not update the prior with sigma_drift.

        See: https://pyro.ai/examples/mle_map.html
        """
        pyro.clear_param_store()
        svi = pyro.infer.SVI(model=bt_model, guide=MAP_guide,
                             optim=pyro.optim.Adam({'lr': 0.001}), loss=pyro.infer.Trace_ELBO())

        loss = []
        num_steps = 5000
        for _ in range(num_steps):
            loss.append(svi.step(self.prior_params, logits, labels))

        # Heuristic convergence check on the spread of the most recent losses.
        eps = 2e-2
        loss_sddev = np.std(loss[-25:])
        if loss_sddev > eps:
            warnings.warn('MAP optimization may not have converged ; sddev {}'.format(loss_sddev))
            print('Here is the last few loss terms for inspection: \n', loss[-50:])

        MAP_temperature = torch.exp(pyro.param('beta_MAP')).item()
        return MAP_temperature
382 |
383 |
def bt_model(prior_params, logits, labels):
    """ Pyro model for fully Bayesian temperature scaling.

    Samples a scalar ``beta`` from a Normal prior, scales the logits by
    ``exp(-beta)``, and observes the labels under the resulting categorical
    distribution.
    """
    # Prior over the global parameter: beta ~ N( mu_beta, sigma_beta^2 )
    beta_prior = dist.Normal(prior_params['mu_beta'], prior_params['sigma_beta'])
    beta = pyro.sample('beta', beta_prior)  # Shape (1, )

    # Tempered probabilities ; shape (n_obs, n_classes)
    inverse_temperature = torch.exp(-1. * beta)
    probs = softmax(inverse_temperature * logits, dim=1)

    # Observation plate ; vectorized over the batch
    with pyro.plate('obs', size=logits.shape[0]):
        pyro.sample('cat_obs', dist.Categorical(probs=probs), obs=labels)
399 |
400 |
def MAP_guide(prior_params, logits, labels):
    """ Pyro guide for MAP estimation of ``beta``.

    Registers the learnable point estimate 'beta_MAP' (initialized at 1.0) and
    samples 'beta' from a Delta distribution located at it. The arguments
    are not used inside the guide.
    """
    point_estimate = pyro.param('beta_MAP', torch.tensor(1., requires_grad=True))
    pyro.sample('beta', dist.Delta(point_estimate))
406 |
--------------------------------------------------------------------------------
/data_utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | from combination_methods import *
3 | import numpy as np
4 | import pandas as pd
5 |
6 | rng = np.random.default_rng(1234)
7 |
8 |
def load_CIFAR10H(model_name):
    """ Loads the CIFAR-10H predictions (human and model) and true labels.

    Args:
        model_name: 'r_low_acc' selects the combined human/model/truth CSV ; any other
            value is used as the base name of a per-model CSV.

    Returns:
        (human_counts, model_probs, true_labels) with true_labels cast to int.
    """
    # NOTE(review): paths assume a `data/` directory next to this file -- confirm
    # against the repository layout.
    base_dir = os.path.dirname(__file__)
    uses_combined_file = model_name == 'r_low_acc'

    if uses_combined_file:
        csv_path = os.path.join(base_dir, 'data/cifar10h/human_model_truth_cifar10h.csv')
    else:
        csv_path = os.path.join(base_dir, f'data/cifar10h/{model_name}.csv')
    table = np.genfromtxt(csv_path, delimiter=',')

    if uses_combined_file:
        # Column layout: 10 human count columns, 10 model probability columns, label last.
        human_counts, model_probs = table[:, :10], table[:, 10:20]
        true_labels = table[:, -1]
        true_labels -= 1  # labels are stored as 1-10 ; shift them to be zero-indexed
    else:
        # Column layout: label first, 10 human count columns, then model probabilities.
        true_labels = table[:, 0]
        human_counts, model_probs = table[:, 1:11], table[:, 11:]

    return human_counts, model_probs, true_labels.astype(int)
33 |
34 |
def load_CIFAR10H_individual(model_name):
    """ Loads the CIFAR-10H predictions grouped by individual annotator.

    Returns:
        dict keyed by 0..n_annotators-1 (in order of first appearance in the raw file) ;
        each value is a dict with keys 'y_h', 'model_probs', 'y_m' and 'y_true' restricted
        to the images that annotator labeled.
    """
    base_dir = os.path.dirname(__file__)

    # ---- Model data
    if model_name == 'r_low_acc':
        model_csv = os.path.join(base_dir, 'data/cifar10h/human_model_truth_cifar10h.csv')
        model_probs = np.genfromtxt(model_csv, delimiter=',')[:, 10:20]
    else:
        model_csv = os.path.join(base_dir, f'data/cifar10h/{model_name}.csv')
        model_probs = np.genfromtxt(model_csv, delimiter=',')[:, 11:]

    # ---- Human data
    # Raw human-generated labels, with the attention checks dropped
    raw_labels = pd.read_csv(os.path.join(base_dir, 'data/cifar10h/cifar10h-raw.csv'))
    raw_labels = raw_labels[raw_labels['is_attn_check'] == 0]

    # One entry per annotator: their guesses, the true labels and the model
    # probabilities for the images they labeled.
    annotator_ids = raw_labels['annotator_id'].unique()
    individual_level_data = dict.fromkeys(np.arange(annotator_ids.size))
    for position, annotator_id in enumerate(annotator_ids):
        rows = raw_labels[raw_labels['annotator_id'] == annotator_id]
        probs_for_rows = model_probs[rows['cifar10_test_test_idx'].to_numpy()]

        individual_level_data[position] = {
            'y_h': rows['chosen_label'].to_numpy(),
            'model_probs': probs_for_rows,
            'y_m': np.argmax(probs_for_rows, axis=1),
            'y_true': rows['true_label'].to_numpy(),
        }

    return individual_level_data
75 |
76 |
def load_old_noisy_imagenet(noise_level, model_name, n_epochs=None, noise_type='phase', reaction_time=False):
    """ Loads the older noisy-ImageNet human labels merged with model predictions.

    Paths are relative to the current working directory.

    Args:
        noise_level: one of 80, 95, 110, 125
        model_name: one of 'alexnet', 'densenet161', 'googlenet', 'resnet152', 'vgg19'
        n_epochs: None, 0, 1 or 10 ; selects which prediction CSV is read
        noise_type: only 'phase' is accepted
        reaction_time: if True, keep only rows whose classification time exceeds the cutoff
            and additionally return the classification times

    Returns:
        (y_true, y_h, model_probs), plus the reaction times as a fourth item when
        ``reaction_time`` is True.
    """
    assert noise_type in ['phase'], 'Invalid noise type'
    assert noise_level in [80, 95, 110, 125], 'Invalid noise level'
    assert model_name in ['alexnet', 'densenet161', 'googlenet', 'resnet152', 'vgg19'], 'Invalid model name'

    image_labels = ['airplane', 'bear', 'bicycle', 'bird', 'boat', 'bottle', 'car', 'cat', 'chair',
                    'clock', 'dog', 'elephant', 'keyboard', 'knife', 'oven', 'truck']
    image_labels_numeric = np.arange(0, 16)

    data_dir = './data/old_noisy_imagenet_data'
    epoch_suffix = {None: 'baseline.csv',
                    0: 'epoch00.csv',
                    1: 'epoch01.csv',
                    10: 'epoch10.csv'}
    human_csv = data_dir + '/human_only_classification_6per_img_export.csv'
    model_csv = data_dir + '/imagenet_0016_category_phase_noise_all_predictions_' + epoch_suffix[n_epochs]

    # Human labels: numeric class ids, requested noise level, relevant columns only
    human_data = pd.read_csv(human_csv)
    human_data.replace(to_replace=image_labels, value=image_labels_numeric, inplace=True)
    human_data = human_data[human_data['noise_level'] == noise_level]
    kept_columns = ['participant_id', 'image_name', 'image_category', 'participant_classification']
    if reaction_time:
        kept_columns.append('classification_time')
    human_data = human_data[kept_columns]
    if reaction_time:
        arbitrary_reaction_time_cutoff = 650  # 650 ms
        human_data = human_data[human_data['classification_time'] > arbitrary_reaction_time_cutoff]

    # Model predictions for the requested noise type / level / model
    model_data = pd.read_csv(model_csv)
    model_data.replace(to_replace=image_labels, value=image_labels_numeric, inplace=True)
    selected = ((model_data['noise_type'] == noise_type) &
                (model_data['noise_level'] == noise_level) &
                (model_data['model_name'] == model_name))
    model_data = model_data[selected]
    model_data = model_data.drop(columns=['noise_type', 'noise_level', 'model_name', 'correct', 'category'])

    # Merge based on input image
    dataset = pd.merge(human_data, model_data, on='image_name', how='left')

    # Map to numpy
    human_ids = dataset['participant_id'].to_numpy(dtype=int)
    y_h = dataset['participant_classification'].to_numpy(dtype=int)
    y_true = dataset['image_category'].to_numpy(dtype=int)
    # Model outputs in label order, i.e. model_probs[0][0] corresponds to 'airplane',
    # with each row normalized by its sum
    model_probs = dataset[image_labels].to_numpy(dtype=float)
    model_probs /= model_probs.sum(axis=1, keepdims=True)

    if not reaction_time:
        return y_true, y_h, model_probs
    times = dataset['classification_time'].to_numpy(dtype=float)
    return y_true, y_h, model_probs, times
132 |
133 |
def load_noisy_imagenet(model_name, noise_level, model_acc_level):
    """ Loads noisy-ImageNet human labels merged with one model's predictions.

    Paths are relative to the current working directory.

    Args:
        model_name: 'vgg19' or 'googlenet' (anything else raises NotImplementedError)
        noise_level: one of 80, 95, 110, 125
        model_acc_level: 'low', 'med' or 'high' (case-insensitive)

    Returns:
        (y_true, y_h, model_probs)
    """
    model_acc_level = model_acc_level.lower()
    assert model_acc_level in ['low', 'med', 'high']
    assert noise_level in [80, 95, 110, 125]

    # Name of the model variant stored in the CSV for each accuracy level
    variants_by_model = {
        'vgg19': {'low': 'vgg19_01', 'med': 'vgg19_06', 'high': 'vgg19_48'},
        'googlenet': {'low': 'googlenet_01', 'med': 'googlenet_06', 'high': 'googlenet_47'},
    }
    if model_name not in variants_by_model:
        raise NotImplementedError
    model_name_dict = variants_by_model[model_name]

    # Read data CSVs
    data_model = pd.read_csv(f'./data/noisy_imagenet/{model_name}.csv')
    data_human = pd.read_csv('./data/noisy_imagenet/human_only_classification_6per_img_export.csv')

    image_labels = ['airplane', 'bear', 'bicycle', 'bird', 'boat', 'bottle', 'car', 'cat', 'chair',
                    'clock', 'dog', 'elephant', 'keyboard', 'knife', 'oven', 'truck']
    image_labels_numeric = np.arange(0, 16)

    # Human side: numeric labels, requested noise level, relevant columns
    data_human.replace(to_replace=image_labels, value=image_labels_numeric, inplace=True)
    data_human = data_human[data_human['noise_level'] == noise_level]
    data_human = data_human[['image_name', 'image_category', 'participant_classification']]

    # Model side: class columns renamed to numeric ids, requested noise level and variant
    data_model.rename(columns=dict(zip(image_labels, image_labels_numeric)), inplace=True)
    wanted_rows = ((data_model['noise_level'] == noise_level) &
                   (data_model['model_name'] == model_name_dict[model_acc_level]))
    data_model = data_model[wanted_rows]
    data_model = data_model[['image_name'] + image_labels_numeric.tolist()]

    # Left-merge on the shared columns, then drop the image name
    dataset = pd.merge(data_human, data_model, how='left')
    dataset.drop(columns=['image_name'], inplace=True)

    y_true = dataset['image_category'].to_numpy().astype(int)
    y_h = dataset['participant_classification'].to_numpy().astype(int)
    model_probs = dataset[image_labels_numeric].to_numpy()

    return y_true, y_h, model_probs
178 |
179 |
def load_noisy_imagenet_logits(noise_level, model_acc_level):
    """ Loads noisy-ImageNet human labels merged with the VGG-19 values from vgg19_logits.csv.

    Paths are relative to the current working directory.

    Args:
        noise_level: one of 80, 95, 110, 125
        model_acc_level: 'low', 'med' or 'high' (case-insensitive)

    Returns:
        (y_true, y_h, model_outputs) where model_outputs holds the 16 class columns
        read from the logits CSV.
    """
    model_acc_level = model_acc_level.lower()
    assert model_acc_level in ['low', 'med', 'high']
    assert noise_level in [80, 95, 110, 125]
    variant_by_level = {'low': 'vgg19_01',
                        'med': 'vgg19_06',
                        'high': 'vgg19_48'}

    # Read data CSVs
    data_model = pd.read_csv('./data/noisy_imagenet/vgg19_logits.csv')
    data_human = pd.read_csv('./data/noisy_imagenet/human_only_classification_6per_img_export.csv')

    image_labels = ['airplane', 'bear', 'bicycle', 'bird', 'boat', 'bottle', 'car', 'cat', 'chair',
                    'clock', 'dog', 'elephant', 'keyboard', 'knife', 'oven', 'truck']
    image_labels_numeric = np.arange(0, 16)

    # Human side: numeric labels, requested noise level, relevant columns
    data_human.replace(to_replace=image_labels, value=image_labels_numeric, inplace=True)
    data_human = data_human[data_human['noise_level'] == noise_level]
    data_human = data_human[['image_name', 'image_category', 'participant_classification']]

    # Model side: class columns renamed to numeric ids, requested noise level and variant
    data_model.rename(columns=dict(zip(image_labels, image_labels_numeric)), inplace=True)
    wanted_rows = ((data_model['noise_level'] == noise_level) &
                   (data_model['model_name'] == variant_by_level[model_acc_level]))
    data_model = data_model[wanted_rows]
    data_model = data_model[['image_name'] + image_labels_numeric.tolist()]

    # Left-merge on the shared columns, then drop the image name
    dataset = pd.merge(data_human, data_model, how='left')
    dataset.drop(columns=['image_name'], inplace=True)

    y_true = dataset['image_category'].to_numpy().astype(int)
    y_h = dataset['participant_classification'].to_numpy().astype(int)
    model_outputs = dataset[image_labels_numeric].to_numpy()

    return y_true, y_h, model_outputs
217 |
218 |
def load_noisy_imagenet_individual(model_name, noise_level, model_acc_level):
    """ Loads the noisy-ImageNet data grouped by individual participant.

    Args:
        model_name: 'vgg19' or 'googlenet' (anything else raises NotImplementedError)
        noise_level: one of 80, 95, 110, 125
        model_acc_level: 'low', 'med' or 'high' (case-insensitive)

    Returns:
        dict keyed by 0..n_participants-1 (in order of first appearance) ; each value is a
        dict with keys 'y_h', 'model_probs', 'y_m' and 'y_true' restricted to the images
        that participant labeled.
    """
    base_dir = os.path.dirname(__file__)

    model_acc_level = model_acc_level.lower()
    assert model_acc_level in ['low', 'med', 'high']
    assert noise_level in [80, 95, 110, 125]

    # Name of the model variant stored in the CSV for each accuracy level
    variants_by_model = {
        'vgg19': {'low': 'vgg19_01', 'med': 'vgg19_06', 'high': 'vgg19_48'},
        'googlenet': {'low': 'googlenet_01', 'med': 'googlenet_06', 'high': 'googlenet_47'},
    }
    if model_name not in variants_by_model:
        raise NotImplementedError
    model_name_dict = variants_by_model[model_name]

    # Read data CSVs
    data_model = pd.read_csv(os.path.join(base_dir, f'./data/noisy_imagenet/{model_name}.csv'))
    data_human = pd.read_csv(
        os.path.join(base_dir, './data/noisy_imagenet/human_only_classification_6per_img_export.csv'))

    image_labels = ['airplane', 'bear', 'bicycle', 'bird', 'boat', 'bottle', 'car', 'cat', 'chair',
                    'clock', 'dog', 'elephant', 'keyboard', 'knife', 'oven', 'truck']
    image_labels_numeric = np.arange(0, 16)

    # Human side: numeric labels, requested noise level (all columns are kept)
    data_human.replace(to_replace=image_labels, value=image_labels_numeric, inplace=True)
    data_human = data_human[data_human['noise_level'] == noise_level]

    # Model side: class columns renamed to numeric ids, requested noise level and variant
    data_model.rename(columns=dict(zip(image_labels, image_labels_numeric)), inplace=True)
    wanted_rows = ((data_model['noise_level'] == noise_level) &
                   (data_model['model_name'] == model_name_dict[model_acc_level]))
    data_model = data_model[wanted_rows]
    data_model = data_model[['image_name'] + image_labels_numeric.tolist()]

    # ----- Per-participant data
    participant_ids = data_human['participant_id'].unique()
    individual_level_data = dict.fromkeys(np.arange(participant_ids.size))
    for position, participant_id in enumerate(participant_ids):
        rows = data_human[data_human['participant_id'] == participant_id]
        image_names = rows['image_name'].to_numpy()

        # Look up the model output for every image this participant labeled ;
        # the first matching model row is used.
        probs_for_rows = np.empty(shape=(image_names.size, 16))
        for row_number, image_name in enumerate(image_names):
            matches = data_model[data_model['image_name'] == image_name]
            probs_for_rows[row_number] = matches[image_labels_numeric].to_numpy()[0]

        individual_level_data[position] = {
            'y_h': rows['participant_classification'].to_numpy(),
            'model_probs': probs_for_rows,
            'y_m': np.argmax(probs_for_rows, axis=1),
            'y_true': rows['image_category'].to_numpy(),
        }

    return individual_level_data
282 |
--------------------------------------------------------------------------------
/ensemble_ts.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | @author: Jize Zhang
5 | See : https://github.com/zhang64-llnl/Mix-n-Match-Calibration/blob/master/util_calibration.py
6 | """
7 |
8 | import numpy as np
9 | from scipy import optimize
10 | from sklearn.isotonic import IsotonicRegression
11 |
12 | """
13 | auxiliary functions for optimizing the temperature (scaling approaches) and weights of ensembles
14 | *args include logits and labels from the calibration dataset:
15 | """
16 |
17 |
def mse_t(t, *args):
    """Mean squared error between tempered softmax probabilities and labels.

    Objective used to find the optimal temperature with an MSE loss.

    Args:
        t: temperature (scalar or length-1 array) dividing the logits.
        *args: ``(logit, label)`` -- 2-D logits and a label array of the same shape.

    Returns:
        Mean of the squared differences over all entries.
    """
    logit, label = args
    scaled = logit / t
    # Subtract the row maximum before exponentiating: softmax is unchanged, but
    # exp() can no longer overflow to inf (which would give inf / inf = nan).
    scaled = scaled - np.max(scaled, axis=1, keepdims=True)
    exp_scaled = np.exp(scaled)
    p = exp_scaled / np.sum(exp_scaled, axis=1, keepdims=True)
    return np.mean((p - label) ** 2)
27 |
28 |
def ll_t(t, *args):
    """Cross-entropy between tempered softmax probabilities and labels.

    Objective used to find the optimal temperature with a cross-entropy loss.

    Args:
        t: temperature (scalar or length-1 array) dividing the logits.
        *args: ``(logit, label)`` -- 2-D logits and a label array of the same shape.

    Returns:
        Cross-entropy averaged over the rows.
    """
    logit, label = args
    scaled = logit / t
    # Subtract the row maximum before exponentiating: softmax is unchanged, but
    # exp() can no longer overflow to inf (which would give inf / inf = nan).
    scaled = scaled - np.max(scaled, axis=1, keepdims=True)
    exp_scaled = np.exp(scaled)
    p = exp_scaled / np.sum(exp_scaled, axis=1, keepdims=True)
    # Keep probabilities away from zero so the log stays finite.
    p = np.clip(p, 1e-20, 1 - 1e-20)
    n_rows = p.shape[0]
    return -np.sum(label * np.log(p)) / n_rows
39 |
40 |
def mse_w(w, *args):
    """Mean squared error of a weighted three-component mixture against labels.

    Objective used to find the optimal ensemble weights with an MSE loss.

    Args:
        w: three mixture weights.
        *args: ``(p0, p1, p2, label)`` -- three probability arrays and the labels.
    """
    p0, p1, p2, label = args
    mixture = w[0] * p0 + w[1] * p1 + w[2] * p2
    # Renormalize each row of the mixture
    mixture = mixture / mixture.sum(axis=1, keepdims=True)
    return np.mean(np.square(mixture - label))
49 |
50 |
def ll_w(w, *args):
    """Cross-entropy of a weighted three-component mixture against labels.

    Objective used to find the optimal ensemble weights with a cross-entropy loss.

    Args:
        w: three mixture weights.
        *args: ``(p0, p1, p2, label)`` -- three probability arrays and the labels.

    Returns:
        Cross-entropy averaged over the rows.
    """
    p0, p1, p2, label = args
    p = w[0] * p0 + w[1] * p1 + w[2] * p2
    # Clip as ll_t does: an exact zero would give log(0) = -inf, and
    # 0 * -inf = nan would poison the whole sum.
    p = np.clip(p, 1e-20, 1 - 1e-20)
    n_rows = p.shape[0]
    return -np.sum(label * np.log(p)) / n_rows
59 |
60 |
61 | ##### Fitting Temperature Scaling
def temperature_scaling(logit, label, loss):
    """Fit a single temperature by bounded minimization of the chosen loss.

    Args:
        logit: 2-D array of logits from the calibration set.
        label: label array of the same shape as ``logit``.
        loss: ``'ce'`` (cross-entropy) or ``'mse'`` (mean squared error).

    Returns:
        The optimizer's solution array ``x`` (length 1) holding the temperature.

    Raises:
        ValueError: if ``loss`` is neither ``'ce'`` nor ``'mse'``.
    """
    # Reject unknown losses up front; otherwise no branch assigns the result
    # and the function fails with an UnboundLocalError.
    if loss not in ('ce', 'mse'):
        raise ValueError("loss must be 'ce' or 'mse', got {!r}".format(loss))

    objective = ll_t if loss == 'ce' else mse_t
    bnds = ((0.05, 5.0),)  # Search range for the temperature
    result = optimize.minimize(objective, 1.0, args=(logit, label), method='L-BFGS-B', bounds=bnds, tol=1e-12,
                               options={'disp': False})
    return result.x
72 |
73 |
##### Fitting Ensemble Temperature Scaling
def ensemble_scaling(logit, label, loss, t, n_class):
    """Fit the three mixture weights of ensemble temperature scaling.

    The mixture components are the temperature-scaled softmax (``p0``), the
    unscaled softmax (``p1``) and the uniform distribution ``1 / n_class``
    (``p2``). Weights are optimised with SLSQP from ``(1.0, 0.0, 0.0)``,
    each bounded to ``[0, 1]`` and constrained to sum to one.

    Args:
        logit: Calibration logits, indexed as a 2-D array (rows = samples).
        label: Calibration labels, forwarded to the objective.
        loss: ``'ce'`` (``ll_w``) or ``'mse'`` (``mse_w``).
        t: Temperature used for the scaled component.
        n_class: Number of classes, used for the uniform component.

    Returns:
        The optimiser's solution ``x`` (array of the three weights).

    Raises:
        ValueError: If ``loss`` is neither ``'ce'`` nor ``'mse'``.
    """
    if loss == 'ce':
        objective = ll_w
    elif loss == 'mse':
        objective = mse_w
    else:
        # Previously an unknown loss fell through both branches and failed
        # later with an UnboundLocalError on the result variable.
        raise ValueError("Unknown loss {!r}; expected 'ce' or 'mse'".format(loss))

    def _softmax(z):
        # Row-wise softmax; the max shift prevents overflow in np.exp.
        z = z - np.max(z, axis=1, keepdims=True)
        e = np.exp(z)
        return e / np.sum(e, axis=1, keepdims=True)

    p1 = _softmax(logit)
    p0 = _softmax(logit / t)
    p2 = np.ones_like(p0) / n_class

    bnds_w = ((0.0, 1.0), (0.0, 1.0), (0.0, 1.0),)

    def my_constraint_fun(x):
        # Equality constraint: the weights must sum to one.
        return np.sum(x) - 1

    constraints = {"type": "eq", "fun": my_constraint_fun, }
    result = optimize.minimize(objective, (1.0, 0.0, 0.0), args=(p0, p1, p2, label),
                               method='SLSQP', constraints=constraints,
                               bounds=bnds_w, tol=1e-12, options={'disp': False})
    return result.x
95 |
96 |
97 | """
98 | Calibration:
99 | Input: uncalibrated logits, temperature (and weight)
100 | Output: calibrated prediction probabilities
101 | """
102 |
103 |
##### Calibration: Temperature Scaling (loss selected by the `loss` argument)
def ts_calibrate(logit, label, logit_eval, loss):
    """Fit a temperature on calibration data and apply it to new logits.

    Args:
        logit: Calibration logits used to fit the temperature.
        label: Calibration labels used to fit the temperature.
        logit_eval: Logits to calibrate; indexed as a 2-D array.
        loss: Loss name forwarded to ``temperature_scaling``.

    Returns:
        Row-wise softmax of ``logit_eval / t``.
    """
    temperature = temperature_scaling(logit, label, loss)
    print("temperature = " + str(temperature))
    exp_scaled = np.exp(logit_eval / temperature)
    return exp_scaled / np.sum(exp_scaled, 1)[:, None]
111 |
112 |
113 | ##### Calibration: Ensemble Temperature Scaling
def ets_calibrate(logit, label, n_class, loss='mse'):
    """Fit ensemble temperature scaling and return its parameters.

    Args:
        logit: Calibration logits.
        label: Calibration labels.
        n_class: Number of classes (for the uniform mixture component).
        loss: Accepted but not used: both the temperature and the weights
            are always fitted with ``'mse'``.

    Returns:
        ``(t, w)``: the fitted temperature and the three mixture weights.
        To calibrate new logits ``z`` with them::

            p = w[0] * softmax(z / t) + w[1] * softmax(z) + w[2] / n_class
    """
    fit_loss = 'mse'  # could be switched to 'ce'
    temperature = temperature_scaling(logit, label, loss=fit_loss)
    weights = ensemble_scaling(logit, label, fit_loss, temperature, n_class)
    return temperature, weights
130 |
131 | ##### Calibration: Isotonic Regression (Multi-class)
def mir_calibrate(logit, label, logit_eval):
    """Calibrate with one isotonic regression shared across all classes.

    The softmax probabilities of the calibration set are flattened together
    with ``label`` to fit a single ``IsotonicRegression``; the fitted map is
    then applied to the flattened evaluation probabilities.

    Args:
        logit: Calibration logits (2-D, rows = samples).
        label: Calibration labels with a ``flatten`` method whose flattened
            length matches the flattened probabilities.
        logit_eval: Logits to calibrate (2-D array).

    Returns:
        Array shaped like ``logit_eval``: the isotonic predictions plus
        ``1e-9`` times the uncalibrated evaluation probabilities.
    """
    def _softmax(z):
        e = np.exp(z)
        return e / np.sum(e, 1)[:, None]

    probs_fit = _softmax(logit)
    probs_eval = _softmax(logit_eval)

    iso = IsotonicRegression(out_of_bounds='clip')
    iso.fit_transform(probs_fit.flatten(), (label.flatten()))
    flat_pred = iso.predict(probs_eval.flatten())

    return flat_pred.reshape(logit_eval.shape) + 1e-9 * probs_eval
141 |
142 |
def irova_calibrate(logit, label, logit_eval):
    """Calibrate with one isotonic regression per class (one-vs-all).

    For each class column, an ``IsotonicRegression`` is fitted on the
    calibration softmax probabilities against the matching ``label`` column
    and applied to the evaluation probabilities of that column.

    Args:
        logit: Calibration logits (2-D, rows = samples).
        label: Calibration labels, indexed column-wise as ``label[:, k]``.
        logit_eval: Logits to calibrate (2-D).

    Returns:
        Array of per-class isotonic predictions plus ``1e-9`` times the
        uncalibrated evaluation probabilities. Rows are not renormalised.
    """
    p = np.exp(logit) / np.sum(np.exp(logit), 1)[:, None]
    p_eval = np.exp(logit_eval) / np.sum(np.exp(logit_eval), 1)[:, None]

    for ii in range(p_eval.shape[1]):
        ir = IsotonicRegression(out_of_bounds='clip')
        # Only the fitted model is needed; the transformed training values
        # that fit_transform returned were never used.
        ir.fit(p[:, ii], label[:, ii])
        p_eval[:, ii] = ir.predict(p_eval[:, ii]) + 1e-9 * p_eval[:, ii]
    return p_eval
--------------------------------------------------------------------------------
/experiments/ablation_experiment_v2.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.insert(0, '../')
3 |
4 | from data_utils import *
5 | from utils import *
6 | from combination_methods import *
7 | from tqdm.auto import tqdm
8 | from sklearn.model_selection import train_test_split
9 | import calibration as cal
10 | import csv
11 | import numpy as np
12 | import os
13 | from imax_calib.evaluations import calibration_metrics as cal_metrics
14 | from scipy.special import softmax
15 |
16 | # note: this experiment does not appear in our paper and may contain outdated code.
17 |
18 |
def get_cw_ECE(probs, y_true):
    """Return the ``'cw_mECE'`` entry computed by ``imax_calib`` for `probs`.

    Only the ``'mECE'`` approximator is requested from
    ``compute_top_1_and_CW_ECEs``.
    """
    metrics = cal_metrics.compute_top_1_and_CW_ECEs(
        probs, y_true, list_approximators=['mECE'])
    return metrics['cw_mECE']
22 |
23 |
def run_experiment_cifar10(out_fpath=None):
    """Run the combiner ablation on CIFAR-10H and write one CSV per model.

    For each model, 25 random 80/20 train/test splits are drawn. On each
    split four combiners are fitted (oracle with temperature scaling, oracle
    with ensemble temperature scaling, oracle without calibration, and the
    double-confusion combiner). Accuracy and the calibration error
    (``p=1, debias=False, mode='marginal'``) are appended as one CSV row.

    Args:
        out_fpath: Prefix prepended to ``'<model>_ablation.csv'``. The target
            file must not exist yet.
    """
    assert out_fpath is not None, 'Must specify output filepath'
    model_names = ['r_low_acc', 'resnet-110', 'preresnet-110', 'densenet-bc-L190-k40']
    test_size = 0.2
    n_runs = 25

    # (column suffix, factory), listed in fitting order. Factories keep each
    # combiner's construction right before its own fit.
    combiner_specs = (
        ('TS', lambda: OracleCombiner(calibration_method='temperature scaling')),
        ('ETS', lambda: OracleCombiner(calibration_method='ensemble temperature scaling')),
        ('nocal', lambda: OracleCombiner(calibration_method=None)),
        ('doubleconf', lambda: DoubleConfusionCombiner()),
    )
    combo_keys = [key for key, _ in combiner_specs]
    calibrated_keys = ['TS', 'ETS']  # combiners whose calibrated model output is scored
    header = (['trial', 'acc_h', 'acc_m']
              + [f'acc_comb_{key}' for key in combo_keys]
              + ['ce_m'] + [f'ce_m_{key}' for key in calibrated_keys]
              + [f'ce_combo_{key}' for key in combo_keys])

    for model_name in tqdm(model_names, desc='Models', leave=True):
        output_file = out_fpath + f'{model_name}_ablation.csv'
        assert not os.path.exists(output_file), 'Output filepath already exists'
        # Create the CSV and write its header row.
        with open(output_file, 'a', newline='') as f:
            csv.writer(f).writerow(header)

        human_counts, model_probs, y_true = load_CIFAR10H(model_name)
        y_h = simulate_single_human(human_counts)
        for i in tqdm(range(n_runs), leave=False, desc='Runs'):
            # The run index doubles as the split's random state.
            (y_h_tr, y_h_te,
             model_probs_tr, model_probs_te,
             y_true_tr, y_true_te) = train_test_split(
                y_h, model_probs, y_true, test_size=test_size, random_state=i)

            def marginal_ce(probs):
                return cal.get_calibration_error(probs, y_true_te,
                                                 p=1, debias=False, mode='marginal')

            fitted, comb_probs, comb_labels = {}, {}, {}
            for key, make_combiner in combiner_specs:
                combiner = make_combiner()
                combiner.fit(model_probs_tr, y_h_tr, y_true_tr)
                comb_probs[key] = combiner.combine_proba(model_probs_te, y_h_te)
                comb_labels[key] = combiner.combine(model_probs_te, y_h_te)
                fitted[key] = combiner

            # Accuracies of the combinations, the human and the model alone.
            acc_comb = [np.mean(comb_labels[key] == y_true_te) for key in combo_keys]
            acc_h_te = np.mean(y_h_te == y_true_te)
            acc_m_te = np.mean(np.argmax(model_probs_te, axis=1) == y_true_te)

            # Calibration: raw model, calibrated model, then combinations.
            ce_model = [marginal_ce(model_probs_te)]
            ce_model += [marginal_ce(fitted[key].calibrate(model_probs_te))
                         for key in calibrated_keys]
            ce_combo = [marginal_ce(comb_probs[key]) for key in combo_keys]

            with open(output_file, 'a', newline='') as f:
                csv.writer(f).writerow([i, acc_h_te, acc_m_te,
                                        *acc_comb, *ce_model, *ce_combo])
107 |
108 |
def run_experiment_noisy_imagenet(out_fpath=None):
    """Run the combiner ablation on noisy ImageNet (VGG-19 probabilities).

    For every (model accuracy level, noise level) pair, 25 random 80/20
    splits are drawn. Six combiners are fitted per split (oracle with TS,
    ETS, imax binning CW, imax binning sCW, no calibration, and the
    double-confusion combiner). Accuracy and the calibration error
    (``p=1, debias=False, mode='marginal'``) are appended as one CSV row.

    NOTE: this module defines ``run_experiment_noisy_imagenet`` a second
    time further down. The later definition replaces this one when the
    module is executed, so this version cannot be reached by name.

    Args:
        out_fpath: Prefix prepended to
            ``'vgg19<level>_noise<noise>_ablation.csv'``. The target file
            must not exist yet.
    """
    assert out_fpath is not None, 'Must specify output filepath'
    model_acc_levels = ['low', 'med', 'high']
    noise_levels = [80, 95, 110, 125]
    test_size = 0.2
    n_runs = 25

    # (column suffix, factory), listed in fitting order, which is also the
    # column order of the accuracy and combination-CE blocks.
    combiner_specs = (
        ('TS', lambda: OracleCombiner(calibration_method='temperature scaling')),
        ('ETS', lambda: OracleCombiner(calibration_method='ensemble temperature scaling')),
        ('imax_CW', lambda: OracleCombiner(calibration_method='imax binning', mode='CW')),
        ('imax_sCW', lambda: OracleCombiner(calibration_method='imax binning', mode='sCW')),
        ('nocal', lambda: OracleCombiner(calibration_method=None)),
        ('doubleconf', lambda: DoubleConfusionCombiner()),
    )
    combo_keys = [key for key, _ in combiner_specs]
    calibrated_keys = ['TS', 'ETS', 'imax_CW', 'imax_sCW']
    # Order in which the combination CEs are evaluated (differs from the
    # column order above).
    combo_eval_order = ['TS', 'ETS', 'nocal', 'doubleconf', 'imax_CW', 'imax_sCW']
    header = (['trial', 'acc_h', 'acc_m']
              + [f'acc_comb_{key}' for key in combo_keys]
              + ['ce_m'] + [f'ce_m_{key}' for key in calibrated_keys]
              + [f'ce_combo_{key}' for key in combo_keys])

    for model_level in tqdm(model_acc_levels, desc='Models', leave=True):
        for noise_level in tqdm(noise_levels, desc='Noise Levels'):
            output_file = out_fpath + f'vgg19{model_level}_noise{noise_level}_ablation.csv'
            assert not os.path.exists(output_file), 'Output filepath already exists'
            # Create the CSV and write its header row.
            with open(output_file, 'a', newline='') as f:
                csv.writer(f).writerow(header)

            y_true, y_h, model_probs = load_noisy_imagenet(noise_level, model_level)
            for i in tqdm(range(n_runs), leave=False, desc='Runs'):
                # The run index doubles as the split's random state.
                (y_h_tr, y_h_te,
                 model_probs_tr, model_probs_te,
                 y_true_tr, y_true_te) = train_test_split(
                    y_h, model_probs, y_true, test_size=test_size, random_state=i)

                def marginal_ce(probs):
                    return cal.get_calibration_error(probs, y_true_te,
                                                     p=1, debias=False, mode='marginal')

                fitted, comb_probs, comb_labels = {}, {}, {}
                for key, make_combiner in combiner_specs:
                    combiner = make_combiner()
                    combiner.fit(model_probs_tr, y_h_tr, y_true_tr)
                    comb_probs[key] = combiner.combine_proba(model_probs_te, y_h_te)
                    comb_labels[key] = combiner.combine(model_probs_te, y_h_te)
                    fitted[key] = combiner

                # Accuracies of the combinations, the human and the model alone.
                acc_comb = [np.mean(comb_labels[key] == y_true_te) for key in combo_keys]
                acc_h_te = np.mean(y_h_te == y_true_te)
                acc_m_te = np.mean(np.argmax(model_probs_te, axis=1) == y_true_te)

                # Calibration: raw model, calibrated model, then combinations.
                ce_model = [marginal_ce(model_probs_te)]
                ce_model += [marginal_ce(fitted[key].calibrate(model_probs_te))
                             for key in calibrated_keys]
                ce_combo = {key: marginal_ce(comb_probs[key]) for key in combo_eval_order}

                with open(output_file, 'a', newline='') as f:
                    csv.writer(f).writerow([i, acc_h_te, acc_m_te,
                                            *acc_comb, *ce_model,
                                            *[ce_combo[key] for key in combo_keys]])
219 |
220 |
def run_experiment_noisy_imagenet_logit(out_fpath=None):
    """Run the combiner ablation on noisy ImageNet starting from model logits.

    Model probabilities are the softmax of the loaded logits. For every
    (model accuracy level, noise level) pair, 25 random 80/20 splits are
    drawn. Seven combiners are fitted per split (oracle with TS, ETS, imax
    binning CW/sCW with 20 bins, Dirichlet, no calibration, and the
    double-confusion combiner). Accuracy and the class-wise ECE from
    ``get_cw_ECE`` are appended as one CSV row.

    NOTE(review): the temperature-scaling combiner is fitted without
    ``model_logits`` while the other oracle combiners receive it -- confirm
    this is intended.
    NOTE(review): the output filename pattern is the same one used by the
    first ``run_experiment_noisy_imagenet`` in this module, so the two
    cannot share an output prefix.

    Args:
        out_fpath: Prefix prepended to
            ``'vgg19<level>_noise<noise>_ablation.csv'``. The target file
            must not exist yet.
    """
    assert out_fpath is not None, 'Must specify output filepath'
    model_acc_levels = ['low', 'med', 'high']
    noise_levels = [80, 95, 110, 125]
    test_size = 0.2
    n_runs = 25

    # (column suffix, factory, pass model_logits to fit?), in fitting order,
    # which is also the column order of the accuracy and combination blocks.
    combiner_specs = (
        ('TS', lambda: OracleCombiner(calibration_method='temperature scaling'), False),
        ('ETS', lambda: OracleCombiner(calibration_method='ensemble temperature scaling'), True),
        ('imax_CW',
         lambda: OracleCombiner(calibration_method='imax binning', mode='CW', num_bins=20), True),
        ('imax_sCW',
         lambda: OracleCombiner(calibration_method='imax binning', mode='sCW', num_bins=20), True),
        ('dirichlet', lambda: OracleCombiner(calibration_method='dirichlet'), True),
        ('nocal', lambda: OracleCombiner(calibration_method=None), True),
        ('doubleconf', lambda: DoubleConfusionCombiner(), False),
    )
    combo_keys = [key for key, _, _ in combiner_specs]
    calibrated_keys = ['TS', 'ETS', 'imax_CW', 'imax_sCW', 'dirichlet']
    # Order in which the combination ECEs are evaluated (differs from the
    # column order above).
    combo_eval_order = ['doubleconf', 'nocal', 'TS', 'ETS',
                        'imax_CW', 'imax_sCW', 'dirichlet']
    header = (['trial', 'acc_h', 'acc_m']
              + [f'acc_comb_{key}' for key in combo_keys]
              + ['ce_m'] + [f'ce_m_{key}' for key in calibrated_keys]
              + [f'ce_combo_{key}' for key in combo_keys])

    for model_level in tqdm(model_acc_levels, desc='Models', leave=True):
        for noise_level in tqdm(noise_levels, desc='Noise Levels'):
            output_file = out_fpath + f'vgg19{model_level}_noise{noise_level}_ablation.csv'
            assert not os.path.exists(output_file), 'Output filepath already exists'
            # Create the CSV and write its header row.
            with open(output_file, 'a', newline='') as f:
                csv.writer(f).writerow(header)

            y_true, y_h, model_logits = load_noisy_imagenet_logits(noise_level, model_level)
            model_probs = softmax(model_logits, axis=1)
            for i in tqdm(range(n_runs), leave=False, desc='Runs'):
                # The run index doubles as the split's random state.
                (y_h_tr, y_h_te,
                 model_logits_tr, model_logits_te,
                 model_probs_tr, model_probs_te,
                 y_true_tr, y_true_te) = train_test_split(
                    y_h, model_logits, model_probs, y_true,
                    test_size=test_size, random_state=i)

                fitted, comb_probs, comb_labels = {}, {}, {}
                for key, make_combiner, uses_logits in combiner_specs:
                    combiner = make_combiner()
                    fit_kwargs = {'model_logits': model_logits_tr} if uses_logits else {}
                    combiner.fit(model_probs_tr, y_h_tr, y_true_tr, **fit_kwargs)
                    comb_probs[key] = combiner.combine_proba(model_probs_te, y_h_te)
                    comb_labels[key] = combiner.combine(model_probs_te, y_h_te)
                    fitted[key] = combiner

                # Accuracies of the combinations, the human and the model alone.
                acc_comb = [np.mean(comb_labels[key] == y_true_te) for key in combo_keys]
                acc_h_te = np.mean(y_h_te == y_true_te)
                acc_m_te = np.mean(np.argmax(model_probs_te, axis=1) == y_true_te)

                # Calibration of the model alone: raw, then each calibrator.
                ce_model = [get_cw_ECE(model_probs_te, y_true_te)]
                ce_model += [get_cw_ECE(fitted[key].calibrate(model_probs_te), y_true_te)
                             for key in calibrated_keys]

                # Calibration of the combinations.
                ce_combo = {key: get_cw_ECE(comb_probs[key], y_true_te)
                            for key in combo_eval_order}

                with open(output_file, 'a', newline='') as f:
                    csv.writer(f).writerow([i, acc_h_te, acc_m_te,
                                            *acc_comb, *ce_model,
                                            *[ce_combo[key] for key in combo_keys]])
333 |
334 |
def run_experiment_noisy_imagenet(out_fpath=None):
    """Run the combiner ablation on the older noisy-ImageNet data (densenet161).

    For every (epoch setting, noise level) pair, 10 random 80/20 splits are
    drawn. Six combiners are fitted per split (oracle with TS, ETS, imax
    binning CW, imax binning sCW, no calibration, and the double-confusion
    combiner). Accuracy and the calibration error
    (``p=1, debias=False, mode='marginal'``) are appended as one CSV row.

    The "file already exists" check is disabled here and the file is opened
    in append mode, so re-running with the same prefix appends another
    header row plus new result rows to the existing CSV.

    Args:
        out_fpath: Prefix prepended to
            ``'densenet161_epoch<epochs>_noise<noise>_ablation.csv'``.
    """
    assert out_fpath is not None, 'Must specify output filepath'
    model_name = 'densenet161'
    model_acc_levels = [None, 0, 10]
    noise_levels = [80, 95, 110, 125]
    test_size = 0.2
    n_runs = 10

    # (column suffix, factory), listed in fitting order, which is also the
    # column order of the accuracy and combination-CE blocks.
    combiner_specs = (
        ('TS', lambda: OracleCombiner(calibration_method='temperature scaling')),
        ('ETS', lambda: OracleCombiner(calibration_method='ensemble temperature scaling')),
        ('imax_CW', lambda: OracleCombiner(calibration_method='imax binning', mode='CW')),
        ('imax_sCW', lambda: OracleCombiner(calibration_method='imax binning', mode='sCW')),
        ('nocal', lambda: OracleCombiner(calibration_method=None)),
        ('doubleconf', lambda: DoubleConfusionCombiner()),
    )
    combo_keys = [key for key, _ in combiner_specs]
    calibrated_keys = ['TS', 'ETS', 'imax_CW', 'imax_sCW']
    # Order in which the combination CEs are evaluated (differs from the
    # column order above).
    combo_eval_order = ['TS', 'ETS', 'nocal', 'doubleconf', 'imax_CW', 'imax_sCW']
    header = (['trial', 'acc_h', 'acc_m']
              + [f'acc_comb_{key}' for key in combo_keys]
              + ['ce_m'] + [f'ce_m_{key}' for key in calibrated_keys]
              + [f'ce_combo_{key}' for key in combo_keys])

    for epochs in tqdm(model_acc_levels, desc='Models', leave=True):
        for noise_level in tqdm(noise_levels, desc='Noise Levels'):
            output_file = out_fpath + f'{model_name}_epoch{epochs}_noise{noise_level}_ablation.csv'
            # Append the header row (no existence check for this experiment).
            with open(output_file, 'a', newline='') as f:
                csv.writer(f).writerow(header)

            y_true, y_h, model_probs = load_old_noisy_imagenet_data(
                noise_level, model_name, n_epochs=epochs)
            for i in tqdm(range(n_runs), leave=False, desc='Runs'):
                # The run index doubles as the split's random state.
                (y_h_tr, y_h_te,
                 model_probs_tr, model_probs_te,
                 y_true_tr, y_true_te) = train_test_split(
                    y_h, model_probs, y_true, test_size=test_size, random_state=i)

                def marginal_ce(probs):
                    return cal.get_calibration_error(probs, y_true_te,
                                                     p=1, debias=False, mode='marginal')

                fitted, comb_probs, comb_labels = {}, {}, {}
                for key, make_combiner in combiner_specs:
                    combiner = make_combiner()
                    combiner.fit(model_probs_tr, y_h_tr, y_true_tr)
                    comb_probs[key] = combiner.combine_proba(model_probs_te, y_h_te)
                    comb_labels[key] = combiner.combine(model_probs_te, y_h_te)
                    fitted[key] = combiner

                # Accuracies of the combinations, the human and the model alone.
                acc_comb = [np.mean(comb_labels[key] == y_true_te) for key in combo_keys]
                acc_h_te = np.mean(y_h_te == y_true_te)
                acc_m_te = np.mean(np.argmax(model_probs_te, axis=1) == y_true_te)

                # Calibration: raw model, calibrated model, then combinations.
                ce_model = [marginal_ce(model_probs_te)]
                ce_model += [marginal_ce(fitted[key].calibrate(model_probs_te))
                             for key in calibrated_keys]
                ce_combo = {key: marginal_ce(comb_probs[key]) for key in combo_eval_order}

                with open(output_file, 'a', newline='') as f:
                    csv.writer(f).writerow([i, acc_h_te, acc_m_te,
                                            *acc_comb, *ce_model,
                                            *[ce_combo[key] for key in combo_keys]])
457 |
458 |
if __name__ == '__main__':
    # Entry point: runs the noisy-ImageNet ablation and writes under ./output/.
    # To run the CIFAR-10H ablation instead, use:
    #     out_fpath = './output/cifar10h/'
    #     run_experiment_cifar10(out_fpath)

    out_fpath = './output/'
    run_experiment_noisy_imagenet(out_fpath)
467 |
--------------------------------------------------------------------------------
/experiments/calibrate_combo_experiment.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.insert(0, '../')
3 |
4 | from data_utils import *
5 | from utils import *
6 | from combination_methods import *
7 | from tqdm.auto import tqdm
8 | import torch
9 | from sklearn.model_selection import train_test_split
10 | from metrics import *
11 | import csv
12 | import numpy as np
13 | import os
14 | from calibrators import *
15 |
16 | # Generates the data for Table 2 (and Appendix D) in our paper.
17 |
18 |
def _run_experiment(y_h=None, model_probs=None, y_true=None, **kwargs):
    """Evaluate MAP combiners over repeated splits and write two CSV files.

    On each run the data is split into train/test, the training part is
    truncated to 5000 points, and two ``MAPOracleCombiner`` instances are
    fitted. The ``'uncal_MAP_CI'`` one has its calibrator temperature reset
    to 1 after fitting. The combination is then temperature-scaled again with
    ``TSCalibratorMAP`` and every calibration metric is evaluated on the
    calibrated model output, the combination, and the re-calibrated
    combination.

    Args:
        y_h: Human labels, one per sample.
        model_probs: Model probabilities, indexed as a 2-D array.
        y_true: Ground-truth labels, one per sample.
        **kwargs: Optional settings, each with a default:
            ``seed`` (0), ``n_runs`` (25), ``test_size`` (0.3),
            ``calibration_methods`` (``['none']``; used only to label the
            accuracy columns), ``calibration_metrics``
            (``{'ECE': get_ECE}``), ``output_file_acc`` (``'./acc.csv'``),
            ``output_file_calibration`` (``'./cal.csv'``).
            Unrecognised keys are ignored.
    """
    seed = kwargs.pop('seed', 0)
    n_runs = kwargs.pop('n_runs', 25)
    test_size = kwargs.pop('test_size', 0.3)
    calibration_methods = kwargs.pop('calibration_methods', ['none'])
    calibration_metrics = kwargs.pop('calibration_metrics', {'ECE': get_ECE})
    output_file_acc = kwargs.pop('output_file_acc', './acc.csv')
    output_file_calibration = kwargs.pop('output_file_calibration', './cal.csv')

    # Prior hyperparameters shared by both combiners.
    diag_acc, mu_beta, sigma_beta = 0.75, 0.5, 0.5
    combiner_names = ('MAP_CI', 'uncal_MAP_CI')
    max_train = 5000  # cap on the number of training points per run

    acc_data = []
    cal_data = []
    for i in tqdm(range(n_runs), leave=False, desc='Runs'):
        # `i * seed` is 0 for every run when seed is 0 (the default), which
        # made all runs reuse the same split. Fall back to the run index in
        # that case; non-zero seeds keep their original random states.
        split_state = i * seed if seed else i
        y_h_tr, y_h_te, model_probs_tr, model_probs_te, y_true_tr, y_true_te = train_test_split(
            y_h, model_probs, y_true, test_size=test_size, random_state=split_state)

        # Limit the training set to `max_train` datapoints.
        y_h_tr = y_h_tr[:max_train]
        model_probs_tr = model_probs_tr[:max_train, :]
        y_true_tr = y_true_tr[:max_train]

        acc_h = get_acc(y_h_te, y_true_te)
        acc_m = get_acc(np.argmax(model_probs_te, axis=1), y_true_te)

        run_acc = [acc_h, acc_m]
        run_cal = []
        combiners = {name: MAPOracleCombiner(diag_acc=diag_acc, mu_beta=mu_beta,
                                             sigma_beta=sigma_beta)
                     for name in combiner_names}
        for combiner_name, combiner in combiners.items():
            combiner.fit(model_probs_tr, y_h_tr, y_true_tr)
            if combiner_name == 'uncal_MAP_CI':
                # Hack: overwrite the fitted temperature with 1 to obtain the
                # uncalibrated variant of the same combiner.
                combiner.calibrator.temperature = 1

            y_comb_te = combiner.combine(model_probs_te, y_h_te)
            run_acc.append(get_acc(y_comb_te, y_true_te))

            model_probs_calibrated_te = combiner.calibrate(model_probs_te)
            y_comb_prob_te = combiner.combine_proba(model_probs_te, y_h_te)

            # ----- Calibrate the combination itself
            ts_calibrator = TSCalibratorMAP()
            comb_probs_tr = combiner.combine_proba(model_probs_tr, y_h_tr)
            # Clip before the log so zero probabilities stay finite.
            comb_logits_tr = np.log(np.clip(comb_probs_tr, 1e-50, 1))
            ts_calibrator.fit(comb_logits_tr, y_true_tr)
            # NOTE(review): fit() receives log-probabilities while calibrate()
            # receives probabilities -- confirm TSCalibratorMAP expects this.
            y_comb_prob_te_calibrated = ts_calibrator.calibrate(y_comb_prob_te)

            for metric, fxn in calibration_metrics.items():
                cal_m = fxn(model_probs_calibrated_te, y_true_te)
                cal_comb = fxn(y_comb_prob_te, y_true_te)
                cal_comb_calibrated = fxn(y_comb_prob_te_calibrated, y_true_te)
                run_cal.append([combiner_name, metric, cal_m, cal_comb, cal_comb_calibrated])

        acc_data += [run_acc]
        cal_data += run_cal

    # Each accuracy row holds one column per combiner. Keep the caller's
    # labels when they line up with the combiners, otherwise label the
    # columns with the combiner names so header and rows have equal width.
    if len(calibration_methods) == len(combiner_names):
        comb_labels = calibration_methods
    else:
        comb_labels = combiner_names

    # Save data to CSV
    header_acc = ['human', 'model'] + [f'comb {label}' for label in comb_labels]
    with open(output_file_acc, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(header_acc)
        writer.writerows(acc_data)
    header_cal = ['calibration method', 'metric', 'model', 'comb', 'comb (post cal)']
    with open(output_file_calibration, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(header_cal)
        writer.writerows(cal_data)
89 |
90 |
def run_experiment_cifar10(out_fpath=None, experiment_args=None, seed=0):
    """Run ``_run_experiment`` once per CIFAR-10H model.

    Args:
        out_fpath: String prefix for the output files. ``'<model>_accuracy.csv'`` and
            ``'<model>_calibration.csv'`` are appended to it by plain concatenation,
            so a directory prefix must end with a path separator.
        experiment_args: Keyword arguments forwarded to ``_run_experiment``. The mapping
            is copied for every model, so the caller's dict is not modified; ``None``
            is treated as an empty mapping.
        seed: Seed passed to ``simulate_single_human``.

    Raises:
        AssertionError: If one of the output files for a model already exists.
    """
    model_names = ['r_low_acc', 'resnet-110', 'preresnet-110', 'densenet-bc-L190-k40']
    for model_name in tqdm(model_names, desc='Models', leave=True):
        # Specify output files
        output_file_acc = out_fpath + f'{model_name}_accuracy.csv'
        output_file_calibration = out_fpath + f'{model_name}_calibration.csv'
        for output_file in (output_file_acc, output_file_calibration):
            # Raised explicitly instead of asserted so that the overwrite guard
            # is still active when Python runs with -O.
            if os.path.exists(output_file):
                raise AssertionError('Output filepath already exists')

        # The result files are opened with a plain open(..., 'w'), which does not
        # create missing directories; make sure the target directory exists up front.
        out_dir = os.path.dirname(output_file_acc)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        # Work on a copy so the caller's argument dict is left untouched.
        run_args = dict(experiment_args or {})
        run_args['output_file_acc'] = output_file_acc
        run_args['output_file_calibration'] = output_file_calibration

        # Load data
        human_counts, model_probs, y_true = load_CIFAR10H(model_name)
        y_h = simulate_single_human(human_counts, seed=seed)

        _run_experiment(y_h=y_h, model_probs=model_probs, y_true=y_true, **run_args)
107 |
108 |
def run_experiment_noisy_imagenet(out_fpath=None, experiment_args=None, seed=0):
    """Run ``_run_experiment`` for every (model, accuracy level, noise level) combination.

    Args:
        out_fpath: String prefix for the output files. The file names
            ``'<model>_n<noise>_l<level>_accuracy.csv'`` / ``'..._calibration.csv'`` are
            appended by plain concatenation, so a directory prefix must end with a
            path separator.
        experiment_args: Keyword arguments forwarded to ``_run_experiment``. The mapping
            is copied for every configuration, so the caller's dict is not modified;
            ``None`` is treated as an empty mapping.
        seed: Not used by this function; accepted so that the signature matches
            ``run_experiment_cifar10``.

    Raises:
        AssertionError: If one of the output files for a configuration already exists.
    """
    model_acc_levels = ['high']  # ['low', 'med', 'high']
    noise_levels = [80, 95, 110, 125]
    model_names = ['vgg19', 'googlenet']

    for model_name in model_names:
        for model_level in tqdm(model_acc_levels, desc='Models', leave=True):
            for noise_level in tqdm(noise_levels, desc='Noise Levels'):
                # Specify output files
                file_stem = out_fpath + f'{model_name}_n{noise_level}_l{model_level}'
                output_file_acc = file_stem + '_accuracy.csv'
                output_file_calibration = file_stem + '_calibration.csv'
                for output_file in (output_file_acc, output_file_calibration):
                    # Raised explicitly instead of asserted so that the overwrite
                    # guard is still active when Python runs with -O.
                    if os.path.exists(output_file):
                        raise AssertionError('Output filepath already exists')

                # The result files are opened with a plain open(..., 'w'), which does
                # not create missing directories; make sure the directory exists.
                out_dir = os.path.dirname(output_file_acc)
                if out_dir:
                    os.makedirs(out_dir, exist_ok=True)

                # Work on a copy so the caller's argument dict is left untouched.
                run_args = dict(experiment_args or {})
                run_args['output_file_acc'] = output_file_acc
                run_args['output_file_calibration'] = output_file_calibration

                # Load data
                y_true, y_h, model_probs = load_noisy_imagenet(model_name, noise_level, model_level)

                _run_experiment(y_h=y_h, model_probs=model_probs, y_true=y_true, **run_args)
129 |
130 |
if __name__ == '__main__':
    # Seed the torch and numpy random number generators.
    seed = 9658
    torch.manual_seed(seed)
    np.random.seed(seed)

    calibration_methods = ['none', 'confusion', 'temperature scaling']

    # Earlier, more verbose metric set (disabled, kept for reference):
    #   'ECE width':            get_ECE(probs, y, mode='width')
    #   'ECE mass':             get_ECE(probs, y, mode='mass')
    #   'cwECE thresh width':   get_cw_ECE(probs, y, mode='width')
    #   'cwECE thresh mass':    get_cw_ECE(probs, y, mode='mass')
    #   'cwECE nothresh width': get_cw_ECE(probs, y, mode='width', threshold_mode=None)
    #   'cwECE nothresh mass':  get_cw_ECE(probs, y, mode='mass', threshold_mode=None)
    #   'kumar MCE':            get_MCE
    #   'kumar MCE (bin)':      cal.get_binning_ce(probs, y, p=1, debias=False, mode='marginal')
    #   'kumar MCE (scale)':    cal.lower_bound_scaling_ce(probs, y, p=1, debias=False, mode='marginal')
    #   'kumar ECE':            cal.get_ece

    # Metric name -> callable(probabilities, labels).
    # Suffixes: W = width mode, M = mass mode, T = thresholded, NT = no threshold.
    calibration_metrics = {
        'ECE (W)': lambda p, labels: get_ECE(p, labels, mode='width'),
        'ECE (M)': lambda p, labels: get_ECE(p, labels, mode='mass'),
        'cwECE (WT)': lambda p, labels: get_cw_ECE(p, labels, mode='width'),
        'cwECE (MT)': lambda p, labels: get_cw_ECE(p, labels, mode='mass'),
        'cwECE (WNT)': lambda p, labels: get_cw_ECE(p, labels, mode='width', threshold_mode=None),
        'cwECE (MNT)': lambda p, labels: get_cw_ECE(p, labels, mode='mass', threshold_mode=None),
        'NLL': get_NLL,
    }

    # Settings forwarded to _run_experiment as keyword arguments.
    args = dict(
        n_runs=25,
        test_size=0.3,
        calibration_methods=calibration_methods,
        calibration_metrics=calibration_metrics,
        seed=seed,
    )

    # CIFAR-10H run (disabled):
    # out_fpath = './output/cifar10h/final/calibrate_comb_MAP/'
    # run_experiment_cifar10(out_fpath=out_fpath, experiment_args=args, seed=seed)

    out_fpath = './output/noisy_imagenet/final/calibrate_comb_MAP/'
    run_experiment_noisy_imagenet(out_fpath=out_fpath, experiment_args=args, seed=seed)
174 |
--------------------------------------------------------------------------------
/experiments/calibrate_first_experiment.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.insert(0, '../')
3 |
4 | from data_utils import *
5 | from combination_methods import *
6 | from tqdm.auto import tqdm
7 | from sklearn.model_selection import train_test_split
8 | import calibration as cal
9 | import csv
10 | import os.path
11 |
12 | # note: this experiment does not appear in our paper.
13 |
14 |
def _append_result_row(out_fpath, columns, **values):
    """Append one row to the CSV at ``out_fpath``, placing each value under its named column.

    Columns listed in ``columns`` but missing from ``values`` (or set to None) are
    written as empty cells. A key that is not in ``columns`` raises ValueError, so a
    value can never silently end up under the wrong header.
    """
    with open(out_fpath, 'a', newline='') as f:
        csv.DictWriter(f, fieldnames=columns, restval='').writerow(values)


def run_experiment(out_fpath=None):
    """ Evaluates the oracle and EM algorithms (in terms of accuracy and calibration) on noisy imagenet

    Done in a semi-supervised fashion.

    For every (model, noise level, epoch, run) the data is split 70/30 and rows are
    appended to the CSV at ``out_fpath``: one for the fully unsupervised edge case
    (n_l = 0), one for the fully supervised edge case (n_u = 0), and one per entry of
    ``n_l_sizes`` comparing the oracle, semi-supervised, unsupervised and
    calibrate-first combiners.

    Args:
        out_fpath: Path of the CSV file to append to. A header row is appended first.

    Raises:
        AssertionError: If ``out_fpath`` is None.
    """
    assert out_fpath is not None, 'Must specify output filepath'
    columns = ['model_name', 'noise_level', 'epochs', 'trial', 'n_l', 'n_u',
               'acc_h_te', 'acc_m_te',
               'acc_comb_oracle_te', 'acc_comb_unsup_te', 'acc_comb_semisup_te',
               'acc_comb_calibfirst_te',
               'ece_m_te', 'ece_m_calibrated_oracle_te', 'ece_m_calibrated_unsup_te',
               'ece_m_calibrated_semisup_te', 'ece_m_calibrated_calibfirst_te',
               'ece_combo_oracle_te', 'ece_combo_unsup_te', 'ece_combo_semisup_te',
               'ece_combo_calibfirst_te']
    # Create CSV output file if needed, write header
    with open(out_fpath, 'a', newline='') as f:
        csv.writer(f).writerow(columns)

    # Experiment parameters
    # model_names = ['alexnet', 'densenet161', 'googlenet', 'resnet152', 'vgg19']
    model_names = ['densenet161']
    noise_levels = [80, 95, 110, 125]
    # epochs = [None, 0, 1, 10]
    epochs = [None, 0, 10]
    noise_type = 'phase'
    n_runs = 10

    for model_name in tqdm(model_names, desc='Models', position=0, leave=True):
        for noise_level in tqdm(noise_levels, position=1, leave=False, desc='Noise Levels'):
            for epoch in tqdm(epochs, position=2, leave=False, desc='Epochs'):
                human_ids, y_h, y_true, model_probs = load_old_noisy_imagenet_data(noise_level, model_name,
                                                                                   n_epochs=epoch,
                                                                                   noise_type=noise_type)
                for i in tqdm(range(n_runs), position=3, leave=False, desc='Runs'):
                    # Train/test split 70/30
                    y_h_tr, y_h_te, model_probs_tr, model_probs_te, y_true_tr, y_true_te = train_test_split(
                        y_h, model_probs, y_true, test_size=0.3, random_state=i)

                    n_tr = y_h_tr.size  # Number of training points

                    # Evaluate accuracies of things that don't change with n_l
                    acc_h_te = np.mean(y_h_te == y_true_te)
                    y_m_te = np.argmax(model_probs_te, axis=1)
                    acc_m_te = np.mean(y_m_te == y_true_te)
                    # Evaluate calibration of things that don't change with n_l
                    ece_m_te = cal.get_ece(model_probs_te, y_true_te)

                    # Values shared by every row written for this run
                    shared = {'model_name': model_name, 'noise_level': noise_level,
                              'epochs': epoch, 'trial': i,
                              'acc_h_te': acc_h_te, 'acc_m_te': acc_m_te,
                              'ece_m_te': ece_m_te}

                    # ----- Unsupervised EM, all unlabeled data
                    # Edge case with n_l = 0
                    unsupervised_EM_combiner = UnsupervisedEMCombiner()
                    unsupervised_EM_combiner.fit(model_probs_tr, y_h_tr)
                    y_comb_prob_unsup_te = unsupervised_EM_combiner.combine_proba(model_probs_te, y_h_te)
                    y_comb_unsup_te = unsupervised_EM_combiner.combine(model_probs_te, y_h_te)

                    acc_combo_unsup_all_te = np.mean(y_comb_unsup_te == y_true_te)
                    ece_m_calibrated_unsup_all_te = cal.get_ece(unsupervised_EM_combiner.calibrate(model_probs_te),
                                                                y_true_te)
                    ece_combo_unsup_all_te = cal.get_ece(y_comb_prob_unsup_te, y_true_te)

                    # Only the "unsup" columns are filled; the others stay empty.
                    _append_result_row(out_fpath, columns,
                                       n_l=0, n_u=n_tr,
                                       acc_comb_unsup_te=acc_combo_unsup_all_te,
                                       ece_m_calibrated_unsup_te=ece_m_calibrated_unsup_all_te,
                                       ece_combo_unsup_te=ece_combo_unsup_all_te,
                                       **shared)

                    # ----- Fully supervised (oracle) combo
                    # Edge case with n_l = all
                    oracle_combiner = OracleCombiner()
                    oracle_combiner.fit(model_probs_tr, y_h_tr, y_true_tr)
                    y_comb_prob_oracle_te = oracle_combiner.combine_proba(model_probs_te, y_h_te)
                    y_comb_oracle_te = oracle_combiner.combine(model_probs_te, y_h_te)

                    acc_combo_oracle_all_te = np.mean(y_comb_oracle_te == y_true_te)
                    ece_m_calibrated_oracle_all_te = cal.get_ece(oracle_combiner.calibrate(model_probs_te),
                                                                 y_true_te)
                    ece_combo_oracle_all_te = cal.get_ece(y_comb_prob_oracle_te, y_true_te)

                    # Only the "oracle" columns are filled; the others stay empty.
                    _append_result_row(out_fpath, columns,
                                       n_l=n_tr, n_u=0,
                                       acc_comb_oracle_te=acc_combo_oracle_all_te,
                                       ece_m_calibrated_oracle_te=ece_m_calibrated_oracle_all_te,
                                       ece_combo_oracle_te=ece_combo_oracle_all_te,
                                       **shared)

                    # TODO : Do these edge cases really need to be separated out??

                    n_l_sizes = [10, 50, 100, 250, 500, 1000, 2500, 4500]  # Amount of labeled data to use
                    for n_l in tqdm(n_l_sizes, leave=False, desc='Num. Labels'):
                        # Split into labeled / unlabeled datasets
                        n_u = n_tr - n_l
                        y_h_tr_u, y_h_tr_l = y_h_tr[n_l:], y_h_tr[:n_l]
                        model_probs_tr_u, model_probs_tr_l = model_probs_tr[n_l:], model_probs_tr[:n_l]
                        y_true_tr_l = y_true_tr[:n_l]

                        # ----- Labeled data only
                        oracle_combiner = OracleCombiner()
                        oracle_combiner.fit(model_probs_tr_l, y_h_tr_l, y_true_tr_l)
                        y_comb_prob_oracle_te = oracle_combiner.combine_proba(model_probs_te, y_h_te)
                        y_comb_oracle_te = oracle_combiner.combine(model_probs_te, y_h_te)
                        model_probs_calibrated_oracle_te = oracle_combiner.calibrate(model_probs_te)

                        # ----- Semi-Supervised EM
                        # Fit parameters, calibrate test set, combine test set
                        semisup_combiner = SemiSupervisedEMCombiner()
                        semisup_combiner.fit(model_probs_tr_u, y_h_tr_u, model_probs_tr_l, y_h_tr_l, y_true_tr_l)
                        y_comb_prob_semisup_te = semisup_combiner.combine_proba(model_probs_te, y_h_te)
                        y_comb_semisup_te = semisup_combiner.combine(model_probs_te, y_h_te)
                        model_probs_calibrated_semisup_te = semisup_combiner.calibrate(model_probs_te)

                        # ----- Unsupervised EM
                        unsup_combiner = UnsupervisedEMCombiner()
                        unsup_combiner.fit(model_probs_tr_u, y_h_tr_u)
                        y_comb_prob_unsup_te = unsup_combiner.combine_proba(model_probs_te, y_h_te)
                        y_comb_unsup_te = unsup_combiner.combine(model_probs_te, y_h_te)
                        model_probs_calibrated_unsup_te = unsup_combiner.calibrate(model_probs_te)

                        # ----- Calibrate first, then fit confusion using semi-supervised EM
                        calibrate_first_combiner = CalibrateFirstCombiner()
                        calibrate_first_combiner.fit(model_probs_tr_u, y_h_tr_u,
                                                     model_probs_tr_l, y_h_tr_l, y_true_tr_l)
                        y_comb_prob_calibfirst_te = calibrate_first_combiner.combine_proba(model_probs_te, y_h_te)
                        y_comb_calibfirst_te = calibrate_first_combiner.combine(model_probs_te, y_h_te)
                        model_probs_calibrated_calibfirst_te = calibrate_first_combiner.calibrate(model_probs_te)

                        # ----- Evaluate accuracies
                        acc_comb_oracle_te = np.mean(y_comb_oracle_te == y_true_te)
                        acc_comb_semisup_te = np.mean(y_comb_semisup_te == y_true_te)
                        acc_comb_unsup_te = np.mean(y_comb_unsup_te == y_true_te)
                        acc_comb_calibfirst_te = np.mean(y_comb_calibfirst_te == y_true_te)

                        # ----- Evaluate calibration
                        # Evaluate ECE of just model
                        ece_m_calibrated_oracle_te = cal.get_ece(model_probs_calibrated_oracle_te, y_true_te)
                        ece_m_calibrated_unsup_te = cal.get_ece(model_probs_calibrated_unsup_te, y_true_te)
                        ece_m_calibrated_semisup_te = cal.get_ece(model_probs_calibrated_semisup_te, y_true_te)
                        ece_m_calibrated_calibfirst_te = cal.get_ece(model_probs_calibrated_calibfirst_te,
                                                                     y_true_te)

                        # Evaluate ECE of combination
                        ece_combo_oracle_te = cal.get_ece(y_comb_prob_oracle_te, y_true_te)
                        ece_combo_unsup_te = cal.get_ece(y_comb_prob_unsup_te, y_true_te)
                        ece_combo_semisup_te = cal.get_ece(y_comb_prob_semisup_te, y_true_te)
                        ece_combo_calibfirst_te = cal.get_ece(y_comb_prob_calibfirst_te, y_true_te)

                        # Write results to CSV
                        _append_result_row(out_fpath, columns,
                                           n_l=n_l, n_u=n_u,
                                           acc_comb_oracle_te=acc_comb_oracle_te,
                                           acc_comb_unsup_te=acc_comb_unsup_te,
                                           acc_comb_semisup_te=acc_comb_semisup_te,
                                           acc_comb_calibfirst_te=acc_comb_calibfirst_te,
                                           ece_m_calibrated_oracle_te=ece_m_calibrated_oracle_te,
                                           ece_m_calibrated_unsup_te=ece_m_calibrated_unsup_te,
                                           ece_m_calibrated_semisup_te=ece_m_calibrated_semisup_te,
                                           ece_m_calibrated_calibfirst_te=ece_m_calibrated_calibfirst_te,
                                           ece_combo_oracle_te=ece_combo_oracle_te,
                                           ece_combo_unsup_te=ece_combo_unsup_te,
                                           ece_combo_semisup_te=ece_combo_semisup_te,
                                           ece_combo_calibfirst_te=ece_combo_calibfirst_te,
                                           **shared)
172 |
173 |
if __name__ == '__main__':
    # run_experiment() opens out_fpath itself with open(..., 'a') and writes CSV rows
    # to it, so it must name a file; a bare directory path cannot be opened that way.
    out_fpath = './output/calibrate_first_experiment.csv'
    if os.path.exists(out_fpath):
        print('Output filepath exists, dont overwrite it!')
        sys.exit()
    # open() does not create missing parent directories.
    os.makedirs(os.path.dirname(out_fpath), exist_ok=True)
    run_experiment(out_fpath)
180 |
--------------------------------------------------------------------------------
/experiments/calibration_experiment.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.insert(0, '../')
3 |
4 | from data_utils import *
5 | from utils import *
6 | from combination_methods import *
7 | from tqdm.auto import tqdm
8 | import torch
9 | from sklearn.model_selection import train_test_split
10 | from metrics import *
11 | import csv
12 | import numpy as np
13 | import os
14 |
15 |
16 | # Generates the data for Appendix C in our paper.
17 |
def _run_experiment(y_h=None, model_probs=None, y_true=None, **kwargs):
    """Evaluate each calibration method over repeated train/test splits and save the results.

    For every run the data is split, a combiner is fit per calibration method on the
    training part, and accuracy plus every calibration metric is computed on the test
    part. After all runs, one accuracy CSV and one calibration CSV are written.

    Args:
        y_h: Human labels; split together with ``model_probs`` and ``y_true``.
        model_probs: Model probabilities; the argmax over axis 1 is the model prediction.
        y_true: Ground-truth labels.
        **kwargs: Optional settings, each popped with the default in parentheses:
            ``seed`` (0), ``n_runs`` (25), ``test_size`` (0.3),
            ``calibration_methods`` (['none']),
            ``calibration_metrics`` ({'ECE': get_ECE}; name -> callable(probs, y)),
            ``output_file_acc`` ('./acc.csv'),
            ``output_file_calibration`` ('./cal.csv'). Other keys are ignored.
    """
    seed = kwargs.pop('seed', 0)
    n_runs = kwargs.pop('n_runs', 25)
    test_size = kwargs.pop('test_size', 0.3)
    calibration_methods = kwargs.pop('calibration_methods', ['none'])
    calibration_metrics = kwargs.pop('calibration_metrics', {'ECE': get_ECE})
    output_file_acc = kwargs.pop('output_file_acc', './acc.csv')
    output_file_calibration = kwargs.pop('output_file_calibration', './cal.csv')

    acc_data = []
    cal_data = []
    for i in tqdm(range(n_runs), leave=False, desc='Runs'):
        # `i * seed` is kept for non-zero seeds so previously generated splits stay
        # reproducible. With seed == 0 (the default) that product is 0 for every run,
        # which would make all runs use the identical split, so fall back to the
        # run index in that case.
        split_seed = i * seed if seed else i

        # Train/test split
        y_h_tr, y_h_te, model_probs_tr, model_probs_te, y_true_tr, y_true_te = train_test_split(
            y_h, model_probs, y_true, test_size=test_size, random_state=split_seed)

        acc_h = get_acc(y_h_te, y_true_te)
        acc_m = get_acc(np.argmax(model_probs_te, axis=1), y_true_te)

        _acc_data = [acc_h, acc_m]
        _cal_data = []
        for calibration_method in calibration_methods:
            # 'confusion' has its own combiner class; every other method name is
            # passed through to OracleCombiner.
            if calibration_method == 'confusion':
                combiner = DoubleConfusionCombiner()
            else:
                combiner = OracleCombiner(calibration_method=calibration_method)
            combiner.fit(model_probs_tr, y_h_tr, y_true_tr)

            y_comb_te = combiner.combine(model_probs_te, y_h_te)
            acc_comb = get_acc(y_comb_te, y_true_te)
            _acc_data.append(acc_comb)

            model_probs_calibrated_te = combiner.calibrate(model_probs_te)
            y_comb_prob_te = combiner.combine_proba(model_probs_te, y_h_te)
            for metric, fxn in calibration_metrics.items():
                cal_m = fxn(model_probs_calibrated_te, y_true_te)
                cal_comb = fxn(y_comb_prob_te, y_true_te)
                _cal_data.append([calibration_method, metric, cal_m, cal_comb])

        acc_data.append(_acc_data)
        cal_data.extend(_cal_data)

    # Save data to CSV
    header_acc = ['human', 'model'] + [f'comb {method}' for method in calibration_methods]
    with open(output_file_acc, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(header_acc)
        writer.writerows(acc_data)
    header_cal = ['calibration method', 'metric', 'model', 'comb']
    with open(output_file_calibration, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(header_cal)
        writer.writerows(cal_data)
72 |
73 |
def run_experiment_cifar10(out_fpath=None, experiment_args=None, seed=0):
    """Run ``_run_experiment`` once per CIFAR-10H model.

    Args:
        out_fpath: String prefix for the output files. ``'<model>_accuracy.csv'`` and
            ``'<model>_calibration.csv'`` are appended to it by plain concatenation,
            so a directory prefix must end with a path separator.
        experiment_args: Keyword arguments forwarded to ``_run_experiment``. The mapping
            is copied for every model, so the caller's dict is not modified; ``None``
            is treated as an empty mapping.
        seed: Seed passed to ``simulate_single_human``.

    Raises:
        AssertionError: If one of the output files for a model already exists.
    """
    model_names = ['r_low_acc', 'resnet-110', 'preresnet-110', 'densenet-bc-L190-k40']
    for model_name in tqdm(model_names, desc='Models', leave=True):
        # Specify output files
        output_file_acc = out_fpath + f'{model_name}_accuracy.csv'
        output_file_calibration = out_fpath + f'{model_name}_calibration.csv'
        for output_file in (output_file_acc, output_file_calibration):
            # Raised explicitly instead of asserted so that the overwrite guard
            # is still active when Python runs with -O.
            if os.path.exists(output_file):
                raise AssertionError('Output filepath already exists')

        # _run_experiment only writes its CSVs after all runs have finished and
        # open(..., 'w') does not create directories, so make sure the target
        # directory exists before any computation starts.
        out_dir = os.path.dirname(output_file_acc)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        # Work on a copy so the caller's argument dict is left untouched.
        run_args = dict(experiment_args or {})
        run_args['output_file_acc'] = output_file_acc
        run_args['output_file_calibration'] = output_file_calibration

        # Load data
        human_counts, model_probs, y_true = load_CIFAR10H(model_name)
        y_h = simulate_single_human(human_counts, seed=seed)

        _run_experiment(y_h=y_h, model_probs=model_probs, y_true=y_true, **run_args)
90 |
91 |
def run_experiment_noisy_imagenet(out_fpath=None, experiment_args=None, seed=0):
    """Run ``_run_experiment`` for every (model, accuracy level, noise level) combination.

    Args:
        out_fpath: String prefix for the output files. The file names
            ``'<model>_n<noise>_l<level>_accuracy.csv'`` / ``'..._calibration.csv'`` are
            appended by plain concatenation, so a directory prefix must end with a
            path separator.
        experiment_args: Keyword arguments forwarded to ``_run_experiment``. The mapping
            is copied for every configuration, so the caller's dict is not modified;
            ``None`` is treated as an empty mapping.
        seed: Not used by this function; accepted so that the signature matches
            ``run_experiment_cifar10``.

    Raises:
        AssertionError: If one of the output files for a configuration already exists.
    """
    model_acc_levels = ['high']  # ['low', 'med', 'high']
    noise_levels = [80, 95, 110, 125]
    model_names = ['vgg19', 'googlenet']

    for model_name in model_names:
        for model_level in tqdm(model_acc_levels, desc='Models', leave=True):
            for noise_level in tqdm(noise_levels, desc='Noise Levels'):
                # Specify output files
                file_stem = out_fpath + f'{model_name}_n{noise_level}_l{model_level}'
                output_file_acc = file_stem + '_accuracy.csv'
                output_file_calibration = file_stem + '_calibration.csv'
                for output_file in (output_file_acc, output_file_calibration):
                    # Raised explicitly instead of asserted so that the overwrite
                    # guard is still active when Python runs with -O.
                    if os.path.exists(output_file):
                        raise AssertionError('Output filepath already exists')

                # _run_experiment only writes its CSVs after all runs have finished
                # and open(..., 'w') does not create directories, so make sure the
                # target directory exists before any computation starts.
                out_dir = os.path.dirname(output_file_acc)
                if out_dir:
                    os.makedirs(out_dir, exist_ok=True)

                # Work on a copy so the caller's argument dict is left untouched.
                run_args = dict(experiment_args or {})
                run_args['output_file_acc'] = output_file_acc
                run_args['output_file_calibration'] = output_file_calibration

                # Load data
                y_true, y_h, model_probs = load_noisy_imagenet(model_name, noise_level, model_level)

                _run_experiment(y_h=y_h, model_probs=model_probs, y_true=y_true, **run_args)
112 |
113 |
if __name__ == '__main__':
    # Seed the torch and numpy random number generators.
    seed = 9658
    torch.manual_seed(seed)
    np.random.seed(seed)

    calibration_methods = [
        'none',
        'confusion',
        'temperature scaling',
        'ensemble temperature scaling',
        'imax binning',
    ]

    # Earlier, more verbose metric set (disabled, kept for reference):
    #   'ECE width':            get_ECE(probs, y, mode='width')
    #   'ECE mass':             get_ECE(probs, y, mode='mass')
    #   'cwECE thresh width':   get_cw_ECE(probs, y, mode='width')
    #   'cwECE thresh mass':    get_cw_ECE(probs, y, mode='mass')
    #   'cwECE nothresh width': get_cw_ECE(probs, y, mode='width', threshold_mode=None)
    #   'cwECE nothresh mass':  get_cw_ECE(probs, y, mode='mass', threshold_mode=None)
    #   'kumar MCE':            get_MCE
    #   'kumar MCE (bin)':      cal.get_binning_ce(probs, y, p=1, debias=False, mode='marginal')
    #   'kumar MCE (scale)':    cal.lower_bound_scaling_ce(probs, y, p=1, debias=False, mode='marginal')
    #   'kumar ECE':            cal.get_ece

    # Metric name -> callable(probabilities, labels).
    # Suffixes: W = width mode, M = mass mode, T = thresholded, NT = no threshold.
    calibration_metrics = {
        'ECE (W)': lambda p, labels: get_ECE(p, labels, mode='width'),
        'ECE (M)': lambda p, labels: get_ECE(p, labels, mode='mass'),
        'cwECE (WT)': lambda p, labels: get_cw_ECE(p, labels, mode='width'),
        'cwECE (MT)': lambda p, labels: get_cw_ECE(p, labels, mode='mass'),
        'cwECE (WNT)': lambda p, labels: get_cw_ECE(p, labels, mode='width', threshold_mode=None),
        'cwECE (MNT)': lambda p, labels: get_cw_ECE(p, labels, mode='mass', threshold_mode=None),
        'NLL': get_NLL,
    }

    # Settings forwarded to _run_experiment as keyword arguments.
    args = dict(
        n_runs=25,
        test_size=0.3,
        calibration_methods=calibration_methods,
        calibration_metrics=calibration_metrics,
        seed=seed,
    )

    # CIFAR-10H first, then noisy ImageNet, each into its own output directory.
    out_fpath = './output/cifar10h/final/fully_sup_CI/'
    run_experiment_cifar10(out_fpath=out_fpath, experiment_args=args, seed=seed)

    out_fpath = './output/noisy_imagenet/final/fully_sup_CI/'
    run_experiment_noisy_imagenet(out_fpath=out_fpath, experiment_args=args, seed=seed)
157 |
--------------------------------------------------------------------------------
/experiments/calibration_method_experiment.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.insert(0, '../')
3 |
4 | from data_utils import *
5 | from combination_methods import *
6 | from tqdm.auto import tqdm
7 | from sklearn.model_selection import train_test_split
8 | import calibration as cal
9 | import csv
10 | import os.path
11 |
12 | # note: this experiment does not appear in our paper.
13 |
def run_experiment(out_fpath=None):
    """ Compares temperature scaling and Dirichlet calibration inside the oracle combiner on noisy ImageNet.

    For each (model, noise level, epoch, run) the data is reloaded and split 70/30,
    one OracleCombiner per calibration method is fit on the training part, and
    accuracy, marginal calibration error and ECE on the test part are appended as one
    row to the CSV at ``out_fpath``. A header row is appended before anything else.
    """
    assert out_fpath is not None, 'Must specify output filepath'

    def marginal_l2_ce(probs, labels):
        # The \ell_2, debiased, marginal calibration error
        return cal.get_calibration_error(probs, labels, p=2, debias=True, mode='marginal')

    header = ['model_name', 'noise_level', 'epochs', 'trial',
              'acc_h', 'acc_m', 'acc_combo_TS', 'acc_combo_dir',
              'ce_m', 'ce_m_TS', 'ce_m_dir', 'ce_combo_TS', 'ce_combo_dir',
              'ece_m_te', 'ece_m_TS', 'ece_m_dir', 'ece_combo_TS', 'ece_combo_dir']
    # Create the CSV output file if needed and append the header
    with open(out_fpath, 'a', newline='') as f:
        csv.writer(f).writerow(header)

    # model_names = ['alexnet', 'densenet161', 'googlenet', 'resnet152', 'vgg19']
    model_names = ['densenet161']
    noise_levels = [80, 95, 110, 125]
    # epochs = [None, 0, 1, 10]
    epochs = [None, 0, 10]
    noise_type = 'phase'
    n_runs = 5
    for model_name in tqdm(model_names, desc='Models', position=0, leave=True):
        for noise_level in tqdm(noise_levels, position=1, leave=False, desc='Noise Levels'):
            for epoch in tqdm(epochs, position=2, leave=False, desc='Epochs'):
                for i in tqdm(range(n_runs), position=3, leave=False, desc='Runs'):
                    human_ids, y_h, y_true, model_probs = load_old_noisy_imagenet_data(
                        noise_level, model_name, n_epochs=epoch, noise_type=noise_type)
                    # 70/30 train/test split, seeded by the run index
                    (y_h_tr, y_h_te,
                     model_probs_tr, model_probs_te,
                     y_true_tr, y_true_te) = train_test_split(y_h, model_probs, y_true,
                                                              test_size=0.3, random_state=i)

                    # ----- Combiner calibrated with temperature scaling
                    oracle_combiner_TS = OracleCombiner(calibration_method='temperature scaling')
                    oracle_combiner_TS.fit(model_probs_tr, y_h_tr, y_true_tr)
                    y_comb_prob_TS = oracle_combiner_TS.combine_proba(model_probs_te, y_h_te)
                    y_comb_TS = oracle_combiner_TS.combine(model_probs_te, y_h_te)

                    # ----- Combiner calibrated with Dirichlet calibration
                    oracle_combiner_dirichlet = OracleCombiner(calibration_method='dirichlet')
                    oracle_combiner_dirichlet.fit(model_probs_tr, y_h_tr, y_true_tr)
                    y_comb_prob_dirichlet = oracle_combiner_dirichlet.combine_proba(model_probs_te, y_h_te)
                    y_comb_dirichlet = oracle_combiner_dirichlet.combine(model_probs_te, y_h_te)

                    # ----- Accuracies on the test split
                    acc_comb_oracle_TS = np.mean(y_comb_TS == y_true_te)
                    acc_comb_oracle_dirichlet = np.mean(y_comb_dirichlet == y_true_te)
                    acc_h_te = np.mean(y_h_te == y_true_te)
                    y_m_te = np.argmax(model_probs_te, axis=1)
                    acc_m_te = np.mean(y_m_te == y_true_te)

                    # ----- Marginal calibration error: raw model, calibrated model, combination
                    ce_m_te = marginal_l2_ce(model_probs_te, y_true_te)
                    ce_m_TS = marginal_l2_ce(oracle_combiner_TS.calibrate(model_probs_te), y_true_te)
                    ce_m_dirichlet = marginal_l2_ce(oracle_combiner_dirichlet.calibrate(model_probs_te),
                                                    y_true_te)
                    ce_combo_TS = marginal_l2_ce(y_comb_prob_TS, y_true_te)
                    ce_combo_dirichlet = marginal_l2_ce(y_comb_prob_dirichlet, y_true_te)

                    # ----- The usual ECE for the same five quantities
                    ece_m_te = cal.get_ece(model_probs_te, y_true_te)
                    ece_m_TS = cal.get_ece(oracle_combiner_TS.calibrate(model_probs_te), y_true_te)
                    ece_m_dirichlet = cal.get_ece(oracle_combiner_dirichlet.calibrate(model_probs_te),
                                                  y_true_te)
                    ece_combo_TS = cal.get_ece(y_comb_prob_TS, y_true_te)
                    ece_combo_dirichlet = cal.get_ece(y_comb_prob_dirichlet, y_true_te)

                    # Append this run's results to the CSV
                    row = [model_name, noise_level, epoch, i,
                           acc_h_te, acc_m_te, acc_comb_oracle_TS, acc_comb_oracle_dirichlet,
                           ce_m_te, ce_m_TS, ce_m_dirichlet, ce_combo_TS, ce_combo_dirichlet,
                           ece_m_te, ece_m_TS, ece_m_dirichlet, ece_combo_TS, ece_combo_dirichlet]
                    with open(out_fpath, 'a', newline='') as f:
                        csv.writer(f).writerow(row)
92 |
93 |
if __name__ == '__main__':
    # run_experiment() opens out_fpath itself with open(..., 'a') and writes CSV rows
    # to it, so it must name a file; a bare directory path cannot be opened that way.
    out_fpath = './output/calibration_method_experiment.csv'
    # open() does not create missing parent directories.
    os.makedirs(os.path.dirname(out_fpath), exist_ok=True)
    run_experiment(out_fpath)
97 |
--------------------------------------------------------------------------------
/experiments/em_experiment.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.insert(0, '../')
3 |
4 | from data_utils import *
5 | import torch
6 | from calibrators import *
7 | from tqdm.auto import tqdm
8 | from sklearn.model_selection import train_test_split
9 | from sklearn.metrics import confusion_matrix
10 | import calibration as cal
11 | from combination_methods import *
12 | import csv
13 |
14 | # note: this experiment does not appear in our paper and may contain outdated code.
15 |
16 | """
17 | def em_combo_experiment():
18 | # TODO : This code is old and needs updated for CIFAR-10H experiments
19 | # Calibrates and estimates confusion on held-out train set
20 | # Combines on test set and evaluates
21 |
22 | human_counts, model_probs, true_labels = load_CIFAR10H()
23 | true_labels = true_labels.astype(int)
24 | model_logits = np.log(model_probs)
25 | # Simulate a single human labeler
26 | y_h = get_human_labels_outcomes(human_counts, true_labels)[0][:, 0].astype(int)
27 | n = y_h.size
28 |
29 | temp, conf_h = fit_EM(model_probs, y_h)
30 | calibrated_model_probs = calibrate_probs_TS(model_probs, temp)
31 | y_comb = combine(calibrated_model_probs, y_h, conf_h)
32 |
33 | return np.mean(y_comb == true_labels)
34 | """
35 |
36 |
def run_experiment(out_fpath=None):
    """ Evaluates the oracle and EM algorithms (in terms of accuracy and calibration) on Noisy ImageNet

    For each (model, noise level, epoch, run) the data is reloaded and split 70/30.
    The oracle combination and the EM-fitted combination are evaluated on the test
    part (accuracy, marginal calibration error, ECE), the EM estimate of the human
    confusion matrix is compared with the one measured on the test part, and one row
    is appended to the CSV at ``out_fpath``. A header row is appended first.

    Args:
        out_fpath: Path of the CSV file to append to.

    Raises:
        AssertionError: If ``out_fpath`` is None.
    """
    assert out_fpath is not None, 'Must specify output filepath'
    # Create CSV output file if needed, write header
    with open(out_fpath, 'a', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['model_name', 'noise_level', 'epochs', 'trial',
                         'acc_h_te', 'acc_m_te', 'acc_comb_oracle_te', 'acc_comb_te',
                         'ce_m_te', 'ce_m_calibrated_te', 'ce_combo_te', 'ce_oracle_combo_te', 'ce_m_oracle_te',
                         'ece_m_te',
                         'ece_m_calibrated_te', 'ece_combo_te', 'ece_oracle_combo_te', 'ece_m_oracle_te',
                         'frobenius_distance_conf_te'])

    # model_names = ['alexnet', 'densenet161', 'googlenet', 'resnet152', 'vgg19']
    model_names = ['densenet161']
    noise_levels = [80, 95, 110, 125]
    # epochs = [None, 0, 1, 10]
    epochs = [None, 0, 10]
    noise_type = 'phase'
    n_runs = 5
    for model_name in tqdm(model_names, desc='Models', position=0, leave=True):
        for noise_level in tqdm(noise_levels, position=1, leave=False, desc='Noise Levels'):
            for epoch in tqdm(epochs, position=2, leave=False, desc='Epochs'):
                for i in tqdm(range(n_runs), position=3, leave=False, desc='Runs'):
                    human_ids, y_h, y_true, model_probs = load_old_noisy_imagenet(noise_level, model_name,
                                                                                  n_epochs=epoch,
                                                                                  noise_type=noise_type)
                    # Train/test split 70/30
                    y_h_tr, y_h_te, model_probs_tr, model_probs_te, y_true_tr, y_true_te = train_test_split(
                        y_h, model_probs, y_true, test_size=0.3, random_state=i)

                    # ----- 'Oracle' Experiment
                    y_comb_oracle_soft_te = oracle_combo(y_h_tr, model_probs_tr, y_true_tr, model_probs_te, y_h_te)
                    y_comb_oracle_te = np.argmax(y_comb_oracle_soft_te, axis=1)

                    # Temperature fit with ground-truth labels on the train split
                    temp_oracle = temperature_scaling(torch.from_numpy(np.log(model_probs_tr)),
                                                      torch.from_numpy(y_true_tr))['temperature'].item()
                    calibrated_probs_te_oracle = calibrate_probs_TS(model_probs_te, temp_oracle)

                    # ----- EM Experiment
                    # Fit EM parameters on train set
                    calibrator, conf_h = fit_EM(model_probs_tr, y_h_tr)  # TODO: Different calibration methods
                    # Calibrate predictions on test set
                    # TODO: Different calibration methods
                    model_probs_calibrated_te = calibrator.calibrate(model_probs_te)
                    # Combine calibrated model predictions with human labels on test set
                    y_comb_te_soft = combine(model_probs_calibrated_te, y_h_te, conf_h)
                    y_comb_te = np.argmax(y_comb_te_soft, axis=1)

                    # ----- Evaluate accuracies
                    acc_comb_te = np.mean(y_comb_te == y_true_te)
                    acc_comb_oracle_te = np.mean(y_comb_oracle_te == y_true_te)
                    acc_h_te = np.mean(y_h_te == y_true_te)
                    y_m_te = np.argmax(model_probs_te, axis=1)
                    acc_m_te = np.mean(y_m_te == y_true_te)

                    # ----- Evaluate calibration
                    # NB: This is the \ell_2, debiased, marginal calibration error
                    ce_m_te = cal.get_calibration_error(model_probs_te, y_true_te,
                                                        p=2, debias=True, mode='marginal')
                    ce_m_calibrated_te = cal.get_calibration_error(model_probs_calibrated_te, y_true_te,
                                                                   p=2, debias=True, mode='marginal')
                    ce_combo_te = cal.get_calibration_error(y_comb_te_soft, y_true_te,
                                                            p=2, debias=True, mode='marginal')
                    ce_oracle_combo_te = cal.get_calibration_error(y_comb_oracle_soft_te, y_true_te,
                                                                   p=2, debias=True, mode='marginal')
                    ce_m_oracle_te = cal.get_calibration_error(calibrated_probs_te_oracle, y_true_te,
                                                               p=2, debias=True, mode='marginal')

                    # NB: This is the usual ECE
                    ece_m_te = cal.get_ece(model_probs_te, y_true_te)
                    ece_m_calibrated_te = cal.get_ece(model_probs_calibrated_te, y_true_te)
                    ece_combo_te = cal.get_ece(y_comb_te_soft, y_true_te)
                    ece_oracle_combo_te = cal.get_ece(y_comb_oracle_soft_te, y_true_te)
                    ece_m_oracle_te = cal.get_ece(calibrated_probs_te_oracle, y_true_te)

                    # Evaluate confusion matrix
                    # Entry [i,j] is P(h = i | Y = j)
                    # sklearn returns C[true, pred]; normalize='true' makes each row sum to 1,
                    # i.e. C[j, i] = P(h = i | Y = j), and the transpose puts h on the first axis.
                    # (normalize='pred' would instead produce P(Y = j | h = i).)
                    conf_h_te = confusion_matrix(y_true_te, y_h_te, normalize='true').T
                    # Computes the Frobenius-norm (RMSE) distance between:
                    #   (i) human confusion matrix estimated via EM
                    #   (ii) human confusion matrix directly estimated via ground-truth on test set
                    # NOTE(review): assumes conf_h uses the same [h, Y] orientation — confirm against fit_EM.
                    frobenius_distance_conf_te = np.linalg.norm(conf_h_te - conf_h)

                    # Write results to CSV
                    with open(out_fpath, 'a', newline='') as f:
                        writer = csv.writer(f)
                        writer.writerow([model_name, noise_level, epoch, i,
                                         acc_h_te, acc_m_te, acc_comb_oracle_te, acc_comb_te,
                                         ce_m_te, ce_m_calibrated_te, ce_combo_te, ce_oracle_combo_te, ce_m_oracle_te,
                                         ece_m_te, ece_m_calibrated_te, ece_combo_te, ece_oracle_combo_te,
                                         ece_m_oracle_te,
                                         frobenius_distance_conf_te])
131 |
132 |
if __name__ == '__main__':
    # Imported here because only csv/sys/torch and project modules are imported at file level.
    import os

    # run_experiment() opens out_fpath itself with open(..., 'a') and writes CSV rows
    # to it, so it must name a file; a bare directory path cannot be opened that way.
    out_fpath = './output/em_experiment.csv'
    # open() does not create missing parent directories.
    os.makedirs(os.path.dirname(out_fpath), exist_ok=True)
    run_experiment(out_fpath)
136 |
--------------------------------------------------------------------------------
/experiments/semisup_em_experiment.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.insert(0, '../')
3 |
4 | from data_utils import *
5 | from combination_methods import *
6 | from tqdm.auto import tqdm
7 | from sklearn.model_selection import train_test_split
8 | import calibration as cal
9 | import csv
10 | import os.path
11 |
12 | # note: these experiments do not appear in our paper and may contain outdated code.
13 |
14 |
def run_experiment(out_fpath=None):
    """ Evaluates the oracle and EM algorithms (in terms of accuracy and calibration) on noisy ImageNet.

    Done in a semi-supervised fashion. For every (model, noise level, epoch setting, trial) the data is
    split 70/30 into train/test, and the following rows are appended to the CSV file at ``out_fpath``:

    * one row for unsupervised EM fit on the whole training split (n_l = 0),
    * one row for the oracle combiner fit on the whole, fully labeled training split (n_u = 0),
    * one row per labeled-set size in ``n_l_sizes`` comparing the oracle combiner (labeled points only),
      semi-supervised EM (labeled + unlabeled points) and unsupervised EM (unlabeled points only).

    Metrics that are not computed for a given row are written as ``None`` (an empty CSV cell).

    Parameters
    ----------
    out_fpath: string
        path of the CSV file to append to. The header row is written on every call.

    Raises
    ------
    AssertionError
        if out_fpath is None.
    """
    assert out_fpath is not None, 'Must specify output filepath'
    # Create CSV output file if needed, write header
    with open(out_fpath, 'a', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['model_name', 'noise_level', 'epochs', 'trial', 'n_l', 'n_u',
                         'acc_h_te', 'acc_m_te',
                         'acc_comb_oracle_te', 'acc_comb_unsup_te', 'acc_comb_semisup_te',
                         'ece_m_te', 'ece_m_calibrated_oracle_te', 'ece_m_calibrated_unsup_te',
                         'ece_m_calibrated_semisup_te',
                         'ece_combo_oracle_te', 'ece_combo_unsup_te', 'ece_combo_semisup_te'])

    # Experiment parameters
    # model_names = ['alexnet', 'densenet161', 'googlenet', 'resnet152', 'vgg19']
    model_names = ['densenet161']
    noise_levels = [80, 95, 110, 125]
    # epochs = [None, 0, 1, 10]
    epochs = [None, 0, 10]
    noise_type = 'phase'
    n_runs = 25

    for model_name in tqdm(model_names, desc='Models', position=0, leave=True):
        for noise_level in tqdm(noise_levels, position=1, leave=False, desc='Noise Levels'):
            for epoch in tqdm(epochs, position=2, leave=False, desc='Epochs'):
                human_ids, y_h, y_true, model_probs = load_old_noisy_imagenet(noise_level, model_name,
                                                                              n_epochs=epoch, noise_type=noise_type)
                for i in tqdm(range(n_runs), position=3, leave=False, desc='Runs'):
                    # Train/test split 70/30; the trial index doubles as the split seed
                    y_h_tr, y_h_te, model_probs_tr, model_probs_te, y_true_tr, y_true_te = train_test_split(
                        y_h, model_probs, y_true, test_size=0.3, random_state=i)

                    n_tr = y_h_tr.size  # Number of training points

                    # Evaluate accuracies of things that don't change with n_l
                    acc_h_te = np.mean(y_h_te == y_true_te)
                    y_m_te = np.argmax(model_probs_te, axis=1)
                    acc_m_te = np.mean(y_m_te == y_true_te)
                    # Evaluate calibration of things that don't change with n_l
                    ece_m_te = cal.get_ece(model_probs_te, y_true_te)

                    # ----- Unsupervised EM, all unlabeled data
                    # Edge case with n_l = 0
                    unsupervised_EM_combiner = UnsupervisedEMCombiner()
                    unsupervised_EM_combiner.fit(model_probs_tr, y_h_tr)
                    y_comb_prob_unsup_te = unsupervised_EM_combiner.combine_proba(model_probs_te, y_h_te)
                    y_comb_unsup_te = unsupervised_EM_combiner.combine(model_probs_te, y_h_te)

                    acc_combo_unsup_all_te = np.mean(y_comb_unsup_te == y_true_te)
                    ece_m_calibrated_unsup_all_te = cal.get_ece(unsupervised_EM_combiner.calibrate(model_probs_te),
                                                                y_true_te)
                    ece_combo_unsup_all_te = cal.get_ece(y_comb_prob_unsup_te, y_true_te)

                    # Only the unsupervised columns are filled in for this row
                    with open(out_fpath, 'a', newline='') as f:
                        writer = csv.writer(f)
                        writer.writerow([model_name, noise_level, epoch, i, 0, n_tr,
                                         acc_h_te, acc_m_te,
                                         None, acc_combo_unsup_all_te, None,
                                         ece_m_te, None, ece_m_calibrated_unsup_all_te,
                                         None,
                                         None, ece_combo_unsup_all_te, None])

                    # ----- Fully supervised (oracle) combo
                    # Edge case with n_l = all
                    oracle_combiner = OracleCombiner()
                    oracle_combiner.fit(model_probs_tr, y_h_tr, y_true_tr)
                    y_comb_prob_oracle_te = oracle_combiner.combine_proba(model_probs_te, y_h_te)
                    y_comb_oracle_te = oracle_combiner.combine(model_probs_te, y_h_te)

                    acc_combo_oracle_all_te = np.mean(y_comb_oracle_te == y_true_te)
                    ece_m_calibrated_oracle_all_te = cal.get_ece(oracle_combiner.calibrate(model_probs_te),
                                                                 y_true_te)
                    ece_combo_oracle_all_te = cal.get_ece(y_comb_prob_oracle_te, y_true_te)

                    # Only the oracle columns are filled in for this row. The oracle metrics must sit
                    # in the '*_oracle_*' columns of the header (not the '*_unsup_*' ones), otherwise
                    # they are indistinguishable from unsupervised EM results when the CSV is read back.
                    with open(out_fpath, 'a', newline='') as f:
                        writer = csv.writer(f)
                        writer.writerow([model_name, noise_level, epoch, i, n_tr, 0,
                                         acc_h_te, acc_m_te,
                                         acc_combo_oracle_all_te, None, None,
                                         ece_m_te, ece_m_calibrated_oracle_all_te, None,
                                         None,
                                         ece_combo_oracle_all_te, None, None])

                    # TODO : Do these edge cases really need to be separated out??

                    n_l_sizes = [10, 50, 100, 250, 500, 1000, 2500, 4500]  # Amount of labeled data to use
                    for n_l in tqdm(n_l_sizes, leave=False, desc='Num. Labels'):
                        # Split into labeled / unlabeled datasets:
                        # the first n_l training points are labeled, the remainder is unlabeled
                        n_u = n_tr - n_l
                        y_h_tr_u, y_h_tr_l = y_h_tr[n_l:], y_h_tr[:n_l]
                        model_probs_tr_u, model_probs_tr_l = model_probs_tr[n_l:], model_probs_tr[:n_l]
                        y_true_tr_l = y_true_tr[:n_l]

                        # ----- Labeled data only
                        oracle_combiner = OracleCombiner()
                        oracle_combiner.fit(model_probs_tr_l, y_h_tr_l, y_true_tr_l)
                        y_comb_prob_oracle_te = oracle_combiner.combine_proba(model_probs_te, y_h_te)
                        y_comb_oracle_te = oracle_combiner.combine(model_probs_te, y_h_te)
                        model_probs_calibrated_oracle_te = oracle_combiner.calibrate(model_probs_te)

                        # ----- Semi-Supervised EM
                        # Fit parameters, calibrate test set, combine test set
                        semisup_combiner = SemiSupervisedEMCombiner()
                        semisup_combiner.fit(model_probs_tr_u, y_h_tr_u, model_probs_tr_l, y_h_tr_l, y_true_tr_l)
                        y_comb_prob_semisup_te = semisup_combiner.combine_proba(model_probs_te, y_h_te)
                        y_comb_semisup_te = semisup_combiner.combine(model_probs_te, y_h_te)
                        model_probs_calibrated_semisup_te = semisup_combiner.calibrate(model_probs_te)

                        # ----- Unsupervised EM (unlabeled portion only)
                        unsup_combiner = UnsupervisedEMCombiner()
                        unsup_combiner.fit(model_probs_tr_u, y_h_tr_u)
                        y_comb_prob_unsup_te = unsup_combiner.combine_proba(model_probs_te, y_h_te)
                        y_comb_unsup_te = unsup_combiner.combine(model_probs_te, y_h_te)
                        model_probs_calibrated_unsup_te = unsup_combiner.calibrate(model_probs_te)

                        # ----- Evaluate accuracies
                        acc_comb_oracle_te = np.mean(y_comb_oracle_te == y_true_te)
                        acc_comb_semisup_te = np.mean(y_comb_semisup_te == y_true_te)
                        acc_comb_unsup_te = np.mean(y_comb_unsup_te == y_true_te)

                        # ----- Evaluate calibration
                        # Evaluate ECE of just model
                        ece_m_calibrated_oracle_te = cal.get_ece(model_probs_calibrated_oracle_te, y_true_te)
                        ece_m_calibrated_unsup_te = cal.get_ece(model_probs_calibrated_unsup_te, y_true_te)
                        ece_m_calibrated_semisup_te = cal.get_ece(model_probs_calibrated_semisup_te, y_true_te)

                        # Evaluate ECE of combination
                        ece_combo_oracle_te = cal.get_ece(y_comb_prob_oracle_te, y_true_te)
                        ece_combo_unsup_te = cal.get_ece(y_comb_prob_unsup_te, y_true_te)
                        ece_combo_semisup_te = cal.get_ece(y_comb_prob_semisup_te, y_true_te)

                        # Write results to CSV
                        with open(out_fpath, 'a', newline='') as f:
                            writer = csv.writer(f)
                            writer.writerow([model_name, noise_level, epoch, i, n_l, n_u,
                                             acc_h_te, acc_m_te,
                                             acc_comb_oracle_te, acc_comb_unsup_te, acc_comb_semisup_te,
                                             ece_m_te, ece_m_calibrated_oracle_te, ece_m_calibrated_unsup_te,
                                             ece_m_calibrated_semisup_te,
                                             ece_combo_oracle_te, ece_combo_unsup_te, ece_combo_semisup_te])
158 |
159 |
160 |
if __name__ == '__main__':
    # Only run when the output path does not exist yet, so earlier results are never appended to.
    out_fpath = './output/'
    if not os.path.exists(out_fpath):
        run_experiment(out_fpath)
    else:
        print('Output filepath exists, dont overwrite it!')
        quit()
167 |
--------------------------------------------------------------------------------
/experiments/weighted_semisup_em_experiment.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.insert(0, '../')
3 |
4 | from data_utils import *
5 | from combination_methods import *
6 | from tqdm.auto import tqdm
7 | from sklearn.model_selection import train_test_split
8 | import calibration as cal
9 | import csv
10 | import os.path
11 |
12 | # note: these experiments do not appear in our paper and may contain outdated code.
13 |
14 |
def run_experiment(out_fpath=None):
    """ Compares oracle, unsupervised EM and weighted semi-supervised EM combiners on Noisy ImageNet.

    For every (model, noise level, epoch setting, trial) the data is split 70/30 into train/test and
    rows of accuracy / ECE metrics are appended to the CSV file at ``out_fpath``:

    * one row for unsupervised EM fit on the whole training split (n_l = 0),
    * one row for the oracle combiner fit on the whole, fully labeled training split (n_u = 0),
    * one row per labeled-set size, where semi-supervised EM is fit once for each value in
      ``unsupervised_weights`` and contributes one column per weight to each metric group.

    Cells for metrics that are not computed in a row are written as ``None``.

    Parameters
    ----------
    out_fpath: string
        path of the CSV file to append to. The header row is written on every call.
    """
    assert out_fpath is not None, 'Must specify output filepath'

    unsupervised_weights = [0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]
    # Placeholder cells for the per-weight columns of rows in which semi-supervised EM is not run
    no_semisup_cells = [None] * len(unsupervised_weights)

    # Create CSV output file if needed, write header
    header = ['model_name', 'noise_level', 'epochs', 'trial', 'n_l', 'n_u',
              'acc_h_te', 'acc_m_te',
              'acc_comb_oracle_te', 'acc_comb_unsup_te']
    header.extend('acc_comb_semisup_te_weight{}'.format(w) for w in unsupervised_weights)
    header.extend(['ece_m_te', 'ece_m_calibrated_oracle_te', 'ece_m_calibrated_unsup_te'])
    header.extend('ece_m_calibrated_semisup_te_weight{}'.format(w) for w in unsupervised_weights)
    header.extend(['ece_combo_oracle_te', 'ece_combo_unsup_te'])
    header.extend('ece_combo_semisup_te_weight{}'.format(w) for w in unsupervised_weights)
    with open(out_fpath, 'a', newline='') as f:
        csv.writer(f).writerow(header)

    # Experiment parameters
    # model_names = ['alexnet', 'densenet161', 'googlenet', 'resnet152', 'vgg19']
    model_names = ['densenet161']
    noise_levels = [80, 95, 110, 125]
    # epochs = [None, 0, 1, 10]
    epochs = [None, 0, 10]
    noise_type = 'phase'

    n_runs = 10

    for model_name in tqdm(model_names, desc='Models', position=0, leave=True):
        for noise_level in tqdm(noise_levels, position=1, leave=False, desc='Noise Levels'):
            for epoch in tqdm(epochs, position=2, leave=False, desc='Epochs'):
                human_ids, y_h, y_true, model_probs = load_old_noisy_imagenet(
                    noise_level, model_name, n_epochs=epoch, noise_type=noise_type)
                for trial in tqdm(range(n_runs), position=3, leave=False, desc='Runs'):
                    # Train/test split 70/30, seeded by the trial index
                    split = train_test_split(y_h, model_probs, y_true, test_size=0.3, random_state=trial)
                    y_h_tr, y_h_te, model_probs_tr, model_probs_te, y_true_tr, y_true_te = split

                    n_tr = y_h_tr.size  # Number of training points

                    # Metrics that do not depend on the number of labeled points
                    acc_h_te = np.mean(y_h_te == y_true_te)
                    y_m_te = np.argmax(model_probs_te, axis=1)
                    acc_m_te = np.mean(y_m_te == y_true_te)
                    ece_m_te = cal.get_ece(model_probs_te, y_true_te)

                    # ----- Edge case n_l = 0: unsupervised EM on the whole training split
                    em_all = UnsupervisedEMCombiner()
                    em_all.fit(model_probs_tr, y_h_tr)
                    probs_em_all_te = em_all.combine_proba(model_probs_te, y_h_te)
                    labels_em_all_te = em_all.combine(model_probs_te, y_h_te)

                    acc_em_all = np.mean(labels_em_all_te == y_true_te)
                    ece_m_em_all = cal.get_ece(em_all.calibrate(model_probs_te), y_true_te)
                    ece_combo_em_all = cal.get_ece(probs_em_all_te, y_true_te)

                    row = [model_name, noise_level, epoch, trial, 0, n_tr,
                           acc_h_te, acc_m_te,
                           None, acc_em_all]
                    row += no_semisup_cells
                    row += [ece_m_te, None, ece_m_em_all]
                    row += no_semisup_cells
                    row += [None, ece_combo_em_all]
                    row += no_semisup_cells
                    with open(out_fpath, 'a', newline='') as f:
                        csv.writer(f).writerow(row)

                    # ----- Edge case n_l = all: oracle combiner on the fully labeled training split
                    oracle_all = OracleCombiner()
                    oracle_all.fit(model_probs_tr, y_h_tr, y_true_tr)
                    probs_oracle_all_te = oracle_all.combine_proba(model_probs_te, y_h_te)
                    labels_oracle_all_te = oracle_all.combine(model_probs_te, y_h_te)

                    acc_oracle_all = np.mean(labels_oracle_all_te == y_true_te)
                    ece_m_oracle_all = cal.get_ece(oracle_all.calibrate(model_probs_te), y_true_te)
                    ece_combo_oracle_all = cal.get_ece(probs_oracle_all_te, y_true_te)

                    row = [model_name, noise_level, epoch, trial, n_tr, 0,
                           acc_h_te, acc_m_te,
                           acc_oracle_all, None]
                    row += no_semisup_cells
                    row += [ece_m_te, ece_m_oracle_all, None]
                    row += no_semisup_cells
                    row += [ece_combo_oracle_all, None]
                    row += no_semisup_cells
                    with open(out_fpath, 'a', newline='') as f:
                        csv.writer(f).writerow(row)

                    n_l_sizes = [10, 50, 100, 250, 500, 1000, 2500, 4500]  # Amount of labeled data to use
                    for n_l in tqdm(n_l_sizes, leave=False, desc='Num. Labels'):
                        # First n_l training points are labeled, the remainder is unlabeled
                        n_u = n_tr - n_l
                        y_h_tr_l = y_h_tr[:n_l]
                        y_h_tr_u = y_h_tr[n_l:]
                        model_probs_tr_l = model_probs_tr[:n_l]
                        model_probs_tr_u = model_probs_tr[n_l:]
                        y_true_tr_l = y_true_tr[:n_l]

                        # ----- Labeled data only
                        oracle_combiner = OracleCombiner()
                        oracle_combiner.fit(model_probs_tr_l, y_h_tr_l, y_true_tr_l)
                        y_comb_prob_oracle_te = oracle_combiner.combine_proba(model_probs_te, y_h_te)
                        y_comb_oracle_te = oracle_combiner.combine(model_probs_te, y_h_te)
                        model_probs_calibrated_oracle_te = oracle_combiner.calibrate(model_probs_te)

                        # ----- Semi-Supervised EM, fit once per unsupervised weight
                        # Each entry maps weight -> (combined probs, combined labels, calibrated model probs)
                        semisup_outputs = {}
                        for weight in tqdm(unsupervised_weights, leave=False, desc='Weights'):
                            semisup_combiner = SemiSupervisedEMCombiner(unsupervised_weight=weight)
                            semisup_combiner.fit(model_probs_tr_u, y_h_tr_u,
                                                 model_probs_tr_l, y_h_tr_l, y_true_tr_l)
                            combined_probs = semisup_combiner.combine_proba(model_probs_te, y_h_te)
                            combined_labels = semisup_combiner.combine(model_probs_te, y_h_te)
                            calibrated_probs = semisup_combiner.calibrate(model_probs_te)
                            semisup_outputs[weight] = (combined_probs, combined_labels, calibrated_probs)

                        # ----- Unsupervised EM (unlabeled portion only)
                        unsup_combiner = UnsupervisedEMCombiner()
                        unsup_combiner.fit(model_probs_tr_u, y_h_tr_u)
                        y_comb_prob_unsup_te = unsup_combiner.combine_proba(model_probs_te, y_h_te)
                        y_comb_unsup_te = unsup_combiner.combine(model_probs_te, y_h_te)
                        model_probs_calibrated_unsup_te = unsup_combiner.calibrate(model_probs_te)

                        # ----- Evaluate accuracies
                        acc_comb_oracle_te = np.mean(y_comb_oracle_te == y_true_te)
                        acc_comb_semisup_te = [np.mean(semisup_outputs[w][1] == y_true_te)
                                               for w in unsupervised_weights]
                        acc_comb_unsup_te = np.mean(y_comb_unsup_te == y_true_te)

                        # ----- Evaluate calibration
                        # ECE of just the (calibrated) model
                        ece_m_calibrated_oracle_te = cal.get_ece(model_probs_calibrated_oracle_te, y_true_te)
                        ece_m_calibrated_unsup_te = cal.get_ece(model_probs_calibrated_unsup_te, y_true_te)
                        ece_m_calibrated_semisup_te = [cal.get_ece(semisup_outputs[w][2], y_true_te)
                                                       for w in unsupervised_weights]

                        # ECE of the combination
                        ece_combo_oracle_te = cal.get_ece(y_comb_prob_oracle_te, y_true_te)
                        ece_combo_unsup_te = cal.get_ece(y_comb_prob_unsup_te, y_true_te)
                        ece_combo_semisup_te = [cal.get_ece(semisup_outputs[w][0], y_true_te)
                                                for w in unsupervised_weights]

                        # Write results to CSV
                        row = [model_name, noise_level, epoch, trial, n_l, n_u,
                               acc_h_te, acc_m_te,
                               acc_comb_oracle_te, acc_comb_unsup_te]
                        row += acc_comb_semisup_te
                        row += [ece_m_te, ece_m_calibrated_oracle_te, ece_m_calibrated_unsup_te]
                        row += ece_m_calibrated_semisup_te
                        row += [ece_combo_oracle_te, ece_combo_unsup_te]
                        row += ece_combo_semisup_te
                        with open(out_fpath, 'a', newline='') as f:
                            csv.writer(f).writerow(row)
180 |
181 |
if __name__ == '__main__':
    # Only run when the output path does not exist yet, so earlier results are never appended to.
    out_fpath = './output/'
    if not os.path.exists(out_fpath):
        run_experiment(out_fpath)
    else:
        print('Output filepath exists, dont overwrite it!')
        quit()
188 |
--------------------------------------------------------------------------------
/imax_calib/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/local/bin/python3
2 | # Copyright (c) 2019 Robert Bosch GmbH
3 | #
4 | # The paper "Multi-Class Uncertainty Calibration via Mutual Information Maximization-based Binning" accepted at ICLR 2021.
5 | # This program is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU Affero General Public License as published
7 | # by the Free Software Foundation, either version 3 of the License, or
8 | # (at your option) any later version.
9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU Affero General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Affero General Public License
16 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
17 | #
18 | # Author: Kanil Patel
19 | # -*- coding: utf-8 -*-
--------------------------------------------------------------------------------
/imax_calib/calibration.py:
--------------------------------------------------------------------------------
1 | #!/usr/local/bin/python3
2 | # Copyright (c) 2021 Robert Bosch GmbH Copyright holder of the paper "Multi-Class Uncertainty Calibration via Mutual Information Maximization-based Binning" accepted at ICLR 2021.
3 | # All rights reserved.
4 | ##
5 | # The paper "Multi-Class Uncertainty Calibration via Mutual Information Maximization-based Binning" accepted at ICLR 2021.
6 | # This program is free software: you can redistribute it and/or modify
7 | # it under the terms of the GNU Affero General Public License as published
8 | # by the Free Software Foundation, either version 3 of the License, or
9 | # (at your option) any later version.
10 | #
11 | # This program is distributed in the hope that it will be useful,
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | # GNU Affero General Public License for more details.
15 | #
16 | # You should have received a copy of the GNU Affero General Public License
17 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
18 | #
19 | # Author: Kanil Patel
20 | # -*- coding: utf-8 -*-
21 | '''
22 | calibration.py
23 | imax_calib
24 |
25 | Created by Kanil Patel on 07/28/20.
26 | Copyright 2020. Kanil Patel. All rights reserved.
27 | '''
28 | import os
29 | import numpy as np
30 | import imax_calib.io as io
31 | import imax_calib.utils as utils
32 | import imax_calib.calibrators.binners as binners
33 | import imax_calib.calibrators.scalers_np as scalers_np
34 |
def learn_calibrator(cfg, logits, logodds, y, feats=None, **kwargs):
    """
    Single entry point for learning a (binning) calibrator.
    Delegates directly to learn_binning(); `feats` is accepted but not used or forwarded.

    Parameters
    ----------
    cfg: io.AttrDict
        config dictionary containing all information.
    logits: numpy ndarray
        raw network logits
    logodds: numpy ndarray
        raw network logodds. use utils.quick_logits_to_logodds(logits) to get them
    y: numpy ndarray
        one-hot target labels
    feats: unused
        ignored by this function
    kwargs: dict
        extra arguments which some calibrators require; passed on to learn_binning()
    Returns
    -------

    cal_obj: calibrators_*.BaseCalibrator
        fitted calibrator object. can be used given logits as input
    """
    return learn_binning(cfg, logits, logodds, y, **kwargs)
61 |
def learn_binning(cfg, logits, logodds, y, **kwargs):
    """
    Same as learn_calibrator() but this func specifically learns the logodds binning methods.

    Seeds numpy's global random generator with cfg.Q_rnd_seed, selects the calibrator class
    from cfg.Q_method and cfg.cal_setting, instantiates it with cfg and fits it.

    Parameters
    ----------
    cfg: io.AttrDict
        config; the attributes Q_rnd_seed, Q_method and (for binning methods) cal_setting are read here.
    logits: numpy ndarray
        passed through to the calibrator's fit()
    logodds: numpy ndarray
        passed through to the calibrator's fit()
    y: numpy ndarray
        passed through to the calibrator's fit()
    kwargs: dict
        extra arguments forwarded to the calibrator's fit()
    Returns
    -------
    cal_obj:
        the calibrator object after fit() has been called on it

    Raises
    ------
    Exception
        if cfg.Q_method is not one of None, "imax", "eqmass", "eqsize".
    ValueError
        if cfg.Q_method is a binning method but cfg.cal_setting is not one of "CW", "top1", "sCW".
    """
    # set all seeds
    np.random.seed(cfg.Q_rnd_seed)

    if cfg.Q_method is None:
        # No quantization requested: fall back to the identity calibrator
        CALIBRATOR = scalers_np.Raw
    elif cfg.Q_method in ("imax", "eqmass", "eqsize"):
        if cfg.cal_setting == "CW":
            CALIBRATOR = binners.HistogramBinninerCW
        elif cfg.cal_setting == "top1":
            CALIBRATOR = binners.HistogramBinninerTop1
        elif cfg.cal_setting == "sCW":
            CALIBRATOR = binners.HistogramBinninerSharedCW
        else:
            # Without this branch CALIBRATOR stays unbound and the call below fails
            # with an opaque UnboundLocalError instead of naming the bad setting.
            raise ValueError("Calibration setting unknown: %r" % (cfg.cal_setting,))
    else:
        raise Exception("Quantization method unknown!")

    cal_obj = CALIBRATOR(cfg)
    cal_obj.fit(logits, logodds, y, **kwargs)
    return cal_obj
85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 |
112 |
113 |
114 |
115 |
116 |
117 |
118 |
119 |
120 |
121 |
--------------------------------------------------------------------------------
/imax_calib/calibrators/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GavinKerrigan/conf_matrix_and_calibration/3b9cf13df58861a87549fe5d36017c1387aceb08/imax_calib/calibrators/__init__.py
--------------------------------------------------------------------------------
/imax_calib/calibrators/scalers_np.py:
--------------------------------------------------------------------------------
1 | #!/usr/local/bin/python3
2 | # Copyright (c) 2021 Robert Bosch GmbH Copyright holder of the paper "Multi-Class Uncertainty Calibration via Mutual Information Maximization-based Binning" accepted at ICLR 2021.
3 | # All rights reserved.
4 | ###
5 | # The paper "Multi-Class Uncertainty Calibration via Mutual Information Maximization-based Binning" accepted at ICLR 2021.
6 | # This program is free software: you can redistribute it and/or modify
7 | # it under the terms of the GNU Affero General Public License as published
8 | # by the Free Software Foundation, either version 3 of the License, or
9 | # (at your option) any later version.
10 | #
11 | # This program is distributed in the hope that it will be useful,
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | # GNU Affero General Public License for more details.
15 | #
16 | # You should have received a copy of the GNU Affero General Public License
17 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
18 | #
19 | # Author: Kanil Patel
20 | # -*- coding: utf-8 -*-
21 | '''
22 | calibrators_np.py
23 | imax_calib
24 |
25 | All calibration methods which require numpy functions during learning of parameters.
26 |
27 | Created by Kanil Patel on 07/27/20.
28 | Copyright 2020. Kanil Patel. All rights reserved.
29 | '''
30 | import numpy as np
31 | import imax_calib.io as io
32 | import imax_calib.utils as utils
33 |
class BaseCalibrator:
    """
    Common interface for calibrators: fit(), calibrate() and parameter (de)serialisation.

    Subclasses list the names of the attributes to persist in self.parameter_list.
    """

    def __init__(self):
        # Names of instance attributes that save_params()/load_params() read and write
        self.parameter_list = []

    def fit(self, logits, logodds, y, **kwargs):
        """
        Learn the model parameters from the input data and labels. Must be overridden.

        Parameters
        ----------
        logits: numpy ndarray
            input data to the calibrator.
        logodds: numpy ndarray
            input data to the calibrator.
        y: numpy ndarray
            target labels
        Raises
        ------
        NotImplementedError
            always, in this base class.
        """
        raise NotImplementedError("Subclass must implement this method.")

    def calibrate(self, logits, logodds, **kwargs):
        """
        Calibrate the data with the parameters learned by fit(). Must be overridden.

        Raises
        ------
        NotImplementedError
            always, in this base class.
        """
        raise NotImplementedError("Subclass must implement this method.")

    def __call__(self, *args, **kwargs):
        # Calling the object is shorthand for calibrate()
        return self.calibrate(*args, **kwargs)

    def save_params(self, fpath):
        """
        Write every attribute named in self.parameter_list to a single file via io.deepdish_write,
        keyed by attribute name. Does nothing when self.parameter_list is empty.

        Parameters
        ----------
        fpath: string
            filepath to save the hdf5 file with model parameters
        """
        if len(self.parameter_list) == 0:
            return
        payload = io.AttrDict()
        for name in self.parameter_list:
            payload[name] = getattr(self, name)
        io.deepdish_write(fpath, payload)
        print(io.pc._OKGREEN("Parameters written to fpath: %s"%(fpath)))

    def load_params(self, fpath):
        """
        Read a file via io.deepdish_read and assign each key named in self.parameter_list as an
        attribute of this object. Does nothing when self.parameter_list is empty.

        Parameters
        ----------
        fpath: string
            filepath of the hdf5 file holding the model parameters
        """
        if len(self.parameter_list) == 0:
            return
        stored = io.deepdish_read(fpath)
        for name in self.parameter_list:
            setattr(self, name, stored[name])
        print(io.pc._OKGREEN("Parameters loaded and updated from fpath: %s"%(fpath)))
104 |
105 |
106 |
107 |
class Raw(BaseCalibrator):
    """
    The raw outputs without any calibration. Identity function.
    """
    def __init__(self, cfg=None):
        # `cfg` is accepted but not used by this class.
        # Run the base initialiser so that self.parameter_list exists; the previous
        # `super(Raw).__init__()` built an unbound super object and never called
        # BaseCalibrator.__init__, so the inherited save_params() raised AttributeError.
        super().__init__()

    def fit(self, logits, logodds, y, **kwargs):
        """
        Nothing to learn; returns self.
        """
        return self

    def calibrate(self, logits, logodds, **kwargs):
        """
        Return the inputs unchanged together with utils.to_sigmoid(logodds).

        Returns
        -------
        tuple
            (logits, logodds, probs)
        """
        probs = utils.to_sigmoid(logodds)
        return logits, logodds, probs

    def load_params(self, fpath):
        """
        No parameters to load; `fpath` is ignored and None is returned.
        """
        return None
124 |
125 |
126 |
127 |
128 |
129 |
130 |
131 |
132 |
133 |
134 |
135 |
136 |
137 |
138 |
139 |
140 |
141 |
142 |
143 |
144 |
145 |
146 |
147 |
148 |
149 |
150 |
151 |
152 |
153 |
154 |
155 |
156 |
157 |
158 |
159 |
160 |
161 |
--------------------------------------------------------------------------------
/imax_calib/clustering.py:
--------------------------------------------------------------------------------
1 | # This source code is from sklearn.cluster.kmean (https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html)
2 | # Copyright (c) 2012-2014 Awesome Inc.
3 | # This source code is licensed under the MIT license found in the
4 | # 3rd-party-licenses.txt file in the root directory of this source tree.
5 |
6 | # Initialization heuristic, copied from sklearn.cluster.kmean
7 | import numpy as np
8 | from numpy.random import RandomState
9 | import scipy.sparse as sp
10 | from scipy import stats
11 | from sklearn.utils.extmath import stable_cumsum, row_norms
12 | from sklearn.metrics.pairwise import euclidean_distances
13 |
def CE_mtx(logits_p_in, logits_q_in):
    """Pairwise binary cross-entropy between two 1-D arrays of logits.

    Entry [i, j] is -sigmoid(p_i) * q_j + softplus(q_j), i.e. the cross-entropy of the
    Bernoulli distribution with logit q_j relative to the one with logit p_i.
    The result has shape (len(logits_p_in), len(logits_q_in)) and dtype float64.
    """
    p_col = logits_p_in.astype(np.float64).reshape(logits_p_in.shape[0], 1)
    q_row = logits_q_in.astype(np.float64).reshape(1, logits_q_in.shape[0])

    # sigmoid(p) written via tanh
    prob_p = 0.5 + 0.5 * np.tanh(p_col / 2.)
    # max(0, q) + log(1 + exp(-|q|)) is an overflow-safe softplus(q)
    result = -q_row * prob_p
    result = result + np.maximum(0., q_row)
    result = result + np.log(1. + np.exp(-abs(q_row)))
    return result
19 |
def KL_mtx(logits_p_in, logits_q_in):
    """Pairwise KL divergence between Bernoulli distributions given by two 1-D arrays of logits.

    Entry [i, j] is (p_i - q_j) * sigmoid(p_i) + softplus(q_j) - softplus(p_i), i.e.
    KL(Bernoulli(sigmoid(p_i)) || Bernoulli(sigmoid(q_j))).
    The result has shape (len(logits_p_in), len(logits_q_in)) and dtype float64.
    """
    p_col = logits_p_in.astype(np.float64).reshape(logits_p_in.shape[0], 1)
    q_row = logits_q_in.astype(np.float64).reshape(1, logits_q_in.shape[0])

    # sigmoid(p) written via tanh
    prob_p = 0.5 + 0.5 * np.tanh(p_col / 2.)
    # max(0, x) + log(1 + exp(-|x|)) is an overflow-safe softplus(x)
    result = (p_col - q_row) * prob_p
    result = result + np.maximum(0., q_row)
    result = result + np.log(1. + np.exp(-abs(q_row)))
    result = result - np.maximum(0., p_col)
    result = result - np.log(1. + np.exp(-abs(p_col)))
    return result
26 |
def JSD_mtx(logits_p, logits_q):
    """Pairwise symmetric (Jensen-Shannon style) divergence between two 1-D arrays of logits.

    For each pair (p_i, q_j) the midpoint m = 0.5 * p_i + 0.5 * q_j is taken in logit space and
    entry [i, j] is 0.5 * KL(p_i || m) + 0.5 * KL(q_j || m), with KL the Bernoulli KL divergence
    expressed in logits. The result has shape (len(logits_p), len(logits_q)).
    """
    def _kl_to_mid(src, mid):
        # KL(Bernoulli(sigmoid(src)) || Bernoulli(sigmoid(mid))) in logit form
        out = (src - mid) * (0.5 + 0.5 * np.tanh(src / 2.))
        out = out + np.maximum(0., mid)
        out = out + np.log(1. + np.exp(-abs(mid)))
        out = out - np.maximum(0., src)
        out = out - np.log(1. + np.exp(-abs(src)))
        return out

    n_p = logits_p.shape[0]
    n_q = logits_q.shape[0]
    p64 = logits_p.astype(np.float64)
    q64 = logits_q.astype(np.float64)

    # KL(p || m), laid out as (n_p, n_q)
    p_col = p64.reshape(n_p, 1)
    mid_pq = q64.reshape(1, n_q) * 0.5 + 0.5 * p_col
    kl_p_side = _kl_to_mid(p_col, mid_pq)

    # KL(q || m), laid out as (n_q, n_p); transposed below to line up with kl_p_side
    q_col = q64.reshape(n_q, 1)
    mid_qp = q_col * 0.5 + 0.5 * p64.reshape(1, n_p)
    kl_q_side = _kl_to_mid(q_col, mid_qp)

    return kl_p_side * 0.5 + kl_q_side.transpose() * 0.5
38 |
39 |
40 |
41 |
42 | def kmeans_pp_init(X, n_clusters, random_state, n_local_trials=None, mode = 'jsd'):
43 | """Init n_clusters seeds according to k-means++
44 |
45 | Parameters
46 | ----------
47 | X : array or sparse matrix, shape (n_samples, n_features)
48 | The data to pick seeds for. To avoid memory copy, the input data
49 | should be double precision (dtype=np.float64).
50 |
51 | n_clusters : integer
52 | The number of seeds to choose
53 |
54 | x_squared_norms : array, shape (n_samples,)
55 | Squared Euclidean norm of each data point.
56 |
57 | random_state : int, RandomState instance
58 | The generator used to initialize the centers. Use an int to make the
59 | randomness deterministic.
60 | See :term:`Glossary `.
61 |
62 | n_local_trials : integer, optional
63 | The number of seeding trials for each center (except the first),
64 | of which the one reducing inertia the most is greedily chosen.
65 | Set to None to make the number of trials depend logarithmically
66 | on the number of seeds (2+log(k)); this is the default.
67 |
68 | Notes
69 | -----
70 | Selects initial cluster centers for k-mean clustering in a smart way
71 | to speed up convergence. see: Arthur, D. and Vassilvitskii, S.
72 | "k-means++: the advantages of careful seeding". ACM-SIAM symposium
73 | on Discrete algorithms. 2007
74 |
75 | Version ported from http://www.stanford.edu/~darthur/kMeansppTest.zip,
76 | which is the implementation used in the aforementioned paper.
77 | """
78 | n_samples, n_features = X.shape
79 | random_state = np.random.RandomState(random_state)
80 | centers = np.empty((n_clusters, n_features), dtype=X.dtype)
81 | center_ids = np.empty((n_clusters,), dtype=np.int64)
82 |
83 | #assert x_squared_norms is not None, 'x_squared_norms None in _k_init'
84 | x_squared_norms = row_norms(X, squared=True)
85 | # Set the number of local seeding trials if none is given
86 | if n_local_trials is None:
87 | # This is what Arthur/Vassilvitskii tried, but did not report
88 | # specific results for other than mentioning in the conclusion
89 | # that it helped.
90 | n_local_trials = 2 + int(np.log(n_clusters))
91 |
92 | # Pick first center randomly
93 | center_id = random_state.randint(n_samples)
94 | #test_id = random_state.randint(n_samples)
95 | #assert test_id != center_id:
96 | center_ids[0] = center_id
97 | if sp.issparse(X):
98 | centers[0] = X[center_id].toarray()
99 | else:
100 | centers[0] = X[center_id]
101 |
102 | # Initialize list of closest distances and calculate current potential
103 | if mode == 'euclidean':
104 | closest_dist_sq = euclidean_distances(centers[0, np.newaxis], X, Y_norm_squared=x_squared_norms, squared=True)
105 | elif mode == 'kl':
106 | #def KL_div(logits_p, logits_q):
107 | # assert logits_p.shape[1] == 1 or logits_q.shape[1] == 1
108 | # return (logits_p - logits_q) * (np.tanh(logits_p/2.) * 0.5 + 0.5) + np.maximum(logits_q, 0.) + np.log(1.+np.exp(-abs(logits_q))) + np.maximum(logits_p, 0.) + np.log(1.+np.exp(-abs(logits_p)))
109 | closest_dist_sq = KL_mtx(X[:,0], centers[0]).transpose()
110 | elif mode == 'ce':
111 | closest_dist_sq = CE_mtx(X[:,0], centers[0]).transpose()
112 | elif mode == 'jsd':
113 | closest_dist_sq = JSD_mtx(X[:,0], centers[0]).transpose()
114 | else:
115 | raise ValueError("Unknown distance in Kmeans++ initialization")
116 |
117 | current_pot = closest_dist_sq.sum()
118 |
119 | # Pick the remaining n_clusters-1 points
120 | for c in range(1, n_clusters):
121 | # Choose center candidates by sampling with probability proportional
122 | # to the squared distance to the closest existing center
123 | rnd_samples = random_state.random_sample(n_local_trials)
124 | test1 = random_state.random_sample(n_local_trials)
125 | rand_vals = rnd_samples * current_pot
126 | assert np.any(abs(test1 - rnd_samples) > 1e-4)
127 |
128 | candidate_ids = np.searchsorted(stable_cumsum(closest_dist_sq), rand_vals)
129 | # XXX: numerical imprecision can result in a candidate_id out of range
130 | np.clip(candidate_ids, None, closest_dist_sq.size - 1, out=candidate_ids)
131 |
132 | # Compute distances to center candidates
133 | if mode == 'euclidean':
134 | distance_to_candidates = euclidean_distances(X[candidate_ids], X, Y_norm_squared=x_squared_norms, squared=True)
135 | elif mode == 'ce':
136 | distance_to_candidates = CE_mtx(X[:,0], X[candidate_ids,0]).transpose()
137 | elif mode == 'kl':
138 | distance_to_candidates = KL_mtx(X[:,0], X[candidate_ids,0]).transpose()
139 | else:
140 | distance_to_candidates = JSD_mtx(X[:,0], X[candidate_ids,0]).transpose()
141 | # update closest distances squared and potential for each candidate
142 | np.minimum(closest_dist_sq, distance_to_candidates, out=distance_to_candidates)
143 | candidates_pot = distance_to_candidates.sum(axis=1)
144 |
145 | # Decide which candidate is the best
146 | best_candidate = np.argmin(candidates_pot)
147 | current_pot = candidates_pot[best_candidate]
148 | closest_dist_sq = distance_to_candidates[best_candidate]
149 | best_candidate = candidate_ids[best_candidate]
150 | center_ids[c] = best_candidate
151 | # Permanently add best center candidate found in local tries
152 | if sp.issparse(X):
153 | centers[c] = X[best_candidate].toarray()
154 | else:
155 | centers[c] = X[best_candidate]
156 |
157 | return centers, center_ids
158 |
159 |
160 |
161 |
162 |
163 |
--------------------------------------------------------------------------------
/imax_calib/evaluations/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GavinKerrigan/conf_matrix_and_calibration/3b9cf13df58861a87549fe5d36017c1387aceb08/imax_calib/evaluations/__init__.py
--------------------------------------------------------------------------------
/imax_calib/evaluations/calibration_metrics.py:
--------------------------------------------------------------------------------
1 | #!/usr/local/bin/python3
2 | # Copyright (c) 2021 Robert Bosch GmbH Copyright holder of the paper "Multi-Class Uncertainty Calibration via Mutual Information Maximization-based Binning" accepted at ICLR 2021.
3 | # All rights reserved.
4 | ###
5 | # The paper "Multi-Class Uncertainty Calibration via Mutual Information Maximization-based Binning" accepted at ICLR 2021.
6 | # This program is free software: you can redistribute it and/or modify
7 | # it under the terms of the GNU Affero General Public License as published
8 | # by the Free Software Foundation, either version 3 of the License, or
9 | # (at your option) any later version.
10 | #
11 | # This program is distributed in the hope that it will be useful,
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | # GNU Affero General Public License for more details.
15 | #
16 | # You should have received a copy of the GNU Affero General Public License
17 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
18 | #
19 | # Author: Kanil Patel
20 | # -*- coding: utf-8 -*-
21 | '''
22 | calibration_metrics.py
23 | evaluations
24 |
25 | Created by Kanil Patel on 07/27/20.
26 | Copyright 2020. Kanil Patel. All rights reserved.
27 | '''
28 | import numpy as np
29 | import imax_calib.hb_utils as hb_utils
30 | import imax_calib.utils as utils
31 | import imax_calib.io as io
32 | from scipy.cluster.vq import kmeans,vq
33 | import scipy.cluster.vq
34 | import os
35 | import contextlib
36 |
37 | from imax_calib.calibrators.binners import run_imax
38 |
39 |
40 |
def compute_top_1_and_CW_ECEs(multi_cls_probs, multi_cls_labels, list_approximators=("dECE", "mECE", "iECE", "kECE"), num_bins=100, threshold_mode='class'):
    """
    Compute the top-1 ECE and the mean class-wise (CW) ECE for every requested estimator.

    For each name in `list_approximators` the module-level function
    `measure_<name>_calibration` is looked up and called once on the top-1
    confidences and once per class on that class' probability column.

    Parameters:
    -----------
    multi_cls_probs: 2D ndarray
        predicted probabilities, one column per class
    multi_cls_labels: 1D or 2D ndarray
        label indices or one-hot labels. Will be converted to one-hot
    list_approximators: iterable of str
        names of the ECE estimators to run (e.g. "dECE", "mECE", "iECE", "kECE")
    num_bins: int
        number of bins handed to every estimator
    threshold_mode: 'class' or None
        'class' uses the threshold 1/n_classes for the class-wise ECEs,
        None uses the threshold 0. The top-1 ECE always uses the estimator's default threshold.

    Return:
    -------
    ece_dict: dict
        Dictionary with the keys "top_1_<name>" and "cw_<name>" for every estimator

    Raises:
    -------
    ValueError
        if `threshold_mode` is not supported or an estimator name is unknown

    """
    assert len(multi_cls_probs.shape)==2
    n_classes = multi_cls_probs.shape[1]
    if len(multi_cls_labels.shape)==1: # not one-hot. so convert to one-hot
        multi_cls_labels = np.eye(n_classes)[multi_cls_labels]

    # Resolve the threshold up front so an unsupported mode fails with a clear
    # message instead of an unbound local variable inside the class loop.
    if threshold_mode == 'class':
        threshold = 1.0/n_classes
    elif threshold_mode is None:
        threshold = 0.
    else:
        raise ValueError("Unknown threshold_mode=%r (expected 'class' or None)" % (threshold_mode,))

    # The top-1 statistics do not depend on the estimator, so compute them once.
    top_1_preds = multi_cls_probs.max(axis=-1)
    top_1_correct = multi_cls_probs.argmax(axis=-1) == multi_cls_labels.argmax(axis=-1)

    ece_evals_dict = io.AttrDict({})
    for ece_approx in list_approximators:
        # Plain name lookup in the module namespace instead of eval(): same
        # dispatch, but an arbitrary string can no longer be executed as code.
        measure = globals().get("measure_%s_calibration" % (ece_approx,))
        if not callable(measure):
            raise ValueError("Unknown ECE approximator: %r" % (ece_approx,))

        top_1_ECE = measure(pred_probs=top_1_preds, correct=top_1_correct, num_bins=num_bins)["ece"]

        cw_ECEs = [
            measure(pred_probs=multi_cls_probs[:, class_idx],
                    correct=multi_cls_labels[:, class_idx],
                    num_bins=num_bins, threshold=threshold)["ece"]
            for class_idx in range(n_classes)
        ]
        mean_cw_ECE = np.mean(cw_ECEs)

        ece_evals_dict["top_1_%s"%(ece_approx)] = top_1_ECE
        ece_evals_dict["cw_%s"%(ece_approx)] = mean_cw_ECE

    return ece_evals_dict
87 |
88 |
89 | def _ece(avg_confs, avg_accs, counts):
90 | """
91 | Helper function to compute the Expected Calibration Error.
92 |
93 | Parameters
94 | ----------
95 | avg_confs: Averaged probability of predictions per bin (confidence)
96 | avg_accs: Averaged true accuracy of predictions per bin
97 | counts: Number of predictions per bin
98 |
99 | Returns
100 | -------
101 | ece: float - calibration error
102 | """
103 | return np.sum((counts / counts.sum()) * np.absolute(avg_confs- avg_accs))
104 |
105 |
def measure_iECE_calibration(pred_probs, correct, num_bins, threshold=-1):
    """
    Compute the calibration curve using I-Max binning scheme. This will run the I-Max algorithm on the TEST set and get the bin boundaries.

    Parameters
    ----------
    pred_probs: numpy ndarray
        predicted probabilities; converted with `utils.to_logodds` before binning
    correct: numpy ndarray
        per-sample label passed to `run_imax` and to the calibration curve
    num_bins: int
        number of bins requested from I-Max (no default, unlike the other estimators)
    threshold: float
        forwarded to calibration_error_and_curve()

    Returns
    -------
    see calibration_error_and_curve()

    """
    logodds = utils.to_logodds(pred_probs)
    # Silence everything run_imax writes to stdout/stderr by redirecting both to os.devnull.
    with open(os.devnull, "w") as f, contextlib.redirect_stdout(f), contextlib.redirect_stderr(f):
        logdata = run_imax(logodds, correct, num_bins, log_every_steps=None, logfpath=None )
    # Use the last logged set of bin boundaries.
    bin_boundaries = logdata.bin_boundaries[-1]
    # Boundaries and data are both in log-odds form here, unlike dECE/mECE which bin the probabilities.
    assigned = hb_utils.bin_data(logodds, bin_boundaries)
    return calibration_error_and_curve(pred_probs, correct, assigned, num_bins, threshold)
129 |
def measure_dECE_calibration(pred_probs, correct, num_bins=100, threshold=-1):
    """
    Calibration curve and error with the equal size binning scheme (i.e. dECE).

    The "eqsize" boundaries are requested from hb_utils, mapped through
    `utils.to_sigmoid`, and the predicted probabilities are binned against them.

    Parameters
    ----------
    see calibration_error_and_curve()
    Returns
    -------
    see calibration_error_and_curve()

    """
    assert pred_probs.ndim == 1
    logit_edges = hb_utils.nolearn_bin_boundaries(num_bins, binning_scheme="eqsize")
    prob_edges = utils.to_sigmoid(logit_edges)
    bin_ids = hb_utils.bin_data(pred_probs, prob_edges)
    return calibration_error_and_curve(pred_probs, correct, bin_ids, num_bins, threshold)
146 |
147 |
def measure_mECE_calibration(pred_probs, correct, num_bins=100, threshold=-1):
    """
    Calibration curve and error with the equal mass binning scheme (i.e. mECE).

    The "eqmass" boundaries are derived from the data itself (after
    `utils.to_logodds`), mapped back through `utils.to_sigmoid`, and the
    predicted probabilities are binned against them.

    Parameters
    ----------
    see calibration_error_and_curve()
    Returns
    -------
    see calibration_error_and_curve()
    """
    assert pred_probs.ndim == 1
    data_logodds = utils.to_logodds(pred_probs)
    logit_edges = hb_utils.nolearn_bin_boundaries(num_bins, binning_scheme="eqmass", x=data_logodds)
    prob_edges = utils.to_sigmoid(logit_edges)
    bin_ids = hb_utils.bin_data(pred_probs, prob_edges)
    return calibration_error_and_curve(pred_probs, correct, bin_ids, num_bins, threshold)
165 |
def measure_kECE_calibration(pred_probs, correct, num_bins=100, threshold=-1):
    """
    Compute the calibration curve using the kmeans binning scheme (i.e. use kmeans to cluster the data and then determine the bin assignments) and computes the calibration error given this binning scheme (i.e. kECE).

    Parameters
    ----------
    pred_probs: numpy ndarray (1D)
        predicted probabilities that are clustered
    correct: numpy ndarray
        per-sample correctness, forwarded to calibration_error_and_curve()
    num_bins: int
        number of k-means centroids requested, i.e. the number of bins
    threshold: float
        forwarded to calibration_error_and_curve()

    Returns
    -------
    see calibration_error_and_curve()
    """

    assert len(pred_probs.shape)==1
    centroids,_ = scipy.cluster.vq.kmeans(pred_probs, num_bins)
    # Assign every sample to its nearest centroid; the centroid index is the bin id.
    cluster_ids, _ = scipy.cluster.vq.vq(pred_probs, centroids)
    # `np.int` was only an alias of the builtin `int` and was removed in
    # NumPy 1.24, so use the builtin directly (same resulting dtype).
    cluster_ids = cluster_ids.astype(int)
    return calibration_error_and_curve(pred_probs, correct, cluster_ids, num_bins, threshold)
183 |
184 |
def measure_quantized_calibration(pred_probs, correct, assigned, num_bins=100, threshold=-1):
    """
    Calibration curve and error for bin assignments supplied by the caller
    (i.e. already quantized values); no binning is done here.

    Parameters
    ----------
    see calibration_error_and_curve()
    Returns
    -------
    see calibration_error_and_curve()
    """
    assert pred_probs.ndim == 1
    return calibration_error_and_curve(
        pred_probs, correct, assigned, num_bins=num_bins, threshold=threshold
    )
191 |
192 |
def calibration_error_and_curve(pred_probs, correct, assigned, num_bins=100, threshold=-1):
    """
    Calibration curve and calibration error for pre-binned predictions.

    Samples whose confidence is not strictly greater than `threshold` are
    dropped before the per-bin statistics are computed.

    Parameters
    ----------
    threshold: float
        confidence cut-off; only samples with pred_probs > threshold are kept
    for the rest see calibration_curve_quantized()

    Returns
    -------
    results: dict
        keys "ece", "prob_pred", "prob_true", "counts", "counts_unfilt",
        "threshold" and "cov" (fraction of samples that passed the threshold)
    """
    assert pred_probs.ndim == 1
    keep = pred_probs > threshold
    kept_probs, kept_correct, kept_assigned = pred_probs[keep], correct[keep], assigned[keep]
    coverage = keep.mean()
    prob_pred, prob_true, counts, counts_unfilt = calibration_curve_quantized(
        kept_probs, kept_correct, assigned=kept_assigned, num_bins=num_bins
    )
    results = {
        "ece": _ece(prob_pred, prob_true, counts),
        "prob_pred": prob_pred,
        "prob_true": prob_true,
        "counts": counts,
        "counts_unfilt": counts_unfilt,
        "threshold": threshold,
        "cov": coverage,
    }
    return results
213 |
214 |
def calibration_curve_quantized(pred_probs, correct, assigned, num_bins=100):
    """
    Per-bin confidence/accuracy statistics for already binned samples.

    Parameters
    ----------
    pred_probs: numpy ndarray
        1D array with predicted probabilities (i.e. confidences)
    correct: numpy ndarray
        0/1 indicating if the sample was correctly classified or not
    assigned: int numpy ndarray
        bin id of every sample
    num_bins: int
        minimum number of bins reported in `counts_unfilt`
    Returns
    -------
    prob_pred: avg. confidence of every non-empty bin
    prob_true: avg. accuracy of every non-empty bin
    counts: number of samples in every non-empty bin
    counts_unfilt: same as `counts` but also including the empty bins
    """
    assert pred_probs.ndim == 1
    conf_totals = np.bincount(assigned, weights=pred_probs, minlength=num_bins)
    hit_totals = np.bincount(assigned, weights=correct, minlength=num_bins)
    counts_unfilt = np.bincount(assigned, minlength=num_bins)
    # Empty bins are dropped so the averages never divide by zero.
    occupied = counts_unfilt > 0
    counts = counts_unfilt[occupied]
    prob_pred = conf_totals[occupied] / counts
    prob_true = hit_totals[occupied] / counts
    return prob_pred, prob_true, counts, counts_unfilt
244 |
245 |
246 |
247 |
248 |
249 |
250 |
251 |
252 |
253 |
254 |
255 |
256 |
257 |
258 |
259 |
260 |
261 |
262 |
263 |
264 |
265 |
266 |
267 |
268 |
269 |
270 |
271 |
272 |
273 |
274 |
275 |
276 |
277 |
278 |
279 |
280 |
281 |
282 |
283 |
284 |
--------------------------------------------------------------------------------
/imax_calib/hb_utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/local/bin/python3
2 | # Copyright (c) 2021 Robert Bosch GmbH Copyright holder of the paper "Multi-Class Uncertainty Calibration via Mutual Information Maximization-based Binning" accepted at ICLR 2021.
3 | # All rights reserved.
4 | ##
5 | # The paper "Multi-Class Uncertainty Calibration via Mutual Information Maximization-based Binning" accepted at ICLR 2021.
6 | # This program is free software: you can redistribute it and/or modify
7 | # it under the terms of the GNU Affero General Public License as published
8 | # by the Free Software Foundation, either version 3 of the License, or
9 | # (at your option) any later version.
10 | #
11 | # This program is distributed in the hope that it will be useful,
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | # GNU Affero General Public License for more details.
15 | #
16 | # You should have received a copy of the GNU Affero General Public License
17 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
18 | #
19 | # Author: Kanil Patel
20 | # -*- coding: utf-8 -*-
21 | '''
22 | hb_utils.py
23 | imax_calib
24 |
25 | Contains all util functions for any histogram binning (hb) operations.
26 |
27 | Created by Kanil Patel on 07/27/20.
28 | Copyright 2020. Kanil Patel. All rights reserved.
29 | '''
30 | import numpy as np
31 | import scipy; import scipy.stats; import scipy.integrate as integrate
32 | import imax_calib.utils as utils
33 | import imax_calib.io as io
34 |
35 | ##################
36 | # Binning utils
37 | #################
def nolearn_bin_boundaries(num_bins, binning_scheme, x=None):
    """
    Get the bin boundaries (in logit space) of the bins. This function returns only the bin boundaries which do not include any type of learning.
    For example: equal mass bins, equal size bins or overlap bins.

    Parameters
    ----------
    num_bins: int
        Number of bins
    binning_scheme: string
        The way the bins should be placed.
            'eqmass': each bin has the same portion of samples assigned to it. Requires that `x is not None`.
            'eqsize': equal spaced bins in `probability` space. Will get equal spaced bins in range [0,1] and then convert to logodds.
            'custom_range[min_lambda,max_lambda]': equal spaced bins in `logit` space given some custom range.
    x: numpy array (1D,)
        array with the 1D data to determine the eqmass bins.

    Returns
    -------
    bins: numpy array (num_bins-1,)
        Returns the bin boundaries. It will return num_bins-1 bin boundaries in logit space. Open ended range on both sides.

    Raises
    ------
    ValueError
        if `binning_scheme` is unknown, if 'eqmass' is requested without 1D
        data in `x`, or if the custom range is not a literal list `[min, max]`
        with min <= max.
    """
    if binning_scheme=="eqmass":
        if x is None or len(x.shape) != 1:
            raise ValueError("binning_scheme='eqmass' requires `x` to be a 1D array")
        quantiles = np.linspace(1.0/num_bins, 1 - 1.0 / num_bins, num_bins-1) # num_bins-1 boundaries for open ended sides
        # data will ensure its in Logit space
        try:
            bins = np.percentile(x, quantiles * 100, method='lower')
        except TypeError:
            # NumPy < 1.22 only knows the old (now deprecated) keyword name.
            bins = np.percentile(x, quantiles * 100, interpolation='lower')
    elif binning_scheme=="eqsize": # equal spacing in logit space is not the same in prob space because of sigmoid non-linear transformation
        bins = utils.to_logodds( np.linspace(1.0/num_bins, 1 - 1.0 / num_bins, num_bins-1) ) # num_bins-1 boundaries for open ended sides
    elif "custom_range" in binning_scheme: # used for example when you want bins at overlap regions. then custom range should be [ min p(y=1), max p(y=0) ]. e.g. custom_range[-5,8]
        import ast
        # literal_eval only accepts Python literals, so the scheme string can
        # no longer execute arbitrary code the way eval() could.
        try:
            custom_range = ast.literal_eval(binning_scheme.replace("custom_range", ""))
        except (ValueError, SyntaxError) as err:
            raise ValueError("Could not parse custom range from binning_scheme=%r" % (binning_scheme,)) from err
        if not isinstance(custom_range, list) or len(custom_range) < 2 or custom_range[0] > custom_range[1]:
            raise ValueError("custom_range must be a list [min, max] with min <= max, got %r" % (custom_range,))
        bins = np.linspace(custom_range[0], custom_range[1], num_bins-1) # num_bins-1 boundaries for open ended sides
    else:
        raise ValueError("Unknown binning_scheme=%r" % (binning_scheme,))
    return bins
71 |
def bin_data(x, bins):
    """
    Given bin boundaries quantize the data (x). Data of any shape is flattened, quantized and then reshaped back to the original shape.
    Returns the following quantized values for num_bins=5 and bins = [2.5, 5.0, 7.5, 10.0]\n
    quantize: \n
              (-inf, 2.5) -> 0\n
              [2.5, 5.0) -> 1\n
              [5.0, 7.5) -> 2\n
              [7.5, 10.0) -> 3\n
              [10.0, inf) -> 4\n

    Parameters
    ----------
    x: numpy ndarray
       Data to quantize (e.g. network logits); array-likes are converted with np.asarray
    bins: numpy ndarray
        location of the (num_bins-1) bin boundaries

    Returns
    -------
    assigned: int numpy ndarray
        For each sample, this contains the bin id (0-indexed) to which the sample belongs. Same shape as `x`.
    """
    x = np.asarray(x)
    # np.digitize returns an index between 0 and len(bins), i.e. a valid bin id
    # in [0, num_bins-1] when bins holds the (num_bins-1) inner boundaries.
    assigned = np.digitize(x.ravel(), bins).reshape(x.shape)
    # `np.int` was an alias of the builtin `int` and was removed in NumPy 1.24.
    return assigned.astype(int)
101 |
102 |
103 |
104 | ######### Quantize data
def quantize_logodds(x, bins, bin_reprs, return_probs=True):
    """
    Quantize logodds (x) using bin boundaries (bins) and reprs in logit space and then convert to prob space if `return_probs=True`.

    Parameters
    ----------
    x: numpy ndarray
       Network logits as numpy array
    bins: numpy ndarray
        Location of the (num_bins-1) bin boundaries
    bin_reprs: numpy ndarray
        Bin representations in logodds space. Contains num_bins = len(bins)+1 entries, one per bin id.
    return_probs: boolean (default: True)
        All operations take place in logodds space. Setting this to true will ensure that the values returned are in probability space (i.e. it will convert the quantized values from logodds to sigmoid before returning them)

    Returns
    -------
    quant_output: numpy ndarray
        The output of the quantization based on the bins and bin_reprs. Either the output will be in logodds space (i.e. return_probs=False) or in probability space.
    assigned: int numpy ndarray
        The bin assignment integers for each sample.
    """
    assigned = bin_data(x, bins) # log space
    quant_output = bin_reprs[assigned] # fill up representations based on assignments
    if return_probs: quant_output = utils.to_sigmoid(quant_output) # prob space
    # The original returned the undefined name `pred_probs`, so every call
    # ended in a NameError; the quantized values are what the contract promises.
    return quant_output, assigned
131 |
132 |
133 | ########### Bin boundary update
def bin_boundary_update_closed_form(representations):
    """
    Closed form update of the bin boundaries from the bin representations.

    With r the representations, the code evaluates
    logphi_a = max(0, r) + log(1 + exp(-|r|)) and logphi_b = max(0, -r) + log(1 + exp(-|r|)),
    takes the differences between neighbouring bins and returns
    log|delta logphi_a| - log|delta logphi_b|, sorted ascending.
    The result has one entry fewer than `representations`.
    """
    tail = 1. + np.exp(-1*np.abs(representations))
    tail[tail==0] = utils.EPS  # guard for the log below
    log_tail = np.log(tail)
    logphi_a = np.maximum(0., representations) + log_tail
    logphi_b = np.maximum(0., -1*representations) + log_tail

    # differences between neighbouring bins (note the opposite direction for b)
    step_a = logphi_a[1:] - logphi_a[:-1]
    step_b = logphi_b[:-1] - logphi_b[1:]
    assert np.any(np.sign(step_a)*np.sign(step_b)>=0.)

    magnitude_a = np.abs(step_a)
    magnitude_b = np.abs(step_b)
    # exact zeros would give log(0) = -inf
    magnitude_a[magnitude_a==0] = utils.EPS
    magnitude_b[magnitude_b==0] = utils.EPS
    return np.sort(np.log(magnitude_a) - np.log(magnitude_b))
150 |
151 |
152 |
153 |
154 | ######### Bin representation code
def bin_representation_calculation(x, y, num_bins, bin_repr_scheme="sample_based", bin_boundaries=None, assigned=None, return_probs=False):
    """
    Compute one representation per bin, usable during the MI maximization.

    Two schemes are supported:
        'sample_based': average of the binary labels `y` in each bin
        'pred_prob_based': average of `utils.to_sigmoid(x)` in each bin

    Parameters
    ----------
    x: numpy ndarray
        logodds data. Binned with `bin_boundaries` when `assigned` is not given and used for the 'pred_prob_based' scheme.
    y: numpy ndarray
        Binary label for each sample
    num_bins: int
        number of bins
    bin_repr_scheme: string
        scheme to use to determine bin reprs. options: 'sample_based' and 'pred_prob_based'
    bin_boundaries: numpy array
        logodds bin boundaries. Only needed when assigned is not given.
    assigned: int numpy array
        bin id assignments for each sample
    return_probs: boolean (default: False)
        forwarded to bin_repr_unknown_LLR()

    Returns
    -------
    quant_reprs: numpy array
        bin representations as returned by bin_repr_unknown_LLR()

    """
    assert (bin_boundaries is None) != (assigned is None), "Cant have or not have both arguments. Need exactly one of them."
    if assigned is None:
        assigned = bin_data(x, bin_boundaries)

    # Pick the per-sample weights that get averaged inside each bin.
    if bin_repr_scheme == "sample_based":
        weights = y  # frequency estimate of correct/incorrect
    elif bin_repr_scheme == "pred_prob_based":
        weights = utils.to_sigmoid(x)  # predicted probability for bin reprs
    else:
        raise Exception("bin_repr_scheme=%s is not valid."%(bin_repr_scheme))
    return bin_repr_unknown_LLR(weights, assigned, num_bins, return_probs)
190 |
def bin_repr_unknown_LLR(sample_weights, assigned, num_bins, return_probs=False):
    """
    Bin representations as the average of the sample weights that fall into each bin.

    For a sample-based repr the weights are 0/1 labels, for a pred-probs-based
    repr they are predicted probabilities. Inputs with more than one dimension
    are flattened first. Bins without any sample get the overall mean of the weights.

    Parameters
    ----------
    sample_weights: numpy ndarray
        weight of each sample, must lie in [0, 1]
    assigned: int numpy array
        bin id of each sample, same shape as `sample_weights`
    num_bins: int
        number of bins
    return_probs: boolean (default: False)
        When False the averages are passed through `utils.to_logodds` before being returned; otherwise they are returned as they are.

    Returns
    -------
    representations: numpy ndarray
        one representation per bin
    """
    weights_shape = sample_weights.shape
    assert np.all(weights_shape==assigned.shape)
    assert sample_weights.max()<=1.0 and sample_weights.min()>=0.0, "make sure sample weights are probabilities"
    if len(weights_shape) > 1:
        sample_weights = sample_weights.ravel()
        assigned = assigned.ravel()

    weight_per_bin = np.bincount(assigned, weights=sample_weights, minlength=num_bins)
    samples_per_bin = np.bincount(assigned, minlength=num_bins)
    occupied = samples_per_bin > 0

    # Start from the prior (overall mean) so empty bins keep a sensible default.
    representations = np.ones(num_bins) * sample_weights.mean()
    representations[occupied] = weight_per_bin[occupied] / samples_per_bin[occupied]

    if return_probs == False:
        # converting to logit domain again
        representations = utils.to_logodds(representations)
    return representations
228 |
def bin_repr_known_LLR(bin_boundaries, prior_y_pos, distr_kde_dict):
    """
    Known Bin reprs (i.e. density based representation) obtained by integrating
    the class densities over each bin.
    Much slower than unknown LLR. so only used when calculating the MI.

    Parameters
    ----------
    bin_boundaries: numpy array
        inner bin boundaries; the outermost bins are closed off at -100 and 100
    prior_y_pos: float
        p(y=1) prior
    distr_kde_dict: dict
        density objects under the keys 'pos' and 'neg'; each must offer `integrate_box_1d(low, high)` (e.g. scipy.stats.gaussian_kde)

    Returns
    -------
    new_reprs: numpy ndarray
        per bin `utils.safe_log_diff(pos_mass, neg_mass, np.log)`; non-finite or zero entries are replaced by utils.EPS
    p_ypos_given_lam: numpy ndarray
        per bin prior_y_pos * integral of the positive density
    p_yneg_given_lam: numpy ndarray
        per bin (1 - prior_y_pos) * integral of the negative density
    """
    kde_pos = distr_kde_dict["pos"]
    kde_neg = distr_kde_dict["neg"]
    prior_y_neg = 1 - prior_y_pos

    edges = np.hstack([-100, bin_boundaries , 100])
    n_bins = len(bin_boundaries) + 1
    new_reprs = np.zeros(n_bins)
    p_ypos_given_lam = np.zeros(n_bins)
    p_yneg_given_lam = np.zeros(n_bins)

    for idx, (low, high) in enumerate(zip(edges[:-1], edges[1:])):
        pos_mass = prior_y_pos*kde_pos.integrate_box_1d(low, high)  # p(lam|y=1)*p(y=1)
        neg_mass = prior_y_neg*kde_neg.integrate_box_1d(low, high)  # p(lam|y=0)*p(y=0)
        new_reprs[idx] = utils.safe_log_diff(pos_mass, neg_mass, np.log)
        p_ypos_given_lam[idx] = pos_mass
        p_yneg_given_lam[idx] = neg_mass

    # Replace unusable representations by a small constant.
    new_reprs[~np.isfinite(new_reprs)] = utils.EPS
    new_reprs[new_reprs==0] = utils.EPS
    return new_reprs, p_ypos_given_lam, p_yneg_given_lam
267 |
268 |
269 |
270 |
271 |
272 |
def MI_unknown_LLR(p_y_pos, logodds, bin_boundaries, representations):
    """
    Negated MI-style objective evaluated from the binned data (unknown LLR).

    `logodds` are the values that get binned with `bin_boundaries`. For each
    class y' the per-bin sum of the predicted class probability, divided by
    the number of samples, weights the term log((1 + exp(+/- representations)) * p(y')).
    The sum over both classes and all bins is returned with a flipped sign.
    """
    sample_probs = utils.to_sigmoid(logodds)
    class_prior = io.AttrDict( dict(pos=p_y_pos, neg=1-p_y_pos) )
    n_bins = len(bin_boundaries)+1
    n_samples = logodds.shape[0]

    # per-bin mass of each class, estimated from the predicted probabilities
    bin_ids = bin_data(logodds, bin_boundaries)
    mass_pos = np.bincount(bin_ids, weights=sample_probs, minlength=n_bins) / n_samples
    mass_neg = np.bincount(bin_ids, weights=1-sample_probs, minlength=n_bins) / n_samples
    class_mass = io.AttrDict(dict(pos=mass_pos, neg=mass_neg))

    total = 0.0
    for class_key, class_label in (("neg", 0), ("pos", 1)):
        # exponent sign is +1 for the negative class and -1 for the positive class
        inside_log = ( 1 + np.exp((1-2*class_label) * representations) ) * class_prior[class_key]
        total += np.sum( class_mass[class_key] * np.log( inside_log ) )
    return -1*total
292 |
293 |
294 |
295 |
def MI_known_LLR(bin_boundaries, p_y_pos, distr_kde_dict):
    """
    Calculate the MI(lambda_hat, y) (using the known LLR), where lambda_hat is the quantized lambdas.
    The MI is computed in bits (log2) from the class densities integrated over each bin.
    Alongside the MI, the bit rate (R) is accumulated as the entropy of the bin probabilities.
    Before returning, basic sanity checks verify that the integrated masses sum up to the expected totals.

    Parameters
    ----------
    bin_boundaries: numpy array
        bin boundaries; the outermost bins are closed off at -100 and 100
    p_y_pos: float
        p(y=1) prior
    distr_kde_dict: dict
        dictionary containing the KDE objects used to estimate the density in each bin with keys 'pos' and 'neg'.

    Returns
    -------
    MI: float
        MI(z, y) where z is quantized lambda. This is the mutual information between the quantizer output to the label.
    R: float
        bin rate. This is MI(z, lambda). Mutual Information between lambda and quantized lambda.
    """
    kde_pos, kde_neg = distr_kde_dict["pos"], distr_kde_dict["neg"]
    p_y_neg = 1 - p_y_pos

    edges = np.hstack([-100, bin_boundaries, 100])
    # collected per bin for the sanity checks at the end
    bin_masses, pos_masses, neg_masses = [], [], []
    MI, R = 0.0, 0.0
    for low, high in zip(edges[:-1], edges[1:]):
        mass_pos = p_y_pos*kde_pos.integrate_box_1d(low, high)  # joint mass of (bin, y=1)
        mass_neg = p_y_neg*kde_neg.integrate_box_1d(low, high)  # joint mass of (bin, y=0)
        bin_logodds = utils.safe_log_diff(mass_pos, mass_neg, np.log)

        posterior_pos = max( utils.EPS, utils.to_sigmoid( bin_logodds) )
        posterior_neg = max( utils.EPS, utils.to_sigmoid(-1*bin_logodds) )

        MI_pos_term = mass_pos * ( utils.safe_log_diff( posterior_pos, p_y_pos, np.log2 ) )
        MI_neg_term = mass_neg * ( utils.safe_log_diff( posterior_neg, p_y_neg, np.log2 ) )
        MI += MI_pos_term + MI_neg_term

        bin_mass = max( utils.EPS, (mass_pos + mass_neg) )
        R += -1 * bin_mass * np.log2(bin_mass)  # entropy term of p(z) = p(z|y=1)p(y=1) + p(z|y=0)p(y=0)

        bin_masses.append(bin_mass)
        pos_masses.append(mass_pos)
        neg_masses.append(mass_neg)

    np.testing.assert_almost_equal( np.sum(bin_masses), 1.0 , decimal=1)
    np.testing.assert_almost_equal( np.sum(pos_masses), p_y_pos, decimal=1)
    np.testing.assert_almost_equal( np.sum(neg_masses), p_y_neg, decimal=1)
    return MI, R
351 |
352 |
def MI_upper_bounds(p_y_pos, distr_kde_dict):
    """
    Calculate two upper bounds of MI(z, y), where z is the quantized version of lambda.

    Bound 1: MI(z, y) <= H(y) - H(y|z) <= H(y)
    Bound 2: MI(z, y) <= MI(lambda, y)   (tighter than H(y))

    Bound 2 is obtained by numerically integrating over lambda in [-100, 100].
    The elapsed time is printed before returning.

    Parameters
    ----------
    p_y_pos: float
        p(y=1) prior
    distr_kde_dict: dict
        dictionary containing the KDE objects (keys 'pos' and 'neg') whose `pdf` gives the class densities.

    Returns
    -------
    H_y: float
        Loose upper bound which is H(y)
    MI_y_lambda: float
        Upper bound of MI(z, y) which is upper bounded by MI(lambda, y). Tighter bound than H(y)

    """
    started = io.time.time()
    p_y_neg = 1 - p_y_pos

    # Bound 1: entropy of the label prior (in bits)
    H_y = -1*p_y_pos*np.log2(p_y_pos) + -1*p_y_neg*np.log2(p_y_neg)

    # Bound 2: MI between the unquantized lambda and the label
    kde_pos, kde_neg = distr_kde_dict["pos"], distr_kde_dict["neg"]

    def posterior_logodds(lam):
        # log of the ratio between the two prior-weighted class densities at lam
        weighted_pos = p_y_pos * kde_pos.pdf(lam)
        weighted_neg = p_y_neg * kde_neg.pdf(lam)
        return utils.safe_log_diff(weighted_pos, weighted_neg, np.log)

    def integrand_pos(lam):
        posterior = utils.to_sigmoid( posterior_logodds(lam) )
        return p_y_pos * kde_pos.pdf(lam) * utils.safe_log_diff( posterior, p_y_pos, np.log2)

    def integrand_neg(lam):
        posterior = utils.to_sigmoid( -1* posterior_logodds(lam) )
        return p_y_neg * kde_neg.pdf(lam) * utils.safe_log_diff(posterior, p_y_neg, np.log2)

    term_pos = integrate.quad(integrand_pos, -100, 100, limit=100)[0]
    term_neg = integrate.quad(integrand_neg, -100, 100, limit=100)[0]
    MI_y_lambda = term_pos + term_neg

    finished = io.time.time()
    print("Time elapsed: upper bound computation: ", (finished-started), " seconds!")
    return H_y, MI_y_lambda
408 |
409 |
410 |
411 |
412 |
413 |
414 |
415 |
416 |
417 |
418 |
--------------------------------------------------------------------------------
/imax_calib/io.py:
--------------------------------------------------------------------------------
1 | #!/usr/local/bin/python3
2 | # Copyright (c) 2021 Robert Bosch GmbH Copyright holder of the paper "Multi-Class Uncertainty Calibration via Mutual Information Maximization-based Binning" accepted at ICLR 2021.
3 | # All rights reserved.
4 | ##
5 | # The paper "Multi-Class Uncertainty Calibration via Mutual Information Maximization-based Binning" accepted at ICLR 2021.
6 | # This program is free software: you can redistribute it and/or modify
7 | # it under the terms of the GNU Affero General Public License as published
8 | # by the Free Software Foundation, either version 3 of the License, or
9 | # (at your option) any later version.
10 | #
11 | # This program is distributed in the hope that it will be useful,
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | # GNU Affero General Public License for more details.
15 | #
16 | # You should have received a copy of the GNU Affero General Public License
17 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
18 | #
19 | # Author: Kanil Patel
20 | # -*- coding: utf-8 -*-
21 | """
22 | Created on Tue Mar 20 10:03:33 2018
23 |
24 | @author: pak2rng
25 | """
26 | import os
27 | import numpy as np
28 | from attrdict import AttrDict
29 | import deepdish
30 | import time
31 |
def deepdish_read(fpath, group=None):
    ''' Load the content of an hdf5 file (optionally a single group); dict results are wrapped in an AttrDict. '''
    loaded = deepdish.io.load(fpath, group=group)
    return AttrDict(loaded) if isinstance(loaded, dict) else loaded
38 |
def deepdish_write(fpath, data):
    ''' Save a dictionary as a hdf5 file, creating the parent directory of fpath if needed. '''
    # The helper previously called here (create_dir_for_fpath) is not defined in this
    # module, so the directory is created directly with the standard library.
    parent_dir = os.path.dirname(fpath)
    if parent_dir:  # empty when fpath is a bare filename in the current directory
        os.makedirs(parent_dir, exist_ok=True)
    deepdish.io.save(fpath, data, compression="None")
43 |
44 |
45 |
46 |
class Logger:
    """Collects values under string keys and writes them to an hdf5 file."""

    def __init__(self, fpath):
        # fpath is the destination passed to deepdish_write by save_log()
        self.logdata = AttrDict({})
        self.fpath = fpath

    def log(self, key, value):
        """Append value to the list kept under key (the list is created on first use)."""
        if key in self.logdata:
            self.logdata[key].append(value)
        else:
            self.logdata[key] = [value]

    def last(self, key):
        """Return the most recently logged value for key."""
        history = self.logdata[key]
        return history[-1]

    def log_dict(self, dictionary, suffix=""):
        """Log every item of dictionary under '<key>_<suffix>' (no suffix when it is empty)."""
        # prepend the underscore separator unless the caller already supplied it
        if suffix != "" and suffix[0] != "_":
            suffix = "_%s"%(suffix)
        for name, entry in dictionary.items():
            self.log(name + suffix, entry)

    def end_log(self):
        """Convert every list-valued entry into a numpy array; other entries are left as they are."""
        for key in list(self.logdata.keys()):
            entry = self.logdata[key]
            if isinstance(entry, list):
                self.logdata[key] = np.array(entry)

    def save_log(self):
        """Write the collected data to self.fpath."""
        deepdish_write(self.fpath, self.logdata)
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
--------------------------------------------------------------------------------
/imax_calib/utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/local/bin/python3
2 | # Copyright (c) 2021 Robert Bosch GmbH Copyright holder of the paper "Multi-Class Uncertainty Calibration via Mutual Information Maximization-based Binning" accepted at ICLR 2021.
3 | # All rights reserved.
4 | ###
5 | # The paper "Multi-Class Uncertainty Calibration via Mutual Information Maximization-based Binning" accepted at ICLR 2021.
6 | # This program is free software: you can redistribute it and/or modify
7 | # it under the terms of the GNU Affero General Public License as published
8 | # by the Free Software Foundation, either version 3 of the License, or
9 | # (at your option) any later version.
10 | #
11 | # This program is distributed in the hope that it will be useful,
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | # GNU Affero General Public License for more details.
15 | #
16 | # You should have received a copy of the GNU Affero General Public License
17 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
18 | #
19 | # Author: Kanil Patel
20 | # -*- coding: utf-8 -*-
21 | '''
22 | utils.py
23 | imax_calib
24 |
25 | Created by Kanil Patel on 07/06/20.
26 | Copyright 2020. Kanil Patel. All rights reserved.
27 | '''
28 | import numpy as np
29 | import sklearn.model_selection
30 |
31 | #EPS = np.finfo(float).eps # used to avoid division by zero
32 | EPS = 1e-50
33 |
def is_sorted(a):
    """Return True when no element of the 1D array `a` is smaller than its predecessor."""
    return not any(a[i + 1] < a[i] for i in range(a.size - 1))
38 |
39 |
def to_softmax(x, axis=-1):
    """
    Numerically stable softmax in numpy, applied along `axis` (last dimension by default).
    The maximum along `axis` is subtracted before exponentiating to avoid overflow.

    Parameters
    ----------
    x : numpy ndarray
        Logits of the network as numpy array.
    axis: int
        Dimension along which to apply the operation (default: last one)

    Returns
    -------
    softmax: numpy ndarray
        Softmax output
    """
    exps = np.exp(x - np.max(x, axis=axis, keepdims=True))
    softmax = exps / np.sum(exps, axis=axis, keepdims=True)
    assert np.all( np.isfinite(softmax) ) == True , "Softmax output contains NaNs. Handle this."
    return softmax
63 |
def to_sigmoid(x):
    """
    Stable sigmoid in numpy, computed through the identity sigmoid(x) = 0.5 + 0.5*tanh(x/2).

    Parameters
    ----------
    x : numpy ndarray
        Logits of the network as numpy array.

    Returns
    -------
    sigmoid: numpy ndarray
        Sigmoid output
    """
    half_tanh = 0.5 * np.tanh(x/2)
    sigmoid = 0.5 + half_tanh
    assert np.all( np.isfinite(sigmoid) ) == True , "Sigmoid output contains NaNs. Handle this."
    return sigmoid
81 |
def to_logodds(x):
    """

    Convert probabilities to logodds using:

    .. math::
        \\log\\frac{p}{1-p} ~ \\text{where} ~ p \\in [0,1]

    Natural log. Inputs are clipped to [1e-10, 1 - 1e-10] first so that neither
    p nor 1-p is ever zero.

    Parameters
    ----------
    x : numpy ndarray
        Class probabilties as numpy array.

    Returns
    -------
    logodds : numpy ndarray
        Logodds output

    """
    clipped = np.clip(x, 1e-10, 1 - 1e-10)
    assert clipped.max() <= 1 and clipped.min() >= 0
    # log(p) - log(1 - p)
    logodds = safe_log_diff(clipped, 1-clipped, np.log)
    assert np.all(np.isfinite(logodds))==True, "Logodds output contains NaNs. Handle this."
    return logodds
112 |
def safe_log_diff(A, B, log_func=np.log):
    """
    Numerically stable log(A/B), computed as log_func(A) - log_func(B) while avoiding log(0).

    Scalar A: a zero operand is replaced by EPS; if both are zero, log_func(EPS) is returned.
    Array A: every position where A or B is zero is set to log_func(EPS).
    NOTE(review): the array branch therefore treats a single zero operand differently
    from the scalar branch - confirm this is intended.
    """
    if np.isscalar(A):
        if A==0 and B==0:
            return log_func(EPS)
        log_a = log_func(EPS) if A==0 else log_func(A)
        log_b = log_func(EPS) if B==0 else log_func(B)
        return log_a - log_b

    a_is_zero = (A==0)
    b_is_zero = (B==0)
    # log_func is still evaluated on the zero entries; only silence the resulting warning
    with np.errstate(divide='ignore'):
        floor = log_func(EPS)
        log_a = np.where(a_is_zero, floor, log_func(A))
        log_b = np.where(b_is_zero, floor, log_func(B))
    output = log_a - log_b
    output[ np.logical_or(a_is_zero, b_is_zero) ] = floor
    assert np.all(np.isfinite(output))
    return output
133 |
134 |
135 |
136 |
def quick_logits_to_logodds(logits, probs=None):
    """
    Convert logits to logodds, choosing the conversion by the number of classes.

    With at most 100 classes the log-sum-exp route (logits_to_logodds) is used. Above that
    it can be slow, so the logodds are derived from the probabilities instead; `probs` is
    computed with to_softmax when it is not supplied.
    """
    n_classes = logits.shape[-1]
    if n_classes <= 100:
        return logits_to_logodds(logits)
    # many classes (e.g. imagenet): go through the probabilities
    if probs is None:
        probs = to_softmax(logits)
    return probs_to_logodds(probs)
148 |
def probs_to_logodds(x):
    """
    Convert probabilities (values in [0, 1]) to logodds via to_logodds.
    Faster than logits_to_logodds.
    """
    lowest, highest = x.min(), x.max()
    assert highest <= 1 and lowest >= 0
    logodds = to_logodds(x)
    assert np.all(np.isfinite(logodds))
    return logodds
157 |
def logits_to_logodds(x):
    """
    Convert network logits directly to logodds (without conversion to probabilities and then back to logodds) using:

    .. math::
        \\lambda_k=z_k-\\log\\sum\\nolimits_{k'\\not = k}e^{z_{k'}}

    Parameters
    ----------
    x: numpy ndarray
        Network logits as numpy array. The number of classes is read from x.shape[1]
        and classes are indexed along the last axis.

    Returns
    -------
    logodds : numpy ndarray
        Logodds output, one column per class
    """
    def _class_logodds(class_id):
        # z_k minus the log-sum-exp over all the other classes
        others = np.delete(x, class_id, axis=-1)
        own = x[...,class_id][..., np.newaxis]
        return (own - custom_logsumexp(others, axis=-1)).reshape(-1)

    n_classes = x.shape[1]
    logodds = np.stack( [_class_logodds(class_id) for class_id in range(n_classes)], axis=1 )
    assert np.all(np.isfinite(logodds))
    return logodds
186 |
def custom_logsumexp(x, axis=-1):
    """
    log(sum(exp(x))) along `axis`, computed with the log-sum-exp trick.

    Parameters
    ----------
    x: numpy ndarray
        Network logits as numpy array

    axis: int (default -1)
        axis along which to take the sum

    Returns
    -------
    out: numpy ndarray
        log-sum-exp of x along `axis` (the reduced axis is kept with size 1)
    """
    shift = np.amax(x, axis=axis, keepdims=True)
    shift[~np.isfinite(shift)] = 0  # do not shift by inf/nan maxima
    exp_sum = np.exp(x - shift).sum(axis=axis, keepdims=True)
    exp_sum[exp_sum<=0] = EPS # only add epsilon when argument is zero
    out = np.log(exp_sum)
    out += shift
    return out
212 |
213 |
214 |
215 |
216 |
217 |
def to_onehot(y, num_classes):
    """
    Convert 1D targets to one-hot repr.

    Parameters
    ----------
    y: numpy 1D-array
        Array with sample target ids (i.e. 0 to num_classes-1)
    num_classes: int
        Number of classes

    Returns
    -------
    y_onehot: numpy ndarray
        One-hot representation of target labels, one row per entry of y
    """
    assert len(y.shape)==1
    # row k of the identity matrix is the one-hot vector of class k
    identity = np.eye(num_classes)
    return identity[y]
237 |
238 |
def binary_convertor(logodds, y, cal_setting, class_idx):
    """
    Reduce multi-class logodds (and, when given, labels `y`) to a binary problem.
    `y` may be None, in which case None is returned for the labels. Settings:
    1) CW - slice out one class: cal_setting="CW" and class_idx is an int
    2) top1 - highest-scoring class of each sample; the label is whether argmax(y) equals it:
       cal_setting="top1" and class_idx is None
    3) sCW - all class columns concatenated into one vector: cal_setting="sCW" and class_idx is None
    """
    if cal_setting=="CW":
        assert class_idx is not None, "class_idx needs to be an integer to slice out class needed for CW calibration setting"
        y_c = None if y is None else y[..., class_idx]
        return logodds[..., class_idx], y_c

    if cal_setting=="top1":
        assert class_idx is None, "class_idx needs to be None - check"
        winners = logodds.argmax(axis=-1)
        rows = np.arange(winners.shape[0])
        y_c = None if y is None else (y.argmax(axis=-1) == winners)
        return logodds[rows, winners], y_c

    if cal_setting=="sCW":
        assert class_idx is None, "class_idx needs to be None - check"
        y_c = None if y is None else np.concatenate(y.T)
        return np.concatenate(logodds.T), y_c

    raise Exception("Calibration setting (%s) not recognized!"%(cal_setting))
264 |
265 |
266 |
267 |
268 |
269 |
270 |
271 |
272 |
273 |
274 |
275 |
276 |
277 |
278 |
279 |
280 |
281 |
282 |
283 |
284 |
285 |
286 |
287 |
288 |
289 |
290 |
291 |
292 |
293 |
294 |
295 |
296 |
297 |
298 |
299 |
300 |
301 |
302 |
303 |
304 |
305 |
306 |
307 |
308 |
--------------------------------------------------------------------------------
/metrics.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | from torch import nn
4 | from imax_calib.evaluations import calibration_metrics as cal_metrics # Imax paper
5 | import calibration as cal # Kumar et al, Verified Uncertainty Calibration
6 |
7 | # Implements various metrics.
8 |
9 |
def get_acc(y_pred, y_true):
    """ Computes the accuracy of predictions.

    Args:
        y_pred: Either predicted labels of shape (n_samples, ), or a matrix of scores
            (e.g. probabilities) of shape (n_samples, n_classes), in which case the
            predicted label is the argmax of each row.
        y_true: True labels, shape (n_samples, )

    Returns:
        Fraction of samples whose predicted label equals the true label.

    Raises:
        ValueError: if y_pred is neither 1D nor 2D.
    """
    if y_pred.ndim == 1:
        return np.mean(y_pred == y_true)
    if y_pred.ndim == 2:
        # Compare the arg-max class with the labels (the labels must not be passed
        # to np.mean as its `axis` argument).
        return np.mean(np.argmax(y_pred, axis=1) == y_true)
    raise ValueError(f"y_pred must be 1D or 2D, got ndim={y_pred.ndim}")
18 |
19 |
def get_cw_ECE(probs, y_true, mode='mass', threshold_mode='class', num_bins=15):
    """ Estimates the class-wise ECE by binning.

    Args:
        probs: shape (n_samples, n_classes)
        y_true: shape (n_samples, )
        mode: Either 'mass' or 'width' -- determines binning scheme
        threshold_mode: Either 'class' or None -- determines if thresholding is used in estimation
        num_bins: Number of bins used in estimation

    Raises:
        ValueError: if mode is not 'mass' or 'width'.
    """
    # Name of the ECE approximator for each binning scheme
    approximators = {'mass': 'mECE', 'width': 'dECE'}
    if mode not in approximators:
        # An unknown mode previously left _mode unbound and failed with a confusing
        # UnboundLocalError further down.
        raise ValueError(f"mode must be 'mass' or 'width', got {mode!r}")
    _mode = approximators[mode]

    evals = cal_metrics.compute_top_1_and_CW_ECEs(probs, y_true, list_approximators=[_mode],
                                                  num_bins=num_bins, threshold_mode=threshold_mode)
    return evals[f'cw_{_mode}']
39 |
40 |
def get_ECE(probs, y_true, mode='mass', num_bins=15):
    """ Estimates the top-label ECE by binning.

    Args:
        probs: shape (n_samples, n_classes)
        y_true: shape (n_samples, )
        mode: Either 'mass' or 'width' -- determines binning scheme
        num_bins: Number of bins used in estimation

    Raises:
        ValueError: if mode is not 'mass' or 'width'.
    """
    # Name of the ECE approximator for each binning scheme
    approximators = {'mass': 'mECE', 'width': 'dECE'}
    if mode not in approximators:
        # An unknown mode previously left _mode unbound and failed with a confusing
        # UnboundLocalError further down.
        raise ValueError(f"mode must be 'mass' or 'width', got {mode!r}")
    _mode = approximators[mode]

    evals = cal_metrics.compute_top_1_and_CW_ECEs(probs, y_true, list_approximators=[_mode], num_bins=num_bins)
    return evals[f'top_1_{_mode}']
57 |
58 |
def get_MCE(probs, y_true):
    """ Estimates the class-wise ECE. Not recommended for use.

    Delegates to the `calibration` package's get_calibration_error with
    p=1, debias=False and mode='marginal'.
    """
    settings = dict(p=1, debias=False, mode='marginal')
    return cal.get_calibration_error(probs, y_true, **settings)
64 |
65 |
def get_NLL(probs, y_true):
    """ Computes the negative log likelihood, averaged over samples.

    Args:
        probs: array of class probabilities, shape (n_samples, n_classes)
        y_true: array of integer class labels, shape (n_samples, )
    """
    # Clip away exact zeros so the log stays finite
    clipped = np.clip(probs, 1e-100, 1)
    log_probs = torch.from_numpy(np.log(clipped))
    targets = torch.from_numpy(y_true)
    criterion = nn.NLLLoss()
    return criterion(log_probs, targets).item()
73 |
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from torch import nn
3 | from sklearn.metrics import confusion_matrix
4 |
5 | # This file implements various utility functions.
6 |
7 |
def get_human_labels_outcomes(human_counts, true_labels, seed=0):
    """ Converts from the counts to an ordered list of votes. Also computes the 0/1 Bernoulli outcomes.

    Each row of human_counts (per-class vote counts) is expanded into individual votes and
    shuffled with a generator seeded by `seed`. Every row is truncated to the smallest
    number of votes found in any row. Returns (human_labels, human_outcomes), where an
    outcome is 1 when the vote equals the true label of that row.
    """
    rng = np.random.default_rng(seed)

    votes_per_row = np.sum(human_counts, axis=1)
    n_kept = int(min(votes_per_row))
    n_rows, n_classes = human_counts.shape[0], human_counts.shape[1]

    human_labels = np.empty(shape=(n_rows, n_kept))
    human_outcomes = np.empty(shape=(n_rows, n_kept))
    for row in range(n_rows):
        # one entry per vote, grouped by class, then put in random order
        votes = [cls for cls in range(n_classes) for _ in range(int(human_counts[row, cls]))]
        rng.shuffle(votes)
        human_labels[row, :] = votes[:n_kept]
        human_outcomes[row, :] = (human_labels[row, :] == true_labels[row]).astype(int)

    return human_labels, human_outcomes
29 |
30 |
def simulate_single_human(human_counts, seed=0):
    """ Simulates one human labeler: returns, per row, the first vote of the shuffled vote list.

    Each row of human_counts (per-class vote counts) is expanded into individual votes and
    shuffled with a generator seeded by `seed`. The result is an integer array with one
    label per row.
    """
    rng = np.random.default_rng(seed)

    votes_per_row = np.sum(human_counts, axis=1)
    n_kept = int(min(votes_per_row))
    n_rows, n_classes = human_counts.shape[0], human_counts.shape[1]

    human_labels = np.empty(shape=(n_rows, n_kept))
    for row in range(n_rows):
        # one entry per vote, grouped by class, then put in random order
        votes = [cls for cls in range(n_classes) for _ in range(int(human_counts[row, cls]))]
        rng.shuffle(votes)
        human_labels[row, :] = votes[:n_kept]

    first_votes = human_labels[:, 0]
    return first_votes.astype(int)
48 |
49 |
class SoftLogLoss(nn.Module):
    # "Soft-log-loss" for use with the EM algorithm: cross entropy against a
    # weight matrix instead of hard labels.
    def __init__(self):
        super().__init__()

    def forward(self, input, target):
        # input:  tensor of model logits (n_samples, n_cls)
        # target: tensor of weight matrix (n_samples, n_cls)
        # c.f. https://github.com/pytorch/pytorch/issues/7455
        log_probs = nn.functional.log_softmax(input, dim=-1)
        # weighted log-likelihood per sample, then the mean over samples
        per_sample = (log_probs * target).sum(dim=-1)
        return -1. * per_sample.mean()
62 |
63 |
def get_model_confidence_ratio(model_probs, y_true, h=None, y=None, y_h=None, mode='diff'):
    """ Computes, per selected sample, the model's true-class confidence divided by a denominator.

    Args:
        model_probs: model probabilities, shape (n_samples, n_classes)
        y_true: true labels, shape (n_samples, )
        h: if given, keep only samples whose human label y_h equals h
        y: if given, keep only samples whose true label equals y
        y_h: human labels, shape (n_samples, ); required when h is given
        mode: 'max' or 'diff' -- determines denominator. 'max' uses the largest confidence
            among the other classes, 'diff' uses 1 - (true class confidence)

    Returns:
        Array with one ratio per selected sample.

    Raises:
        ValueError: if mode is not 'max' or 'diff', or if h is given without y_h.
    """
    if mode not in ('max', 'diff'):
        raise ValueError(f"mode must be 'max' or 'diff', got {mode!r}")
    if (h is not None) and (y_h is None):
        raise ValueError("y_h must be provided when conditioning on h")

    if (h is None) and (y is None):  # Unconditional
        idxs = np.ones(y_true.size, dtype=bool)
    elif h is None:  # Distribution conditioned on Y only
        idxs = (y_true == y)
    elif y is None:  # Distribution conditioned on h only
        idxs = (y_h == h)
    else:  # Distribution conditioned on y and h
        idxs = np.logical_and((y_true == y), (y_h == h))

    # Keep probabilities away from 0 and 1 so the ratio stays finite
    eps = 1e-16
    model_probs = model_probs.clip(eps, 1. - eps)

    _model_probs = model_probs[idxs]
    _y_true = y_true[idxs]
    n = _y_true.size

    model_confidence_ratio = np.empty(n)
    # Iterate over the *filtered* probabilities and labels together, so row i is always
    # paired with its own true label.
    for i, (probs_i, label_i) in enumerate(zip(_model_probs, _y_true)):
        true_class_conf = probs_i[label_i]
        if mode == 'max':
            denom = np.max(np.delete(probs_i, label_i))
        else:  # mode == 'diff'
            denom = 1. - true_class_conf
        model_confidence_ratio[i] = true_class_conf / denom

    return model_confidence_ratio
94 |
95 |
def get_human_confidence_ratio(y_h_tr, y_true_tr, y_h_te, y_true_te, n_cls, h=None, y=None, mode='diff'):
    """ Computes human confidence ratios from a confusion matrix estimated on training data.

    Args:
        y_h_tr: human labels used to estimate the confusion matrix
        y_true_tr: true labels used to estimate the confusion matrix
        y_h_te: human labels of the evaluated samples, shape (n_samples, )
        y_true_te: true labels of the evaluated samples, shape (n_samples, )
        n_cls: number of classes
        h: if given, keep only evaluated samples whose human label equals h
        y: if given, keep only evaluated samples whose true label equals y
        mode: 'max' or 'diff' -- determines denominator (ignored when both h and y are given)

    Returns:
        A single ratio P(h|y) / (1 - P(h|y)) when both h and y are given, otherwise an
        array with one ratio per selected sample.

    Raises:
        ValueError: if a per-sample ratio is requested and mode is not 'max' or 'diff'.
    """
    # Estimate human confusion matrix
    # Entry [i, j] is #(Y = i and h = j)
    conf_h = 1. * confusion_matrix(y_true_tr, y_h_tr, labels=np.arange(n_cls))
    # Swap so entry [i, j] is #(h = i and Y = j)
    conf_h = conf_h.T
    eps = 1e-50
    conf_h = np.clip(conf_h, eps, None)
    normalizer = np.sum(conf_h, axis=0, keepdims=True)
    # Normalize columns so entry [i, j] is P(h = i | Y = j)
    conf_h /= normalizer

    if (h is not None) and (y is not None):  # Distribution conditioned on y and h
        return conf_h[h, y] / (1. - conf_h[h, y])

    if mode not in ('max', 'diff'):
        raise ValueError(f"mode must be 'max' or 'diff', got {mode!r}")

    if (h is None) and (y is None):  # Unconditional
        idxs = np.ones(y_true_te.size, dtype=bool)
    elif h is None:  # Distribution conditioned on Y only
        idxs = (y_true_te == y)
    else:  # Distribution conditioned on h only
        idxs = (y_h_te == h)

    # Filter human and true labels with the SAME mask so they stay aligned per sample.
    _y_true = y_true_te[idxs]
    _y_h = y_h_te[idxs]
    n = _y_true.size

    human_confidence_ratio = np.empty(n)
    for i, (h_i, y_i) in enumerate(zip(_y_h, _y_true)):
        true_class_conf = conf_h[h_i, y_i]
        if mode == 'max':
            # largest P(h = h_i | Y = j) over the classes j other than the true one
            denom = np.max(np.delete(conf_h[h_i, :], y_i))
        else:  # mode == 'diff'
            denom = 1. - true_class_conf
        human_confidence_ratio[i] = true_class_conf / denom

    return human_confidence_ratio
129 |
130 |
def get_dirichlet_params(acc, strength, n_cls):
    # acc: desired accuracy encoded by the prior
    # strength: strength of prior
    # n_cls: number of classes
    #
    # Returns alpha,beta where the prior is Dir((beta, beta, . . . , alpha, . . . beta))
    # where the alpha appears for the correct class

    # Unscaled pseudo-counts, chosen so that alpha / (alpha + (n_cls - 1) * beta) == acc
    base_beta = 0.1
    base_alpha = base_beta * (n_cls - 1) * acc / (1. - acc)

    # Scale by the prior strength, then add one to each parameter
    alpha = base_alpha * strength + 1
    beta = base_beta * strength + 1
    return alpha, beta
148 |
149 |
def diversity(y1, y2, y_t):
    # Compares two sets of predictions (y1, y2) against the true labels y_t.
    # Returns the fractions of samples where: both are correct, both are incorrect,
    # only y1 is correct, only y2 is correct.
    first_ok = (y1 == y_t)
    second_ok = (y2 == y_t)
    total = y_t.size

    def _fraction(first_state, second_state):
        # share of samples where the outcomes equal the requested 0/1 states
        return sum((first_ok == first_state) & (second_ok == second_state)) / total

    return _fraction(1, 1), _fraction(0, 0), _fraction(1, 0), _fraction(0, 1)
--------------------------------------------------------------------------------