├── README.md ├── calibrators.py ├── cifar10h ├── densenet-bc-L190-k40.csv ├── human_model_truth_cifar10h.csv ├── preresnet-110.csv └── resnet-110.csv ├── combination_methods.py ├── data_utils.py ├── ensemble_ts.py ├── experiments ├── ablation_experiment_v2.py ├── calibrate_combo_experiment.py ├── calibrate_first_experiment.py ├── calibration_experiment.py ├── calibration_method_experiment.py ├── em_experiment.py ├── semisup_em_experiment.py └── weighted_semisup_em_experiment.py ├── imax_calib ├── __init__.py ├── calibration.py ├── calibrators │ ├── __init__.py │ ├── binners.py │ └── scalers_np.py ├── clustering.py ├── evaluations │ ├── __init__.py │ └── calibration_metrics.py ├── hb_utils.py ├── io.py └── utils.py ├── metrics.py └── utils.py /README.md: -------------------------------------------------------------------------------- 1 | # Combining Human Predictions with Model Probabilities via Confusion Matrices and Calibration 2 | 3 | This repo contains the code for our NeurIPS 2021 paper, Combining Human Predictions with Model Probabilities via Confusion Matrices and Calibration [[arxiv](https://arxiv.org/abs/2109.14591)]. 4 | 5 | The ImageNet-16H dataset is available on the [OSF](https://osf.io/2ntrf/?view_only=9ec9cacb806d4a1ea4e2f8acaada8f6c). Please also see our work [Bayesian Modeling of Human-AI Complementarity [Steyvers et al., 2022]](https://www.pnas.org/doi/10.1073/pnas.2111547119) describing this dataset in more detail. 6 | 7 | # Project Structure 8 | 9 | - Data for the CIFAR-10h experiments is contained in the `/cifar10h` directory. 10 | - `/experiments` contains various scripts for reproducing the experiments in our paper. 11 | - `calibrators.py` implements various calibration methods. 12 | - `combination_methods.py` implements various combination methods. 13 | - `data_utils.py` contains useful data processing methods. 
class BaseCalibrator:
    """ Abstract calibrator class.

    Subclasses override ``fit`` to learn calibration parameters from labelled
    data and ``calibrate`` to map model probabilities to calibrated ones.
    """
    def __init__(self):
        # Number of classes; stays None until a subclass's fit() records it.
        self.n_classes = None

    def fit(self, logits, y):
        raise NotImplementedError

    def calibrate(self, probs):
        raise NotImplementedError


class IdentityCalibrator(BaseCalibrator):
    """ A class that implements no recalibration.
    """

    def fit(self, probs, y):
        # Nothing to learn.
        return

    def calibrate(self, probs):
        return probs


class TSCalibrator(BaseCalibrator):
    """ Maximum likelihood temperature scaling (Guo et al., 2017)
    """

    def __init__(self, temperature=1.):
        super().__init__()
        self.temperature = temperature  # Initial value; overwritten by fit()

        self.loss_trace = None  # Per-step training loss of the last fit()

    def fit(self, logits, y):
        """ Fits temperature scaling using hard labels.

        Minimizes the cross-entropy of ``logits / T`` against ``y`` with Adam,
        stopping once the gradient w.r.t. T falls below a tolerance or the
        step budget is exhausted (in which case a warning is emitted).

        Args:
            logits: array ; shape (n_samples, n_classes)
            y: integer array of class indices ; shape (n_samples, )
        """
        # Pre-processing
        logits = np.asarray(logits)
        self.n_classes = logits.shape[1]
        _model_logits = torch.from_numpy(logits)
        # CrossEntropyLoss only accepts int64 class indices; labels loaded as
        # int32 (or any other integer width) would otherwise raise.
        _y = torch.from_numpy(np.asarray(y)).long()
        # float() so that an integer initial temperature can still require grad.
        _temperature = torch.tensor(float(self.temperature), requires_grad=True)

        # Optimization parameters
        nll = nn.CrossEntropyLoss()  # Supervised hard-label loss
        num_steps = 7500
        learning_rate = 0.05
        grad_tol = 1e-3  # Gradient tolerance for early stopping
        min_temp, max_temp = 1e-2, 1e4  # Upper / lower bounds on temperature

        optimizer = optim.Adam([_temperature], lr=learning_rate)

        loss_trace = []  # Track loss over iterations
        converged = False
        for _ in range(num_steps):
            optimizer.zero_grad()
            loss = nll(_model_logits / _temperature, _y)
            loss.backward()
            optimizer.step()
            loss_trace.append(loss.item())

            # Project back onto the feasible interval after the Adam update.
            with torch.no_grad():
                _temperature.clamp_(min=min_temp, max=max_temp)

            # The gradient was evaluated at the pre-update temperature.
            if _temperature.grad.abs().item() < grad_tol:
                converged = True
                break

        if not converged:
            warnings.warn('Maximum number of steps reached -- may not have converged (TS)')

        self.loss_trace = loss_trace
        self.temperature = _temperature.item()

    def calibrate(self, probs):
        """ Tempers probabilities with the fitted temperature and renormalizes each row.

        Args:
            probs: array ; shape (n_samples, n_classes)
        """
        calibrated_probs = probs ** (1. / self.temperature)  # Temper
        calibrated_probs /= np.sum(calibrated_probs, axis=1, keepdims=True)  # Normalize
        return calibrated_probs
class TSCalibratorMAP(BaseCalibrator):
    """ MAP Temperature Scaling

    Temperature scaling with a LogNormal(prior_mu, prior_sigma) prior on the
    temperature; the negative log-prior is added to the cross-entropy loss.
    """

    def __init__(self, temperature=1., prior_mu=0.5, prior_sigma=0.5):
        super().__init__()
        self.temperature = temperature  # Initial value; overwritten by fit()
        self.loss_trace = None  # Per-step training loss of the last fit()

        self.prior_mu = torch.tensor(prior_mu)
        self.prior_sigma = torch.tensor(prior_sigma)

    def fit(self, model_logits, y):
        """ Fits temperature scaling using hard labels.

        Args:
            model_logits: array ; shape (n_samples, n_classes)
            y: integer array of class indices ; shape (n_samples, )
        """
        # Pre-processing
        model_logits = np.asarray(model_logits)
        # Recorded for consistency with the other calibrators' fit().
        self.n_classes = model_logits.shape[1]
        _model_logits = torch.from_numpy(model_logits)
        # CrossEntropyLoss only accepts int64 class indices.
        _y = torch.from_numpy(np.asarray(y)).long()
        # float() so that an integer initial temperature can still require grad.
        _temperature = torch.tensor(float(self.temperature), requires_grad=True)

        prior = LogNormal(self.prior_mu, self.prior_sigma)
        # Optimization parameters
        nll = nn.CrossEntropyLoss()  # Supervised hard-label loss
        num_steps = 7500
        learning_rate = 0.05
        grad_tol = 1e-3  # Gradient tolerance for early stopping
        min_temp, max_temp = 1e-2, 1e4  # Upper / lower bounds on temperature

        optimizer = optim.Adam([_temperature], lr=learning_rate)

        loss_trace = []  # Track loss over iterations
        converged = False
        for _ in range(num_steps):
            optimizer.zero_grad()
            # NOTE(review): the cross-entropy is averaged over samples while the
            # log-prior is added unscaled -- confirm this weighting is intended.
            loss = nll(_model_logits / _temperature, _y) - prior.log_prob(_temperature)
            loss.backward()
            optimizer.step()
            loss_trace.append(loss.item())

            # Project back onto the feasible interval; this also keeps the
            # temperature strictly positive for the next log_prob evaluation.
            with torch.no_grad():
                _temperature.clamp_(min=min_temp, max=max_temp)

            # The gradient was evaluated at the pre-update temperature.
            if _temperature.grad.abs().item() < grad_tol:
                converged = True
                break

        if not converged:
            warnings.warn('Maximum number of steps reached -- may not have converged (TS)')

        self.loss_trace = loss_trace
        self.temperature = _temperature.item()

    def calibrate(self, probs):
        """ Tempers probabilities with the fitted temperature and renormalizes each row.

        Args:
            probs: array ; shape (n_samples, n_classes)
        """
        calibrated_probs = probs ** (1. / self.temperature)  # Temper
        calibrated_probs /= np.sum(calibrated_probs, axis=1, keepdims=True)  # Normalize
        return calibrated_probs
def fit(self, logits, labels):
    """ Runs one posterior update (NUTS) on a new batch of observations.

    The prior is first refreshed from the previous posterior, then MCMC is run
    and the posterior summary and timestep counter are updated.

    Args:
        logits: tensor ; shape (batch_size, num_classes)
        labels: tensor ; shape (batch_size, )

    Returns:
        The Pyro MCMC object of this update (also stored on ``self.mcmc``).
    """
    assert len(labels.shape) == 1, f'Got label tensor with shape {labels.shape} -- labels must be dense'
    assert len(logits.shape) == 2, f'Got logit tensor with shape {logits.shape}'
    assert (labels.shape[0] == logits.shape[0]), \
        f'Shape mismatch between logits ({logits.shape[0]}) and labels ({labels.shape[0]})'

    # Work on private copies so the caller's tensors are left untouched.
    batch_logits = logits.detach().clone().requires_grad_()
    batch_labels = labels.detach().clone()

    if self.verbose:
        print('----| Updating HBC model\n--------| Got a batch size of: {}'.format(batch_labels.shape[0]))

    self._update_prior_params()
    if self.verbose:
        print('--------| Updated priors: {}'.format(self.prior_params))
        print('--------| Running inference ')

    # The progress bar is shown only in verbose mode.
    self.mcmc = MCMC(NUTS(bt_model, **self.NUTS_params),
                     disable_progbar=not self.verbose,
                     mp_context='spawn',
                     **self.mcmc_params)
    self.mcmc.run(self.prior_params, batch_logits, batch_labels)

    self._update_posterior_params()
    self.timestep += 1

    return self.mcmc
def calibrate(self, probs):
    """ Calibrates a batch of probabilities using the current posterior samples.

    Each posterior sample beta gives a temperature exp(beta); the input is
    tempered and renormalized per sample, and the results are averaged to form
    a Monte Carlo estimate of the calibrated probabilities.

    Args:
        probs: array or tensor of probabilities ; shape (batch_size, num_classes)

    Returns:
        float32 numpy array ; shape (batch_size, num_classes)

    Raises:
        RuntimeError: if no posterior samples are available yet.
    """
    # Get beta samples
    beta_samples = self.get_current_posterior_samples()
    if beta_samples is None:
        raise RuntimeError('No posterior samples available -- call fit() before calibrate()')
    n_samples = beta_samples.shape[0]

    # Map betas to temperatures
    temperature_samples = torch.exp(beta_samples)

    # Do all arithmetic in torch (float64) instead of mixing numpy and torch operands.
    probs_t = torch.as_tensor(probs, dtype=torch.float64)

    # Accumulate a running sum rather than storing every tempered copy.
    running_sum = torch.zeros_like(probs_t)
    for temperature in temperature_samples:
        tempered = probs_t ** (1. / temperature)
        running_sum += tempered / tempered.sum(dim=1, keepdim=True)

    # Average over the sampled probabilities to get Monte Carlo estimate
    calibrated_probs = running_sum / n_samples
    return calibrated_probs.detach().to(torch.float32).numpy()
def bt_model(prior_params, logits, labels):
    """ Pyro model used for fully Bayesian temperature scaling.

    Samples a single global beta from a Normal prior, scales the logits by
    exp(-beta) and conditions a Categorical likelihood on the observed labels.

    Args:
        prior_params: dict with keys 'mu_beta' and 'sigma_beta'
        logits: tensor ; shape (n_obs, n_classes)
        labels: observed class indices, one per row of logits
    """
    # Prior over global temperature Beta ~ N( beta_mu, beta_sigma^2 )
    beta_prior = dist.Normal(prior_params['mu_beta'], prior_params['sigma_beta'])
    beta = pyro.sample('beta', beta_prior)

    # Tempered probabilities ; shape (n_obs, n_classes)
    inverse_temperature = torch.exp(-1. * beta)
    tempered_probs = softmax(inverse_temperature * logits, dim=1)

    # Observation plate ; vectorized over the batch
    with pyro.plate('obs', size=logits.shape[0]):
        pyro.sample('cat_obs', dist.Categorical(probs=tempered_probs), obs=labels)
def load_CIFAR10H(model_name):
    """ Reads CIFAR-10H human counts, model probabilities and true labels from CSV.

    Two column layouts are handled:
      * 'r_low_acc': columns 0-9 human counts, 10-19 model probabilities,
        last column the 1-indexed true label (shifted here to 0-indexed).
      * any other name: column 0 the true label, 1-10 human counts,
        11 onwards model probabilities.

    Returns:
        (human_counts, model_probs, true_labels), with true_labels cast to int.
    """
    base_dir = os.path.dirname(__file__)
    legacy_layout = model_name == 'r_low_acc'

    if legacy_layout:
        csv_path = os.path.join(base_dir, 'data/cifar10h/human_model_truth_cifar10h.csv')
    else:
        csv_path = os.path.join(base_dir, f'data/cifar10h/{model_name}.csv')
    table = np.genfromtxt(csv_path, delimiter=',')

    if legacy_layout:
        human_counts, model_probs, labels = table[:, :10], table[:, 10:20], table[:, -1]
        labels -= 1  # data has labels 1-10 -- shift in place so they are zero-indexed
    else:
        labels, human_counts, model_probs = table[:, 0], table[:, 1:11], table[:, 11:]

    return human_counts, model_probs, labels.astype(int)
def load_old_noisy_imagenet(noise_level, model_name, n_epochs=None, noise_type='phase', reaction_time=False):
    """ Loads human classifications joined with model outputs for the old noisy-ImageNet data.

    Both CSVs are read from './data/old_noisy_imagenet_data', i.e. relative to the
    current working directory.

    Args:
        noise_level: one of 80, 95, 110, 125
        model_name: one of 'alexnet', 'densenet161', 'googlenet', 'resnet152', 'vgg19'
        n_epochs: selects the model prediction file ; must be None (baseline), 0, 1 or 10,
            any other value raises KeyError
        noise_type: only 'phase' is accepted
        reaction_time: if truthy, also load 'classification_time', keep only rows with a
            time above 650, and return the times as a fourth array

    Returns:
        (y_true, y_h, model_probs), or (y_true, y_h, model_probs, reaction_time) when
        reaction_time is truthy. model_probs has one column per entry of image_labels
        and each row is normalized to sum to one.
    """
    # Argument validation (note: asserts are skipped when Python runs with -O)
    assert noise_type in ['phase'], 'Invalid noise type'
    assert noise_level in [80, 95, 110, 125], 'Invalid noise level'
    assert model_name in ['alexnet', 'densenet161', 'googlenet', 'resnet152', 'vgg19'], 'Invalid model name'

    # Category names ; the position in this list is the numeric class id
    image_labels = ['airplane', 'bear', 'bicycle', 'bird', 'boat', 'bottle', 'car', 'cat', 'chair',
                    'clock', 'dog', 'elephant', 'keyboard', 'knife', 'oven', 'truck']
    image_labels_numeric = np.arange(0, 16)

    data_dir = './data/old_noisy_imagenet_data'
    # data_dir = '../data/old_noisy_imagenet_data'
    human_data_fpath = data_dir + '/human_only_classification_6per_img_export.csv'
    model_data_prefix = '/imagenet_0016_category_phase_noise_all_predictions_'
    # Maps n_epochs to the file name suffix of the model prediction CSV
    epoch_suffix = {None: 'baseline.csv',
                    0: 'epoch00.csv',
                    1: 'epoch01.csv',
                    10: 'epoch10.csv'}
    model_data_fpath = data_dir + model_data_prefix + epoch_suffix[n_epochs]

    # ---- Human data
    human_data = pd.read_csv(human_data_fpath)
    # Replaces category-name cell values with numeric ids (applies to every column of the frame)
    human_data.replace(to_replace=image_labels, value=image_labels_numeric, inplace=True)
    human_data = human_data[human_data['noise_level'] == noise_level]
    columns = ['participant_id', 'image_name', 'image_category', 'participant_classification']
    if reaction_time:
        columns.append('classification_time')
    human_data = human_data[columns]
    if reaction_time:
        # Drop responses at or below the cutoff
        arbitrary_reaction_time_cutoff = 650  # 650 ms
        human_data = human_data[human_data.classification_time > arbitrary_reaction_time_cutoff]

    # ---- Model data
    model_data = pd.read_csv(model_data_fpath)
    # Only cell values are replaced here ; the per-category column names stay as strings
    model_data.replace(to_replace=image_labels, value=image_labels_numeric, inplace=True)
    model_data = model_data[(model_data['noise_type'] == noise_type) &
                            (model_data['noise_level'] == noise_level) &
                            (model_data['model_name'] == model_name)]
    model_data.drop(columns=['noise_type', 'noise_level', 'model_name', 'correct', 'category'], inplace=True)

    # Merge based on input image
    # Left join: every human response is kept, with the model columns attached by image_name
    dataset = pd.merge(human_data, model_data, on='image_name', how='left')

    # Map to numpy
    # NOTE(review): human_ids is computed but never used or returned below
    human_ids = dataset['participant_id'].to_numpy(dtype=int)
    y_h = dataset['participant_classification'].to_numpy(dtype=int)
    y_true = dataset['image_category'].to_numpy(dtype=int)
    # Get model_probs in numeric order, i.e. model_probs[0][0] corresponds to 'airplane'
    model_probs = dataset[image_labels].to_numpy(dtype=float)
    # Normalize each model output (via summing)
    model_probs /= model_probs.sum(axis=1, keepdims=True)

    if reaction_time:
        # The boolean flag is rebound to the array of per-response times
        reaction_time = dataset['classification_time'].to_numpy(dtype=float)
        return y_true, y_h, model_probs, reaction_time
    return y_true, y_h, model_probs
def load_noisy_imagenet_logits(noise_level, model_acc_level):
    """ Loads human classifications joined with VGG-19 outputs from 'vgg19_logits.csv'.

    Both CSVs are read from './data/noisy_imagenet', relative to the current
    working directory.

    Args:
        noise_level: one of 80, 95, 110, 125
        model_acc_level: 'low', 'med' or 'high' (case-insensitive) ; selects which
            'model_name' rows of the model CSV are used

    Returns:
        (y_true, y_h, model_probs) ; model_probs holds the 16 per-category model
        columns of the CSV, one row per human response.
    """
    acc_key = model_acc_level.lower()
    assert acc_key in ['low', 'med', 'high']
    assert noise_level in [80, 95, 110, 125]
    checkpoint_name = {'low': 'vgg19_01',
                       'med': 'vgg19_06',
                       'high': 'vgg19_48'}[acc_key]

    # Read data CSVs
    model_df = pd.read_csv('./data/noisy_imagenet/vgg19_logits.csv')
    human_df = pd.read_csv('./data/noisy_imagenet/human_only_classification_6per_img_export.csv')

    class_names = ['airplane', 'bear', 'bicycle', 'bird', 'boat', 'bottle', 'car', 'cat', 'chair',
                   'clock', 'dog', 'elephant', 'keyboard', 'knife', 'oven', 'truck']
    class_ids = np.arange(0, 16)

    # Human side: string labels -> numeric ids, then keep the requested noise level
    human_df = human_df.replace(to_replace=class_names, value=class_ids)
    human_df = human_df.loc[human_df['noise_level'] == noise_level,
                            ['image_name', 'image_category', 'participant_classification']]

    # Model side: per-category columns are renamed to numeric ids, then filtered
    model_df = model_df.rename(columns=dict(zip(class_names, class_ids)))
    wanted_rows = (model_df['noise_level'] == noise_level) & (model_df['model_name'] == checkpoint_name)
    model_df = model_df.loc[wanted_rows, ['image_name'] + class_ids.tolist()]

    # Left join on the shared column, which is only needed as the join key
    merged = pd.merge(human_df, model_df, how='left').drop(columns=['image_name'])

    y_true = merged['image_category'].to_numpy().astype(int)
    y_h = merged['participant_classification'].to_numpy().astype(int)
    model_probs = merged[class_ids].to_numpy()

    return y_true, y_h, model_probs
value=image_labels_numeric, inplace=True) 248 | data_human = data_human[data_human['noise_level'] == noise_level] 249 | 250 | # Get appropriate subset of model data 251 | data_model.rename(columns=dict(zip(image_labels, image_labels_numeric)), inplace=True) 252 | data_model = data_model[(data_model['noise_level'] == noise_level) & 253 | (data_model['model_name'] == model_name_dict[model_acc_level])] 254 | data_model = data_model[['image_name'] + image_labels_numeric.tolist()] 255 | 256 | # ----- Human data 257 | # Load raw human-generated labels 258 | # create dict which maps annotator_id --> [y_h, y_true, model_probs] 259 | # consisting of the individual's guesses, true labels, and model probabilities 260 | # (for the images labeled by the person) 261 | annotator_ids = data_human['participant_id'].unique() 262 | n_annotators = annotator_ids.size 263 | individual_level_data = dict.fromkeys(np.arange(n_annotators)) 264 | for i, idx in enumerate(annotator_ids): 265 | single_annotator_data = data_human[data_human['participant_id'] == idx] 266 | image_idxs = single_annotator_data['image_name'].to_numpy() 267 | 268 | # Iterate over images human labeled, get model predictions 269 | model_probs_single_annotator = np.empty(shape=(image_idxs.size, 16)) 270 | for j, image_idx in enumerate(image_idxs): 271 | model_probs_single_image = data_model[data_model['image_name'] == image_idx] 272 | model_probs_single_image = model_probs_single_image[image_labels_numeric].to_numpy()[0] 273 | model_probs_single_annotator[j] = model_probs_single_image 274 | 275 | individual_level_data[i] = {'y_h': single_annotator_data['participant_classification'].to_numpy(), 276 | 'model_probs': model_probs_single_annotator, 277 | 'y_m': np.argmax(model_probs_single_annotator, axis=1), 278 | 'y_true': single_annotator_data['image_category'].to_numpy(), 279 | } 280 | 281 | return individual_level_data 282 | -------------------------------------------------------------------------------- /ensemble_ts.py: 
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Temperature scaling (TS), ensemble temperature scaling (ETS) and isotonic
regression calibration on NumPy arrays.

@author: Jize Zhang
See : https://github.com/zhang64-llnl/Mix-n-Match-Calibration/blob/master/util_calibration.py

Throughout this module ``logit`` is a 2-D array (one row per sample, one
column per class) and ``label`` is an array of the same shape that is
multiplied/compared element-wise with the predicted probabilities
(e.g. one-hot labels).
"""

import numpy as np
from scipy import optimize

# Probabilities are clipped to [_EPS, 1 - _EPS] before taking logarithms.
_EPS = 1e-20


def _softmax(logit):
    """Return the row-wise softmax of a 2-D array of logits.

    The row maximum is subtracted before exponentiating. This does not change
    the result mathematically, but prevents ``np.exp`` from overflowing (and
    the softmax from becoming NaN) when logits are large.
    """
    shifted = logit - np.max(logit, axis=1, keepdims=True)
    expd = np.exp(shifted)
    return expd / np.sum(expd, axis=1, keepdims=True)


def _select_objective(loss, objectives):
    """Look up the objective for ``loss``; raise ValueError if it is unknown."""
    try:
        return objectives[loss]
    except KeyError:
        raise ValueError(
            f"loss must be one of {sorted(objectives)}, got {loss!r}") from None


# ----- Objectives used to optimise the temperature and the ensemble weights.
# ``*args`` carries the logits/probabilities and labels of the calibration set.


def mse_t(t, *args):
    """MSE between ``softmax(logit / t)`` and ``label``; ``args = (logit, label)``."""
    logit, label = args
    p = _softmax(logit / t)
    return np.mean((p - label) ** 2)


def ll_t(t, *args):
    """Cross-entropy of ``softmax(logit / t)`` w.r.t. ``label``, averaged over rows.

    ``args = (logit, label)``.
    """
    logit, label = args
    p = np.clip(_softmax(logit / t), _EPS, 1 - _EPS)
    n_samples = p.shape[0]
    return -np.sum(label * np.log(p)) / n_samples


def mse_w(w, *args):
    """MSE of the weighted mixture ``w[0]*p0 + w[1]*p1 + w[2]*p2`` (renormalised).

    ``args = (p0, p1, p2, label)``.
    """
    p0, p1, p2, label = args
    p = w[0] * p0 + w[1] * p1 + w[2] * p2
    p = p / np.sum(p, 1)[:, None]
    return np.mean((p - label) ** 2)


def ll_w(w, *args):
    """Cross-entropy of the weighted mixture ``w[0]*p0 + w[1]*p1 + w[2]*p2``.

    ``args = (p0, p1, p2, label)``. The mixture is clipped before the log (as
    in ``ll_t``) so that exact zeros do not produce ``0 * -inf = NaN``.
    """
    p0, p1, p2, label = args
    p = np.clip(w[0] * p0 + w[1] * p1 + w[2] * p2, _EPS, 1 - _EPS)
    n_samples = p.shape[0]
    return -np.sum(label * np.log(p)) / n_samples


##### Fitting Temperature Scaling
def temperature_scaling(logit, label, loss):
    """Fit a temperature in [0.05, 5.0] on the calibration set.

    :param logit: calibration logits.
    :param label: calibration labels (same shape as ``logit``).
    :param loss: ``'ce'`` (cross-entropy) or ``'mse'``.
    :return: the optimiser's solution vector (a length-1 array).
    :raises ValueError: if ``loss`` is neither ``'ce'`` nor ``'mse'``.
    """
    objective = _select_objective(loss, {'ce': ll_t, 'mse': mse_t})
    bnds = ((0.05, 5.0),)
    result = optimize.minimize(objective, 1.0, args=(logit, label), method='L-BFGS-B',
                               bounds=bnds, tol=1e-12)
    return result.x


##### Fitting Ensemble Temperature Scaling
def ensemble_scaling(logit, label, loss, t, n_class):
    """Fit the three ETS mixture weights for a given temperature ``t``.

    The mixture components are the temperature-scaled softmax (``p0``), the
    unscaled softmax (``p1``) and the uniform distribution (``p2``). Weights
    are constrained to [0, 1] and to sum to one.

    :param loss: ``'ce'`` or ``'mse'``.
    :return: array of the three weights ``(w0, w1, w2)``.
    :raises ValueError: if ``loss`` is neither ``'ce'`` nor ``'mse'``.
    """
    objective = _select_objective(loss, {'ce': ll_w, 'mse': mse_w})
    p1 = _softmax(logit)
    p0 = _softmax(logit / t)
    p2 = np.ones_like(p0) / n_class

    bnds_w = ((0.0, 1.0), (0.0, 1.0), (0.0, 1.0),)

    def my_constraint_fun(x):
        return np.sum(x) - 1

    constraints = {"type": "eq", "fun": my_constraint_fun, }
    result = optimize.minimize(objective, (1.0, 0.0, 0.0), args=(p0, p1, p2, label),
                               method='SLSQP', constraints=constraints,
                               bounds=bnds_w, tol=1e-12)
    return result.x


# ----- Calibration
# Input: uncalibrated logits, temperature (and weight)
# Output: calibrated prediction probabilities


##### Calibration: Temperature Scaling
def ts_calibrate(logit, label, logit_eval, loss):
    """Fit a temperature on ``(logit, label)`` and return calibrated
    probabilities for ``logit_eval``. Prints the fitted temperature."""
    t = temperature_scaling(logit, label, loss)
    print("temperature = " + str(t))
    return _softmax(logit_eval / t)


##### Calibration: Ensemble Temperature Scaling
def ets_calibrate(logit, label, n_class, loss='mse'):
    """Fit ensemble temperature scaling and return ``(t, w)``.

    ``loss`` (``'mse'`` or ``'ce'``) is used for both the temperature fit and
    the weight fit. Calibrated probabilities for evaluation logits ``z`` are
    then ``w[0] * softmax(z / t) + w[1] * softmax(z) + w[2] / n_class``.

    :raises ValueError: if ``loss`` is neither ``'ce'`` nor ``'mse'``.
    """
    t = temperature_scaling(logit, label, loss)
    w = ensemble_scaling(logit, label, loss, t, n_class)
    return t, w


##### Calibration: Isotonic Regression (Multi-class)
def mir_calibrate(logit, label, logit_eval):
    """Fit one isotonic regression on all (probability, label) pairs pooled
    across classes and apply it to the evaluation probabilities."""
    # Imported here so the scaling helpers above do not require scikit-learn.
    from sklearn.isotonic import IsotonicRegression

    p = _softmax(logit)
    p_eval = _softmax(logit_eval)
    ir = IsotonicRegression(out_of_bounds='clip')
    ir.fit(p.flatten(), label.flatten())
    yt_ = ir.predict(p_eval.flatten())

    # The small multiple of the original probabilities is added to the
    # isotonic output, as in the upstream implementation.
    return yt_.reshape(logit_eval.shape) + 1e-9 * p_eval


def irova_calibrate(logit, label, logit_eval):
    """Fit one isotonic regression per class (one-vs-all) and apply each to
    the matching column of the evaluation probabilities."""
    # Imported here so the scaling helpers above do not require scikit-learn.
    from sklearn.isotonic import IsotonicRegression

    p = _softmax(logit)
    p_eval = _softmax(logit_eval)

    for ii in range(p_eval.shape[1]):
        ir = IsotonicRegression(out_of_bounds='clip')
        ir.fit(p[:, ii], label[:, ii])
        p_eval[:, ii] = ir.predict(p_eval[:, ii]) + 1e-9 * p_eval[:, ii]
    return p_eval
import sys
sys.path.insert(0, '../')

from data_utils import *
from utils import *
from combination_methods import *
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
import calibration as cal
import csv
import numpy as np
import os
from imax_calib.evaluations import calibration_metrics as cal_metrics
from scipy.special import softmax

# note: this experiment does not appear in our paper and may contain outdated code.


def get_cw_ECE(probs, y_true):
    """Return the 'cw_mECE' entry computed by imax_calib's
    ``compute_top_1_and_CW_ECEs`` with the 'mECE' approximator."""
    evals = cal_metrics.compute_top_1_and_CW_ECEs(probs, y_true, list_approximators=['mECE'])
    return evals['cw_mECE']


def _marginal_l1_ce(probs, y_true):
    """Marginal L1 calibration error, not debiased (``debias=False``)."""
    return cal.get_calibration_error(probs, y_true, p=1, debias=False, mode='marginal')


def _spec(tag, factory, wants_logits=False, model_ce=True):
    """Describe one combiner of the ablation.

    tag          -- suffix of the CSV columns (acc_comb_<tag>, ce_m_<tag>, ce_combo_<tag>)
    factory      -- zero-argument callable building a fresh, unfitted combiner
    wants_logits -- pass ``model_logits=`` to ``fit`` for this combiner
    model_ce     -- also report the CE of the combiner's calibrated model (ce_m_<tag>)
    """
    return {'tag': tag, 'factory': factory, 'wants_logits': wants_logits, 'model_ce': model_ce}


def _imagenet_prob_specs():
    """Combiners evaluated by the probability-based noisy ImageNet ablations."""
    return [
        _spec('TS', lambda: OracleCombiner(calibration_method='temperature scaling')),
        _spec('ETS', lambda: OracleCombiner(calibration_method='ensemble temperature scaling')),
        _spec('imax_CW', lambda: OracleCombiner(calibration_method='imax binning', mode='CW')),
        _spec('imax_sCW', lambda: OracleCombiner(calibration_method='imax binning', mode='sCW')),
        _spec('nocal', lambda: OracleCombiner(calibration_method=None), model_ce=False),
        # Only estimate model's confusion matrix
        _spec('doubleconf', lambda: DoubleConfusionCombiner(), model_ce=False),
    ]


def _ablation_header(specs):
    """CSV header: accuracies, model calibration errors, combination calibration errors."""
    return (['trial', 'acc_h', 'acc_m']
            + [f"acc_comb_{s['tag']}" for s in specs]
            + ['ce_m']
            + [f"ce_m_{s['tag']}" for s in specs if s['model_ce']]
            + [f"ce_combo_{s['tag']}" for s in specs])


def _start_output_file(output_file, specs, must_not_exist=True):
    """Append the CSV header to ``output_file``, optionally refusing to reuse a file."""
    if must_not_exist:
        assert not os.path.exists(output_file), 'Output filepath already exists'
    with open(output_file, 'a', newline='') as f:
        csv.writer(f).writerow(_ablation_header(specs))


def _run_ablation_trials(output_file, specs, y_h, model_probs, y_true,
                         n_runs, test_size, ce_fn, model_logits=None):
    """Run ``n_runs`` train/test splits and append one CSV row per split.

    For every split (``random_state`` = trial index) each combiner in ``specs``
    is built, fitted on the training part and applied to the test part, in the
    order given. ``ce_fn(probs, y_true)`` is the calibration metric. Rows are
    appended after every trial so partial results survive an interruption.
    """
    for i in tqdm(range(n_runs), leave=False, desc='Runs'):
        # Train/test split
        if model_logits is None:
            model_logits_tr = None
            y_h_tr, y_h_te, model_probs_tr, model_probs_te, y_true_tr, y_true_te = train_test_split(
                y_h, model_probs, y_true, test_size=test_size, random_state=i)
        else:
            y_h_tr, y_h_te, model_logits_tr, _model_logits_te, \
                model_probs_tr, model_probs_te, y_true_tr, y_true_te = train_test_split(
                    y_h, model_logits, model_probs, y_true, test_size=test_size, random_state=i)

        # ----- Fit every combiner and combine on the test split
        fitted = []
        for spec in specs:
            combiner = spec['factory']()
            if spec['wants_logits']:
                combiner.fit(model_probs_tr, y_h_tr, y_true_tr, model_logits=model_logits_tr)
            else:
                combiner.fit(model_probs_tr, y_h_tr, y_true_tr)
            comb_probs = combiner.combine_proba(model_probs_te, y_h_te)
            comb_labels = combiner.combine(model_probs_te, y_h_te)
            fitted.append((spec, combiner, comb_probs, comb_labels))

        # ----- Evaluate accuracies
        acc_h_te = np.mean(y_h_te == y_true_te)
        y_m_te = np.argmax(model_probs_te, axis=1)
        acc_m_te = np.mean(y_m_te == y_true_te)
        acc_comb = [np.mean(labels == y_true_te) for _, _, _, labels in fitted]

        # ----- Evaluate calibration: raw model, calibrated model, combination
        ce_model = [ce_fn(model_probs_te, y_true_te)]
        ce_model += [ce_fn(combiner.calibrate(model_probs_te), y_true_te)
                     for spec, combiner, _, _ in fitted if spec['model_ce']]
        ce_combo = [ce_fn(probs, y_true_te) for _, _, probs, _ in fitted]

        # Write results to CSV
        with open(output_file, 'a', newline='') as f:
            csv.writer(f).writerow([i, acc_h_te, acc_m_te] + acc_comb + ce_model + ce_combo)


def run_experiment_cifar10(out_fpath=None):
    """ Evaluates the oracle and double-confusion combiners (in terms of accuracy and
    calibration) on CIFAR10, writing one '<model>_ablation.csv' per model under out_fpath.
    """
    assert out_fpath is not None, 'Must specify output filepath'
    model_names = ['r_low_acc', 'resnet-110', 'preresnet-110', 'densenet-bc-L190-k40']
    test_size = 0.2
    n_runs = 25
    specs = [
        _spec('TS', lambda: OracleCombiner(calibration_method='temperature scaling')),
        _spec('ETS', lambda: OracleCombiner(calibration_method='ensemble temperature scaling')),
        _spec('nocal', lambda: OracleCombiner(calibration_method=None), model_ce=False),
        # Only estimate model's confusion matrix
        _spec('doubleconf', lambda: DoubleConfusionCombiner(), model_ce=False),
    ]

    for model_name in tqdm(model_names, desc='Models', leave=True):
        output_file = out_fpath + f'{model_name}_ablation.csv'
        _start_output_file(output_file, specs)

        human_counts, model_probs, y_true = load_CIFAR10H(model_name)
        y_h = simulate_single_human(human_counts)
        _run_ablation_trials(output_file, specs, y_h, model_probs, y_true,
                             n_runs=n_runs, test_size=test_size, ce_fn=_marginal_l1_ce)


def run_experiment_noisy_imagenet_vgg19(out_fpath=None):
    """ Evaluates the oracle and double-confusion combiners on noisy ImageNet (VGG-19,
    three accuracy levels).

    This function was originally also named ``run_experiment_noisy_imagenet`` and was
    silently shadowed by the later definition of that name; it is kept under a distinct
    name so that it stays reachable.
    NOTE(review): it calls ``load_noisy_imagenet(noise_level, model_level)`` -- confirm
    this still matches the loader's current signature.
    """
    assert out_fpath is not None, 'Must specify output filepath'
    model_acc_levels = ['low', 'med', 'high']
    noise_levels = [80, 95, 110, 125]
    test_size = 0.2
    n_runs = 25
    specs = _imagenet_prob_specs()

    for model_level in tqdm(model_acc_levels, desc='Models', leave=True):
        for noise_level in tqdm(noise_levels, desc='Noise Levels'):
            output_file = out_fpath + f'vgg19{model_level}_noise{noise_level}_ablation.csv'
            _start_output_file(output_file, specs)

            y_true, y_h, model_probs = load_noisy_imagenet(noise_level, model_level)
            _run_ablation_trials(output_file, specs, y_h, model_probs, y_true,
                                 n_runs=n_runs, test_size=test_size, ce_fn=_marginal_l1_ce)


def run_experiment_noisy_imagenet_logit(out_fpath=None):
    """ Evaluates the oracle and double-confusion combiners on Noisy ImageNet starting from
    model logits; calibration is measured with the class-wise ECE of ``get_cw_ECE``.
    """
    assert out_fpath is not None, 'Must specify output filepath'
    model_acc_levels = ['low', 'med', 'high']
    noise_levels = [80, 95, 110, 125]
    test_size = 0.2
    n_runs = 25
    specs = [
        # Temperature scaling is fitted without the logits, as in the original script.
        _spec('TS', lambda: OracleCombiner(calibration_method='temperature scaling')),
        _spec('ETS', lambda: OracleCombiner(calibration_method='ensemble temperature scaling'),
              wants_logits=True),
        _spec('imax_CW', lambda: OracleCombiner(calibration_method='imax binning', mode='CW', num_bins=20),
              wants_logits=True),
        _spec('imax_sCW', lambda: OracleCombiner(calibration_method='imax binning', mode='sCW', num_bins=20),
              wants_logits=True),
        _spec('dirichlet', lambda: OracleCombiner(calibration_method='dirichlet'),
              wants_logits=True),
        _spec('nocal', lambda: OracleCombiner(calibration_method=None),
              wants_logits=True, model_ce=False),
        # Only estimate model's confusion matrix
        _spec('doubleconf', lambda: DoubleConfusionCombiner(), model_ce=False),
    ]

    for model_level in tqdm(model_acc_levels, desc='Models', leave=True):
        for noise_level in tqdm(noise_levels, desc='Noise Levels'):
            output_file = out_fpath + f'vgg19{model_level}_noise{noise_level}_ablation.csv'
            _start_output_file(output_file, specs)

            y_true, y_h, model_logits = load_noisy_imagenet_logits(noise_level, model_level)
            model_probs = softmax(model_logits, axis=1)
            _run_ablation_trials(output_file, specs, y_h, model_probs, y_true,
                                 n_runs=n_runs, test_size=test_size, ce_fn=get_cw_ECE,
                                 model_logits=model_logits)


def run_experiment_noisy_imagenet(out_fpath=None):
    """ Evaluates the oracle and double-confusion combiners (in terms of accuracy and
    calibration) on noisy ImageNet with DenseNet-161 at several fine-tuning epochs.

    Unlike the other experiments, an existing output file is appended to rather than
    rejected.
    """
    assert out_fpath is not None, 'Must specify output filepath'
    model_name = 'densenet161'
    model_acc_levels = [None, 0, 10]
    noise_levels = [80, 95, 110, 125]
    test_size = 0.2
    n_runs = 10
    specs = _imagenet_prob_specs()

    for epochs in tqdm(model_acc_levels, desc='Models', leave=True):
        for noise_level in tqdm(noise_levels, desc='Noise Levels'):
            output_file = out_fpath + f'{model_name}_epoch{epochs}_noise{noise_level}_ablation.csv'
            _start_output_file(output_file, specs, must_not_exist=False)

            y_true, y_h, model_probs = load_old_noisy_imagenet_data(noise_level, model_name, n_epochs=epochs)
            _run_ablation_trials(output_file, specs, y_h, model_probs, y_true,
                                 n_runs=n_runs, test_size=test_size, ce_fn=_marginal_l1_ce)


if __name__ == '__main__':
    # To run the CIFAR-10H ablation instead:
    #   out_fpath = './output/cifar10h/'
    #   run_experiment_cifar10(out_fpath)

    out_fpath = './output/'
    run_experiment_noisy_imagenet(out_fpath)
17 | 18 | 19 | def _run_experiment(y_h=None, model_probs=None, y_true=None, **kwargs): 20 | seed = kwargs.pop('seed', 0) 21 | n_runs = kwargs.pop('n_runs', 25) 22 | test_size = kwargs.pop('test_size', 0.3) 23 | calibration_methods = kwargs.pop('calibration_methods', ['none']) 24 | calibration_metrics = kwargs.pop('calibration_metrics', {'ECE': get_ECE}) 25 | output_file_acc = kwargs.pop('output_file_acc', './acc.csv') 26 | output_file_calibration = kwargs.pop('output_file_calibration', './cal.csv') 27 | 28 | acc_data = [] 29 | cal_data = [] 30 | for i in tqdm(range(n_runs), leave=False, desc='Runs'): 31 | # Train/test split 32 | y_h_tr, y_h_te, model_probs_tr, model_probs_te, y_true_tr, y_true_te = train_test_split( 33 | y_h, model_probs, y_true, test_size=test_size, random_state=i * seed) 34 | 35 | # Limit to 5k datapoints 36 | y_h_tr = y_h_tr[:5000] 37 | model_probs_tr = model_probs_tr[:5000, :] 38 | y_true_tr = y_true_tr[:5000] 39 | 40 | acc_h = get_acc(y_h_te, y_true_te) 41 | acc_m = get_acc(np.argmax(model_probs_te, axis=1), y_true_te) 42 | 43 | _acc_data = [acc_h, acc_m] 44 | _cal_data = [] 45 | DIAG_ACC = 0.75 46 | MU_BETA = 0.5 47 | SIGMA_BETA = 0.5 48 | combiners = {'MAP_CI': MAPOracleCombiner(diag_acc=DIAG_ACC, mu_beta=MU_BETA, sigma_beta=SIGMA_BETA), 49 | 'uncal_MAP_CI': MAPOracleCombiner(diag_acc=DIAG_ACC, mu_beta=MU_BETA, sigma_beta=SIGMA_BETA)} 50 | for combiner_name, combiner in combiners.items(): 51 | combiner.fit(model_probs_tr, y_h_tr, y_true_tr) 52 | if combiner_name == 'uncal_MAP_CI': 53 | combiner.calibrator.temperature = 1 # pretty hacky way to get uncalibrated temps.. 
but w/e 54 | 55 | y_comb_te = combiner.combine(model_probs_te, y_h_te) 56 | acc_comb = get_acc(y_comb_te, y_true_te) 57 | _acc_data.append(acc_comb) 58 | 59 | model_probs_calibrated_te = combiner.calibrate(model_probs_te) 60 | y_comb_prob_te = combiner.combine_proba(model_probs_te, y_h_te) 61 | 62 | # ----- Calibrate combination 63 | ts_calibrator = TSCalibratorMAP() 64 | comb_probs_tr = combiner.combine_proba(model_probs_tr, y_h_tr) 65 | comb_logits_tr = np.log(np.clip(comb_probs_tr, 1e-50, 1)) 66 | ts_calibrator.fit(comb_logits_tr, y_true_tr) 67 | y_comb_prob_te_calibrated = ts_calibrator.calibrate(y_comb_prob_te) 68 | 69 | for metric, fxn in calibration_metrics.items(): 70 | cal_m = fxn(model_probs_calibrated_te, y_true_te) 71 | cal_comb = fxn(y_comb_prob_te, y_true_te) 72 | cal_comb_calibrated = fxn(y_comb_prob_te_calibrated, y_true_te) 73 | _cal_data.append([combiner_name, metric, cal_m, cal_comb, cal_comb_calibrated]) 74 | 75 | acc_data += [_acc_data] 76 | cal_data += _cal_data 77 | 78 | # Save data to CSV 79 | header_acc = ['human', 'model'] + [f'comb {cal_m}' for cal_m in calibration_methods] 80 | with open(output_file_acc, 'w', newline='') as f: 81 | writer = csv.writer(f) 82 | writer.writerow(header_acc) 83 | writer.writerows(acc_data) 84 | header_cal = ['calibration method', 'metric', 'model', 'comb', 'comb (post cal)'] 85 | with open(output_file_calibration, 'w', newline='') as f: 86 | writer = csv.writer(f) 87 | writer.writerow(header_cal) 88 | writer.writerows(cal_data) 89 | 90 | 91 | def run_experiment_cifar10(out_fpath=None, experiment_args=None, seed=0): 92 | model_names = ['r_low_acc', 'resnet-110', 'preresnet-110', 'densenet-bc-L190-k40'] 93 | for model_name in tqdm(model_names, desc='Models', leave=True): 94 | # Specify output files 95 | output_file_acc = out_fpath + f'{model_name}_accuracy.csv' 96 | output_file_calibration = out_fpath + f'{model_name}_calibration.csv' 97 | assert not os.path.exists(output_file_acc), 'Output filepath already 
def run_experiment_noisy_imagenet(out_fpath=None, experiment_args=None, seed=0):
    """Run ``_run_experiment`` on noisy ImageNet for each model / noise level.

    Args:
        out_fpath: String prefix for the output files; the names
            ``'<model>_n<noise>_l<level>_accuracy.csv'`` and
            ``'..._calibration.csv'`` are appended to it verbatim.
        experiment_args: Keyword arguments forwarded to ``_run_experiment``.
            The dict is copied, not modified.  ``None`` means no extra arguments.
        seed: Not used by this function; kept for signature compatibility.

    Raises:
        TypeError: If ``out_fpath`` is None.
        AssertionError: If an output file already exists.
    """
    if out_fpath is None:
        raise TypeError('out_fpath must be a string path prefix, not None')
    # Copy so the caller's dict is left untouched (and None is tolerated).
    run_args = dict(experiment_args or {})

    # The CSVs are only opened after all runs have finished, so make sure the
    # target directory exists before doing any work.
    out_dir = os.path.dirname(out_fpath)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    model_acc_levels = ['high']  # ['low', 'med', 'high']
    noise_levels = [80, 95, 110, 125]
    model_names = ['vgg19', 'googlenet']

    for model_name in model_names:
        for model_level in tqdm(model_acc_levels, desc='Models', leave=True):
            for noise_level in tqdm(noise_levels, desc='Noise Levels'):
                # Specify output files
                stem = out_fpath + f'{model_name}_n{noise_level}_l{model_level}'
                output_file_acc = stem + '_accuracy.csv'
                output_file_calibration = stem + '_calibration.csv'
                assert not os.path.exists(output_file_acc), 'Output filepath already exists'
                assert not os.path.exists(output_file_calibration), 'Output filepath already exists'
                run_args['output_file_acc'] = output_file_acc
                run_args['output_file_calibration'] = output_file_calibration

                # Load data
                y_true, y_h, model_probs = load_noisy_imagenet(model_name, noise_level, model_level)

                _run_experiment(y_h=y_h, model_probs=model_probs, y_true=y_true, **run_args)
if __name__ == '__main__':
    # Script entry point: seed the RNGs, pick the metrics, run the experiment.
    seed = 9658
    torch.manual_seed(seed)
    np.random.seed(seed)

    calibration_methods = ['none', 'confusion', 'temperature scaling']

    # Metric name -> callable(probs, y).  Insertion order is the row order
    # within each block of the calibration CSV.
    calibration_metrics = {
        'ECE (W)': lambda probs, y: get_ECE(probs, y, mode='width'),
        'ECE (M)': lambda probs, y: get_ECE(probs, y, mode='mass'),
        'cwECE (WT)': lambda probs, y: get_cw_ECE(probs, y, mode='width'),
        'cwECE (MT)': lambda probs, y: get_cw_ECE(probs, y, mode='mass'),
        'cwECE (WNT)': lambda probs, y: get_cw_ECE(probs, y, mode='width', threshold_mode=None),
        'cwECE (MNT)': lambda probs, y: get_cw_ECE(probs, y, mode='mass', threshold_mode=None),
        'NLL': get_NLL,
    }

    args = {
        'n_runs': 25,
        'test_size': 0.3,
        'calibration_methods': calibration_methods,
        'calibration_metrics': calibration_metrics,
        'seed': seed,
    }

    # Uncomment to run the CIFAR-10H variant instead.
    # out_fpath = './output/cifar10h/final/calibrate_comb_MAP/'
    # run_experiment_cifar10(out_fpath=out_fpath, experiment_args=args, seed=seed)

    out_fpath = './output/noisy_imagenet/final/calibrate_comb_MAP/'
    run_experiment_noisy_imagenet(out_fpath=out_fpath, experiment_args=args, seed=seed)
def _evaluate_combiner(combiner, model_probs_te, y_h_te, y_true_te):
    """Return (accuracy, ECE of calibrated model, ECE of combination) on the test split."""
    y_comb_prob_te = combiner.combine_proba(model_probs_te, y_h_te)
    y_comb_te = combiner.combine(model_probs_te, y_h_te)
    model_probs_calibrated_te = combiner.calibrate(model_probs_te)
    return (np.mean(y_comb_te == y_true_te),
            cal.get_ece(model_probs_calibrated_te, y_true_te),
            cal.get_ece(y_comb_prob_te, y_true_te))


def run_experiment(out_fpath=None):
    """ Evaluates the oracle, EM and calibrate-first combiners on noisy ImageNet.

    Done in a semi-supervised fashion: for each train/test split the combiners
    are fit with a varying number ``n_l`` of labeled training points, and one
    CSV row of test accuracies / ECEs is appended per configuration.  Two extra
    rows per split cover the edge cases "all training data unlabeled"
    (unsupervised EM only) and "all training data labeled" (oracle only);
    columns of methods that were not run are left empty.

    Args:
        out_fpath: Path of the CSV file to append results to.

    Raises:
        AssertionError: If ``out_fpath`` is None.
    """
    assert out_fpath is not None, 'Must specify output filepath'

    # Per-method results are written in this order inside each column group.
    methods = ('oracle', 'unsup', 'semisup', 'calibfirst')
    header = (['model_name', 'noise_level', 'epochs', 'trial', 'n_l', 'n_u',
               'acc_h_te', 'acc_m_te']
              + [f'acc_comb_{m}_te' for m in methods]
              + ['ece_m_te']
              + [f'ece_m_calibrated_{m}_te' for m in methods]
              + [f'ece_combo_{m}_te' for m in methods])

    def append_row(meta, baseline, results):
        """Append one row; methods absent from `results` get empty cells."""
        acc_h, acc_m, ece_m = baseline
        per_method = [results.get(m, (None, None, None)) for m in methods]
        row = (list(meta) + [acc_h, acc_m]
               + [r[0] for r in per_method]
               + [ece_m]
               + [r[1] for r in per_method]
               + [r[2] for r in per_method])
        with open(out_fpath, 'a', newline='') as f:
            csv.writer(f).writerow(row)

    # Create the CSV output file if needed; write the header only into an
    # empty file (append mode positions the stream at the end of the file).
    with open(out_fpath, 'a', newline='') as f:
        if f.tell() == 0:
            csv.writer(f).writerow(header)

    # Experiment parameters
    # model_names = ['alexnet', 'densenet161', 'googlenet', 'resnet152', 'vgg19']
    model_names = ['densenet161']
    noise_levels = [80, 95, 110, 125]
    # epochs = [None, 0, 1, 10]
    epochs = [None, 0, 10]
    noise_type = 'phase'
    n_runs = 10
    n_l_sizes = [10, 50, 100, 250, 500, 1000, 2500, 4500]  # Amount of labeled data to use

    for model_name in tqdm(model_names, desc='Models', position=0, leave=True):
        for noise_level in tqdm(noise_levels, position=1, leave=False, desc='Noise Levels'):
            for epoch in tqdm(epochs, position=2, leave=False, desc='Epochs'):
                _human_ids, y_h, y_true, model_probs = load_old_noisy_imagenet_data(
                    noise_level, model_name, n_epochs=epoch, noise_type=noise_type)
                for i in tqdm(range(n_runs), position=3, leave=False, desc='Runs'):
                    # Train/test split 70/30
                    (y_h_tr, y_h_te,
                     model_probs_tr, model_probs_te,
                     y_true_tr, y_true_te) = train_test_split(
                        y_h, model_probs, y_true, test_size=0.3, random_state=i)
                    test = (model_probs_te, y_h_te, y_true_te)

                    n_tr = y_h_tr.size  # Number of training points

                    # Quantities that don't change with n_l
                    baseline = (np.mean(y_h_te == y_true_te),
                                np.mean(np.argmax(model_probs_te, axis=1) == y_true_te),
                                cal.get_ece(model_probs_te, y_true_te))

                    # ----- Unsupervised EM, all unlabeled data (n_l = 0)
                    unsupervised_EM_combiner = UnsupervisedEMCombiner()
                    unsupervised_EM_combiner.fit(model_probs_tr, y_h_tr)
                    append_row((model_name, noise_level, epoch, i, 0, n_tr), baseline,
                               {'unsup': _evaluate_combiner(unsupervised_EM_combiner, *test)})

                    # ----- Fully supervised (oracle) combo (n_l = all)
                    oracle_combiner = OracleCombiner()
                    oracle_combiner.fit(model_probs_tr, y_h_tr, y_true_tr)
                    append_row((model_name, noise_level, epoch, i, n_tr, 0), baseline,
                               {'oracle': _evaluate_combiner(oracle_combiner, *test)})

                    # TODO : Do these edge cases really need to be separated out??

                    for n_l in tqdm(n_l_sizes, leave=False, desc='Num. Labels'):
                        # Split into labeled / unlabeled datasets
                        n_u = n_tr - n_l
                        y_h_tr_u, y_h_tr_l = y_h_tr[n_l:], y_h_tr[:n_l]
                        model_probs_tr_u, model_probs_tr_l = model_probs_tr[n_l:], model_probs_tr[:n_l]
                        y_true_tr_l = y_true_tr[:n_l]

                        results = {}

                        # ----- Labeled data only
                        oracle_combiner = OracleCombiner()
                        oracle_combiner.fit(model_probs_tr_l, y_h_tr_l, y_true_tr_l)
                        results['oracle'] = _evaluate_combiner(oracle_combiner, *test)

                        # ----- Semi-Supervised EM
                        semisup_combiner = SemiSupervisedEMCombiner()
                        semisup_combiner.fit(model_probs_tr_u, y_h_tr_u,
                                             model_probs_tr_l, y_h_tr_l, y_true_tr_l)
                        results['semisup'] = _evaluate_combiner(semisup_combiner, *test)

                        # ----- Unsupervised EM
                        unsup_combiner = UnsupervisedEMCombiner()
                        unsup_combiner.fit(model_probs_tr_u, y_h_tr_u)
                        results['unsup'] = _evaluate_combiner(unsup_combiner, *test)

                        # ----- Calibrate first, then fit confusion using semi-supervised EM
                        calibrate_first_combiner = CalibrateFirstCombiner()
                        calibrate_first_combiner.fit(model_probs_tr_u, y_h_tr_u,
                                                     model_probs_tr_l, y_h_tr_l, y_true_tr_l)
                        results['calibfirst'] = _evaluate_combiner(calibrate_first_combiner, *test)

                        # Write results to CSV
                        append_row((model_name, noise_level, epoch, i, n_l, n_u),
                                   baseline, results)
if __name__ == '__main__':
    # run_experiment() opens this path as a CSV file, so it has to name a
    # file rather than the output directory itself.
    out_fpath = './output/calibrate_first_experiment.csv'
    if os.path.exists(out_fpath):
        print('Output filepath exists, dont overwrite it!')
        # sys.exit() instead of quit(): quit is only provided by the site
        # module for interactive use.
        sys.exit()
    os.makedirs(os.path.dirname(out_fpath), exist_ok=True)
    run_experiment(out_fpath)
def _run_experiment(y_h=None, model_probs=None, y_true=None, **kwargs):
    """Compare calibration methods over repeated train/test splits and save two CSVs.

    For each run and each entry of ``calibration_methods`` a combiner is fit on
    the training split (``DoubleConfusionCombiner`` for ``'confusion'``,
    otherwise ``OracleCombiner`` with that calibration method) and scored on
    the test split.

    Args:
        y_h: Human labels, one per datapoint.
        model_probs: Model probabilities; classes are along axis 1.
        y_true: Ground-truth labels, one per datapoint.
        **kwargs: Optional settings (defaults in parentheses): ``seed`` (0),
            ``n_runs`` (25), ``test_size`` (0.3), ``calibration_methods``
            (``['none']``), ``calibration_metrics`` (``{'ECE': get_ECE}``),
            ``output_file_acc`` ('./acc.csv'), ``output_file_calibration``
            ('./cal.csv').
    """
    seed = kwargs.pop('seed', 0)
    n_runs = kwargs.pop('n_runs', 25)
    test_size = kwargs.pop('test_size', 0.3)
    calibration_methods = kwargs.pop('calibration_methods', ['none'])
    calibration_metrics = kwargs.pop('calibration_metrics', {'ECE': get_ECE})
    output_file_acc = kwargs.pop('output_file_acc', './acc.csv')
    output_file_calibration = kwargs.pop('output_file_calibration', './cal.csv')

    acc_rows, cal_rows = [], []
    for run_idx in tqdm(range(n_runs), leave=False, desc='Runs'):
        # Train/test split
        split = train_test_split(y_h, model_probs, y_true,
                                 test_size=test_size, random_state=run_idx * seed)
        y_h_tr, y_h_te, model_probs_tr, model_probs_te, y_true_tr, y_true_te = split

        # Accuracy of the human and of the model on their own
        y_m_te = np.argmax(model_probs_te, axis=1)
        run_acc = [get_acc(y_h_te, y_true_te), get_acc(y_m_te, y_true_te)]

        for method in calibration_methods:
            if method == 'confusion':
                combiner = DoubleConfusionCombiner()
            else:
                combiner = OracleCombiner(calibration_method=method)
            combiner.fit(model_probs_tr, y_h_tr, y_true_tr)

            # Accuracy of the combination
            run_acc.append(get_acc(combiner.combine(model_probs_te, y_h_te), y_true_te))

            # Calibration of the (re)calibrated model and of the combination
            model_probs_calibrated_te = combiner.calibrate(model_probs_te)
            y_comb_prob_te = combiner.combine_proba(model_probs_te, y_h_te)
            for metric_name, metric_fn in calibration_metrics.items():
                score_model = metric_fn(model_probs_calibrated_te, y_true_te)
                score_comb = metric_fn(y_comb_prob_te, y_true_te)
                cal_rows.append([method, metric_name, score_model, score_comb])

        acc_rows.append(run_acc)

    # Save data to CSV
    with open(output_file_acc, 'w', newline='') as f:
        acc_writer = csv.writer(f)
        acc_writer.writerow(['human', 'model'] + [f'comb {cal_m}' for cal_m in calibration_methods])
        acc_writer.writerows(acc_rows)
    with open(output_file_calibration, 'w', newline='') as f:
        cal_writer = csv.writer(f)
        cal_writer.writerow(['calibration method', 'metric', 'model', 'comb'])
        cal_writer.writerows(cal_rows)
def run_experiment_cifar10(out_fpath=None, experiment_args=None, seed=0):
    """Run ``_run_experiment`` on CIFAR-10H once per model.

    Args:
        out_fpath: String prefix for the output files; the per-model file names
            ``'<model>_accuracy.csv'`` and ``'<model>_calibration.csv'`` are
            appended to it verbatim (so a directory prefix needs its trailing
            separator).
        experiment_args: Keyword arguments forwarded to ``_run_experiment``.
            The dict is copied, not modified.  ``None`` means no extra arguments.
        seed: Seed passed to ``simulate_single_human``.

    Raises:
        TypeError: If ``out_fpath`` is None.
        AssertionError: If an output file already exists.
    """
    if out_fpath is None:
        raise TypeError('out_fpath must be a string path prefix, not None')
    # Copy so the caller's dict is left untouched (and None is tolerated).
    run_args = dict(experiment_args or {})

    # The CSVs are only opened after all runs have finished, so make sure the
    # target directory exists before doing any work.
    out_dir = os.path.dirname(out_fpath)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    model_names = ['r_low_acc', 'resnet-110', 'preresnet-110', 'densenet-bc-L190-k40']
    for model_name in tqdm(model_names, desc='Models', leave=True):
        # Specify output files
        output_file_acc = out_fpath + f'{model_name}_accuracy.csv'
        output_file_calibration = out_fpath + f'{model_name}_calibration.csv'
        assert not os.path.exists(output_file_acc), 'Output filepath already exists'
        assert not os.path.exists(output_file_calibration), 'Output filepath already exists'
        run_args['output_file_acc'] = output_file_acc
        run_args['output_file_calibration'] = output_file_calibration

        # Load data and draw one simulated human label per datapoint
        human_counts, model_probs, y_true = load_CIFAR10H(model_name)
        y_h = simulate_single_human(human_counts, seed=seed)

        _run_experiment(y_h=y_h, model_probs=model_probs, y_true=y_true, **run_args)
def run_experiment_noisy_imagenet(out_fpath=None, experiment_args=None, seed=0):
    """Run ``_run_experiment`` on noisy ImageNet for each model / noise level.

    Args:
        out_fpath: String prefix for the output files; the names
            ``'<model>_n<noise>_l<level>_accuracy.csv'`` and
            ``'..._calibration.csv'`` are appended to it verbatim.
        experiment_args: Keyword arguments forwarded to ``_run_experiment``.
            The dict is copied, not modified.  ``None`` means no extra arguments.
        seed: Not used by this function; kept for signature compatibility.

    Raises:
        TypeError: If ``out_fpath`` is None.
        AssertionError: If an output file already exists.
    """
    if out_fpath is None:
        raise TypeError('out_fpath must be a string path prefix, not None')
    # Copy so the caller's dict is left untouched (and None is tolerated).
    run_args = dict(experiment_args or {})

    # The CSVs are only opened after all runs have finished, so make sure the
    # target directory exists before doing any work.
    out_dir = os.path.dirname(out_fpath)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    model_acc_levels = ['high']  # ['low', 'med', 'high']
    noise_levels = [80, 95, 110, 125]
    model_names = ['vgg19', 'googlenet']

    for model_name in model_names:
        for model_level in tqdm(model_acc_levels, desc='Models', leave=True):
            for noise_level in tqdm(noise_levels, desc='Noise Levels'):
                # Specify output files
                stem = out_fpath + f'{model_name}_n{noise_level}_l{model_level}'
                output_file_acc = stem + '_accuracy.csv'
                output_file_calibration = stem + '_calibration.csv'
                assert not os.path.exists(output_file_acc), 'Output filepath already exists'
                assert not os.path.exists(output_file_calibration), 'Output filepath already exists'
                run_args['output_file_acc'] = output_file_acc
                run_args['output_file_calibration'] = output_file_calibration

                # Load data
                y_true, y_h, model_probs = load_noisy_imagenet(model_name, noise_level, model_level)

                _run_experiment(y_h=y_h, model_probs=model_probs, y_true=y_true, **run_args)
if __name__ == '__main__':
    # Script entry point: seed the RNGs, pick methods and metrics, then run
    # the experiment on both datasets.
    seed = 9658
    torch.manual_seed(seed)
    np.random.seed(seed)

    calibration_methods = ['none', 'confusion', 'temperature scaling',
                           'ensemble temperature scaling', 'imax binning']

    # Metric name -> callable(probs, y).  Insertion order is the row order
    # within each block of the calibration CSV.
    calibration_metrics = {
        'ECE (W)': lambda probs, y: get_ECE(probs, y, mode='width'),
        'ECE (M)': lambda probs, y: get_ECE(probs, y, mode='mass'),
        'cwECE (WT)': lambda probs, y: get_cw_ECE(probs, y, mode='width'),
        'cwECE (MT)': lambda probs, y: get_cw_ECE(probs, y, mode='mass'),
        'cwECE (WNT)': lambda probs, y: get_cw_ECE(probs, y, mode='width', threshold_mode=None),
        'cwECE (MNT)': lambda probs, y: get_cw_ECE(probs, y, mode='mass', threshold_mode=None),
        'NLL': get_NLL,
    }

    args = {
        'n_runs': 25,
        'test_size': 0.3,
        'calibration_methods': calibration_methods,
        'calibration_metrics': calibration_metrics,
        'seed': seed,
    }

    out_fpath = './output/cifar10h/final/fully_sup_CI/'
    run_experiment_cifar10(out_fpath=out_fpath, experiment_args=args, seed=seed)

    out_fpath = './output/noisy_imagenet/final/fully_sup_CI/'
    run_experiment_noisy_imagenet(out_fpath=out_fpath, experiment_args=args, seed=seed)
def run_experiment(out_fpath=None):
    """ Compares temperature scaling and Dirichlet calibration inside the oracle combiner on noisy ImageNet.

    For every model / noise level / epoch setting and every train/test split,
    two ``OracleCombiner`` instances are fit (calibration methods
    ``'temperature scaling'`` and ``'dirichlet'``), and one CSV row with test
    accuracies, marginal calibration errors and ECEs is appended.

    Args:
        out_fpath: Path of the CSV file to append results to.

    Raises:
        AssertionError: If ``out_fpath`` is None.
    """
    assert out_fpath is not None, 'Must specify output filepath'
    # Create the CSV output file if needed; write the header only into an
    # empty file (append mode positions the stream at the end of the file).
    with open(out_fpath, 'a', newline='') as f:
        if f.tell() == 0:
            csv.writer(f).writerow(
                ['model_name', 'noise_level', 'epochs', 'trial',
                 'acc_h', 'acc_m', 'acc_combo_TS', 'acc_combo_dir',
                 'ce_m', 'ce_m_TS', 'ce_m_dir', 'ce_combo_TS', 'ce_combo_dir',
                 'ece_m_te', 'ece_m_TS', 'ece_m_dir', 'ece_combo_TS', 'ece_combo_dir'])

    # model_names = ['alexnet', 'densenet161', 'googlenet', 'resnet152', 'vgg19']
    model_names = ['densenet161']
    noise_levels = [80, 95, 110, 125]
    # epochs = [None, 0, 1, 10]
    epochs = [None, 0, 10]
    noise_type = 'phase'
    n_runs = 5
    for model_name in tqdm(model_names, desc='Models', position=0, leave=True):
        for noise_level in tqdm(noise_levels, position=1, leave=False, desc='Noise Levels'):
            for epoch in tqdm(epochs, position=2, leave=False, desc='Epochs'):
                # The loader arguments do not depend on the run index, so the
                # data is loaded once per setting rather than once per run.
                _human_ids, y_h, y_true, model_probs = load_old_noisy_imagenet_data(
                    noise_level, model_name, n_epochs=epoch, noise_type=noise_type)
                for i in tqdm(range(n_runs), position=3, leave=False, desc='Runs'):
                    # Train/test split 70/30
                    (y_h_tr, y_h_te,
                     model_probs_tr, model_probs_te,
                     y_true_tr, y_true_te) = train_test_split(
                        y_h, model_probs, y_true, test_size=0.3, random_state=i)

                    def marginal_ce(probs):
                        # \ell_2, debiased, marginal calibration error on the test labels
                        return cal.get_calibration_error(probs, y_true_te,
                                                         p=2, debias=True, mode='marginal')

                    # ----- Calibrator: temperature scaling
                    oracle_combiner_TS = OracleCombiner(calibration_method='temperature scaling')
                    oracle_combiner_TS.fit(model_probs_tr, y_h_tr, y_true_tr)
                    y_comb_prob_TS = oracle_combiner_TS.combine_proba(model_probs_te, y_h_te)
                    y_comb_TS = oracle_combiner_TS.combine(model_probs_te, y_h_te)

                    # ----- Calibrator: Dirichlet calibration
                    oracle_combiner_dirichlet = OracleCombiner(calibration_method='dirichlet')
                    oracle_combiner_dirichlet.fit(model_probs_tr, y_h_tr, y_true_tr)
                    y_comb_prob_dirichlet = oracle_combiner_dirichlet.combine_proba(model_probs_te, y_h_te)
                    y_comb_dirichlet = oracle_combiner_dirichlet.combine(model_probs_te, y_h_te)

                    # Calibrated model probabilities, computed once and reused
                    # by both calibration measures below.
                    model_probs_TS_te = oracle_combiner_TS.calibrate(model_probs_te)
                    model_probs_dirichlet_te = oracle_combiner_dirichlet.calibrate(model_probs_te)

                    # ----- Evaluate accuracies
                    acc_comb_oracle_TS = np.mean(y_comb_TS == y_true_te)
                    acc_comb_oracle_dirichlet = np.mean(y_comb_dirichlet == y_true_te)
                    acc_h_te = np.mean(y_h_te == y_true_te)
                    y_m_te = np.argmax(model_probs_te, axis=1)
                    acc_m_te = np.mean(y_m_te == y_true_te)

                    # ----- Evaluate calibration
                    ce_m_te = marginal_ce(model_probs_te)
                    ce_m_TS = marginal_ce(model_probs_TS_te)
                    ce_m_dirichlet = marginal_ce(model_probs_dirichlet_te)
                    ce_combo_TS = marginal_ce(y_comb_prob_TS)
                    ce_combo_dirichlet = marginal_ce(y_comb_prob_dirichlet)

                    # NB: This is the usual ECE
                    ece_m_te = cal.get_ece(model_probs_te, y_true_te)
                    ece_m_TS = cal.get_ece(model_probs_TS_te, y_true_te)
                    ece_m_dirichlet = cal.get_ece(model_probs_dirichlet_te, y_true_te)
                    ece_combo_TS = cal.get_ece(y_comb_prob_TS, y_true_te)
                    ece_combo_dirichlet = cal.get_ece(y_comb_prob_dirichlet, y_true_te)

                    # Write results to CSV
                    with open(out_fpath, 'a', newline='') as f:
                        csv.writer(f).writerow(
                            [model_name, noise_level, epoch, i,
                             acc_h_te, acc_m_te, acc_comb_oracle_TS, acc_comb_oracle_dirichlet,
                             ce_m_te, ce_m_TS, ce_m_dirichlet, ce_combo_TS, ce_combo_dirichlet,
                             ece_m_te, ece_m_TS, ece_m_dirichlet, ece_combo_TS, ece_combo_dirichlet])
if __name__ == '__main__':
    # run_experiment() opens this path as a CSV file, so it has to name a
    # file rather than the output directory itself.
    out_fpath = './output/calibration_method_experiment.csv'
    os.makedirs(os.path.dirname(out_fpath), exist_ok=True)
    run_experiment(out_fpath)
def run_experiment(out_fpath=None):
    """ Evaluates the oracle and EM algorithms (in terms of accuracy and calibration) on Noisy ImageNet.

    For every model / noise level / epoch setting and every train/test split,
    an oracle combination and an EM-fitted combination are scored on the test
    split, and one CSV row is appended with accuracies, marginal calibration
    errors, ECEs and the distance between the EM-estimated human confusion
    matrix and the one measured on the test split.

    Args:
        out_fpath: Path of the CSV file to append results to.

    Raises:
        AssertionError: If ``out_fpath`` is None.
    """
    assert out_fpath is not None, 'Must specify output filepath'
    # Create the CSV output file if needed; write the header only into an
    # empty file (append mode positions the stream at the end of the file).
    with open(out_fpath, 'a', newline='') as f:
        if f.tell() == 0:
            csv.writer(f).writerow(
                ['model_name', 'noise_level', 'epochs', 'trial',
                 'acc_h_te', 'acc_m_te', 'acc_comb_oracle_te', 'acc_comb_te',
                 'ce_m_te', 'ce_m_calibrated_te', 'ce_combo_te', 'ce_oracle_combo_te', 'ce_m_oracle_te',
                 'ece_m_te',
                 'ece_m_calibrated_te', 'ece_combo_te', 'ece_oracle_combo_te', 'ece_m_oracle_te',
                 'frobenius_distance_conf_te'])

    # model_names = ['alexnet', 'densenet161', 'googlenet', 'resnet152', 'vgg19']
    model_names = ['densenet161']
    noise_levels = [80, 95, 110, 125]
    # epochs = [None, 0, 1, 10]
    epochs = [None, 0, 10]
    noise_type = 'phase'
    n_runs = 5
    for model_name in tqdm(model_names, desc='Models', position=0, leave=True):
        for noise_level in tqdm(noise_levels, position=1, leave=False, desc='Noise Levels'):
            for epoch in tqdm(epochs, position=2, leave=False, desc='Epochs'):
                # The loader arguments do not depend on the run index, so the
                # data is loaded once per setting rather than once per run.
                _human_ids, y_h, y_true, model_probs = load_old_noisy_imagenet(
                    noise_level, model_name, n_epochs=epoch, noise_type=noise_type)
                for i in tqdm(range(n_runs), position=3, leave=False, desc='Runs'):
                    # Train/test split 70/30
                    (y_h_tr, y_h_te,
                     model_probs_tr, model_probs_te,
                     y_true_tr, y_true_te) = train_test_split(
                        y_h, model_probs, y_true, test_size=0.3, random_state=i)

                    def marginal_ce(probs):
                        # \ell_2, debiased, marginal calibration error on the test labels
                        return cal.get_calibration_error(probs, y_true_te,
                                                         p=2, debias=True, mode='marginal')

                    # ----- 'Oracle' Experiment
                    y_comb_oracle_soft_te = oracle_combo(y_h_tr, model_probs_tr, y_true_tr,
                                                         model_probs_te, y_h_te)
                    y_comb_oracle_te = np.argmax(y_comb_oracle_soft_te, axis=1)

                    temp_oracle = temperature_scaling(torch.from_numpy(np.log(model_probs_tr)),
                                                      torch.from_numpy(y_true_tr))['temperature'].item()
                    calibrated_probs_te_oracle = calibrate_probs_TS(model_probs_te, temp_oracle)

                    # ----- EM Experiment
                    # Fit EM parameters on train set
                    calibrator, conf_h = fit_EM(model_probs_tr, y_h_tr)  # TODO: Different calibration methods
                    # Calibrate predictions on test set
                    model_probs_calibrated_te = calibrator.calibrate(model_probs_te)
                    # Combine calibrated model predictions with human labels on test set
                    y_comb_te_soft = combine(model_probs_calibrated_te, y_h_te, conf_h)
                    y_comb_te = np.argmax(y_comb_te_soft, axis=1)

                    # ----- Evaluate accuracies
                    acc_comb_te = np.mean(y_comb_te == y_true_te)
                    acc_comb_oracle_te = np.mean(y_comb_oracle_te == y_true_te)
                    acc_h_te = np.mean(y_h_te == y_true_te)
                    y_m_te = np.argmax(model_probs_te, axis=1)
                    acc_m_te = np.mean(y_m_te == y_true_te)

                    # ----- Evaluate calibration
                    ce_m_te = marginal_ce(model_probs_te)
                    ce_m_calibrated_te = marginal_ce(model_probs_calibrated_te)
                    ce_combo_te = marginal_ce(y_comb_te_soft)
                    ce_oracle_combo_te = marginal_ce(y_comb_oracle_soft_te)
                    ce_m_oracle_te = marginal_ce(calibrated_probs_te_oracle)

                    # NB: This is the usual ECE
                    ece_m_te = cal.get_ece(model_probs_te, y_true_te)
                    ece_m_calibrated_te = cal.get_ece(model_probs_calibrated_te, y_true_te)
                    ece_combo_te = cal.get_ece(y_comb_te_soft, y_true_te)
                    ece_oracle_combo_te = cal.get_ece(y_comb_oracle_soft_te, y_true_te)
                    ece_m_oracle_te = cal.get_ece(calibrated_probs_te_oracle, y_true_te)

                    # Evaluate confusion matrix
                    # Entry [i,j] is P(h = i | Y = j): normalising over the true
                    # labels gives rows P(h | Y = j), and the transpose puts the
                    # human label on the first axis.  (normalize='pred' would
                    # instead give P(Y = j | h = i).)  `labels` fixes the shape
                    # even if a class is absent from this test split.
                    # NOTE(review): assumes fit_EM returns conf_h in this same
                    # layout with one row/column per model class - confirm.
                    class_labels = np.arange(model_probs_te.shape[1])
                    conf_h_te = confusion_matrix(y_true_te, y_h_te,
                                                 labels=class_labels, normalize='true').T
                    # Computes the Frobenius-norm (RMSE) distance between:
                    #   (i) human confusion matrix estimated via EM
                    #   (ii) human confusion matrix directly estimated via ground-truth on test set
                    frobenius_distance_conf_te = np.linalg.norm(conf_h_te - conf_h)

                    # Write results to CSV
                    with open(out_fpath, 'a', newline='') as f:
                        csv.writer(f).writerow(
                            [model_name, noise_level, epoch, i,
                             acc_h_te, acc_m_te, acc_comb_oracle_te, acc_comb_te,
                             ce_m_te, ce_m_calibrated_te, ce_combo_te, ce_oracle_combo_te, ce_m_oracle_te,
                             ece_m_te, ece_m_calibrated_te, ece_combo_te, ece_oracle_combo_te,
                             ece_m_oracle_te,
                             frobenius_distance_conf_te])
if __name__ == '__main__':
    import os  # local import: only this entry point needs it

    # run_experiment() opens this path as a CSV file, so it has to name a
    # file rather than the output directory itself.
    out_fpath = './output/em_experiment.csv'
    os.makedirs(os.path.dirname(out_fpath), exist_ok=True)
    run_experiment(out_fpath)
19 | """ 20 | assert out_fpath is not None, 'Must specify output filepath' 21 | # Create CSV output file if needed, write header 22 | with open(out_fpath, 'a', newline='') as f: 23 | writer = csv.writer(f) 24 | writer.writerow(['model_name', 'noise_level', 'epochs', 'trial', 'n_l', 'n_u', 25 | 'acc_h_te', 'acc_m_te', 26 | 'acc_comb_oracle_te', 'acc_comb_unsup_te', 'acc_comb_semisup_te', 27 | 'ece_m_te', 'ece_m_calibrated_oracle_te', 'ece_m_calibrated_unsup_te', 28 | 'ece_m_calibrated_semisup_te', 29 | 'ece_combo_oracle_te', 'ece_combo_unsup_te', 'ece_combo_semisup_te']) 30 | 31 | # Experiment parameters 32 | # model_names = ['alexnet', 'densenet161', 'googlenet', 'resnet152', 'vgg19'] 33 | model_names = ['densenet161'] 34 | noise_levels = [80, 95, 110, 125] 35 | # epochs = [None, 0, 1, 10] 36 | epochs = [None, 0, 10] 37 | noise_type = 'phase' 38 | n_runs = 25 39 | 40 | for model_name in tqdm(model_names, desc='Models', position=0, leave=True): 41 | for noise_level in tqdm(noise_levels, position=1, leave=False, desc='Noise Levels'): 42 | for epoch in tqdm(epochs, position=2, leave=False, desc='Epochs'): 43 | human_ids, y_h, y_true, model_probs = load_old_noisy_imagenet(noise_level, model_name, 44 | n_epochs=epoch, noise_type=noise_type) 45 | for i in tqdm(range(n_runs), position=3, leave=False, desc='Runs'): 46 | # Train/test split 70/30 47 | y_h_tr, y_h_te, model_probs_tr, model_probs_te, y_true_tr, y_true_te = train_test_split( 48 | y_h, model_probs, y_true, test_size=0.3, random_state=i) 49 | 50 | n_tr = y_h_tr.size # Number of training points 51 | 52 | # Evaluate accuracies of things that don't change with n_l 53 | acc_h_te = np.mean(y_h_te == y_true_te) 54 | y_m_te = np.argmax(model_probs_te, axis=1) 55 | acc_m_te = np.mean(y_m_te == y_true_te) 56 | # Evaluate calibration of things that don't change with n_l 57 | ece_m_te = cal.get_ece(model_probs_te, y_true_te) 58 | 59 | # ----- Unsupervised EM, all unlabeled data 60 | # Edge case with n_l = 0 61 | 
unsupervised_EM_combiner = UnsupervisedEMCombiner() 62 | unsupervised_EM_combiner.fit(model_probs_tr, y_h_tr) 63 | y_comb_prob_unsup_te = unsupervised_EM_combiner.combine_proba(model_probs_te, y_h_te) 64 | y_comb_unsup_te = unsupervised_EM_combiner.combine(model_probs_te, y_h_te) 65 | 66 | acc_combo_unsup_all_te = np.mean(y_comb_unsup_te == y_true_te) 67 | ece_m_calibrated_unsup_all_te = cal.get_ece(unsupervised_EM_combiner.calibrate(model_probs_te), 68 | y_true_te) 69 | ece_combo_unsup_all_te = cal.get_ece(y_comb_prob_unsup_te, y_true_te) 70 | 71 | with open(out_fpath, 'a', newline='') as f: 72 | writer = csv.writer(f) 73 | writer.writerow([model_name, noise_level, epoch, i, 0, n_tr, 74 | acc_h_te, acc_m_te, 75 | None, acc_combo_unsup_all_te, None, 76 | ece_m_te, None, ece_m_calibrated_unsup_all_te, 77 | None, 78 | None, ece_combo_unsup_all_te, None]) 79 | 80 | # ----- Fully supervised (oracle) combo 81 | # Edge case with n_l = all 82 | oracle_combiner = OracleCombiner() 83 | oracle_combiner.fit(model_probs_tr, y_h_tr, y_true_tr) 84 | y_comb_prob_oracle_te = oracle_combiner.combine_proba(model_probs_te, y_h_te) 85 | y_comb_oracle_te = oracle_combiner.combine(model_probs_te, y_h_te) 86 | 87 | acc_combo_oracle_all_te = np.mean(y_comb_oracle_te == y_true_te) 88 | ece_m_calibrated_oracle_all_te = cal.get_ece(oracle_combiner.calibrate(model_probs_te), 89 | y_true_te) 90 | ece_combo_oracle_all_te = cal.get_ece(y_comb_prob_oracle_te, y_true_te) 91 | 92 | with open(out_fpath, 'a', newline='') as f: 93 | writer = csv.writer(f) 94 | writer.writerow([model_name, noise_level, epoch, i, n_tr, 0, 95 | acc_h_te, acc_m_te, 96 | None, acc_combo_oracle_all_te, None, 97 | ece_m_te, None, ece_m_calibrated_oracle_all_te, 98 | None, 99 | None, ece_combo_oracle_all_te, None]) 100 | 101 | # TODO : Do these edge cases really need to be separated out?? 
102 | 103 | n_l_sizes = [10, 50, 100, 250, 500, 1000, 2500, 4500] # Amount of labeled data to use 104 | for n_l in tqdm(n_l_sizes, leave=False, desc='Num. Labels'): 105 | # Split into labeled / unlabeled datasets 106 | n_u = n_tr - n_l 107 | y_h_tr_u, y_h_tr_l = y_h_tr[n_l:], y_h_tr[:n_l] 108 | model_probs_tr_u, model_probs_tr_l = model_probs_tr[n_l:], model_probs_tr[:n_l] 109 | y_true_tr_l = y_true_tr[:n_l] 110 | 111 | # ----- Labeled data only 112 | oracle_combiner = OracleCombiner() 113 | oracle_combiner.fit(model_probs_tr_l, y_h_tr_l, y_true_tr_l) 114 | y_comb_prob_oracle_te = oracle_combiner.combine_proba(model_probs_te, y_h_te) 115 | y_comb_oracle_te = oracle_combiner.combine(model_probs_te, y_h_te) 116 | model_probs_calibrated_oracle_te = oracle_combiner.calibrate(model_probs_te) 117 | 118 | # ----- Semi-Supervised EM 119 | # Fit parameters, calibrate test set, combine test set 120 | semisup_combiner = SemiSupervisedEMCombiner() 121 | semisup_combiner.fit(model_probs_tr_u, y_h_tr_u, model_probs_tr_l, y_h_tr_l, y_true_tr_l) 122 | y_comb_prob_semisup_te = semisup_combiner.combine_proba(model_probs_te, y_h_te) 123 | y_comb_semisup_te = semisup_combiner.combine(model_probs_te, y_h_te) 124 | model_probs_calibrated_semisup_te = semisup_combiner.calibrate(model_probs_te) 125 | 126 | # ----- Unsupervised EM 127 | unsup_combiner = UnsupervisedEMCombiner() 128 | unsup_combiner.fit(model_probs_tr_u, y_h_tr_u) 129 | y_comb_prob_unsup_te = unsup_combiner.combine_proba(model_probs_te, y_h_te) 130 | y_comb_unsup_te = unsup_combiner.combine(model_probs_te, y_h_te) 131 | model_probs_calibrated_unsup_te = unsup_combiner.calibrate(model_probs_te) 132 | 133 | # ----- Evaluate accuracies 134 | acc_comb_oracle_te = np.mean(y_comb_oracle_te == y_true_te) 135 | acc_comb_semisup_te = np.mean(y_comb_semisup_te == y_true_te) 136 | acc_comb_unsup_te = np.mean(y_comb_unsup_te == y_true_te) 137 | 138 | # ----- Evaluate calibration 139 | # Evaluate ECE of just model 140 | 
ece_m_calibrated_oracle_te = cal.get_ece(model_probs_calibrated_oracle_te, y_true_te) 141 | ece_m_calibrated_unsup_te = cal.get_ece(model_probs_calibrated_unsup_te, y_true_te) 142 | ece_m_calibrated_semisup_te = cal.get_ece(model_probs_calibrated_semisup_te, y_true_te) 143 | 144 | # Evaluate ECE of combination 145 | ece_combo_oracle_te = cal.get_ece(y_comb_prob_oracle_te, y_true_te) 146 | ece_combo_unsup_te = cal.get_ece(y_comb_prob_unsup_te, y_true_te) 147 | ece_combo_semisup_te = cal.get_ece(y_comb_prob_semisup_te, y_true_te) 148 | 149 | # Write results to CSV 150 | with open(out_fpath, 'a', newline='') as f: 151 | writer = csv.writer(f) 152 | writer.writerow([model_name, noise_level, epoch, i, n_l, n_u, 153 | acc_h_te, acc_m_te, 154 | acc_comb_oracle_te, acc_comb_unsup_te, acc_comb_semisup_te, 155 | ece_m_te, ece_m_calibrated_oracle_te, ece_m_calibrated_unsup_te, 156 | ece_m_calibrated_semisup_te, 157 | ece_combo_oracle_te, ece_combo_unsup_te, ece_combo_semisup_te]) 158 | 159 | 160 | 161 | if __name__ == '__main__': 162 | out_fpath = './output/' 163 | if os.path.exists(out_fpath): 164 | print('Output filepath exists, dont overwrite it!') 165 | quit() 166 | run_experiment(out_fpath) 167 | -------------------------------------------------------------------------------- /experiments/weighted_semisup_em_experiment.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.insert(0, '../') 3 | 4 | from data_utils import * 5 | from combination_methods import * 6 | from tqdm.auto import tqdm 7 | from sklearn.model_selection import train_test_split 8 | import calibration as cal 9 | import csv 10 | import os.path 11 | 12 | # note: these experiments do not appear in our paper and may contain outdated code. 13 | 14 | 15 | def run_experiment(out_fpath=None): 16 | """ Evaluates the oracle and EM algorithms (in terms of accuracy and calibration) on Noisy ImageNet 17 | 18 | Done in a semi-supervised fashion. 
def run_experiment(out_fpath=None):
    """ Sweeps the unsupervised weight of the semi-supervised EM combiner on Noisy ImageNet.

    For each (model, noise level, epoch, trial) the data is split 70/30 into train/test. Accuracy and
    ECE on the test split are appended to the CSV at `out_fpath` for:
      * unsupervised EM fit on the whole training split (row with n_l = 0),
      * the oracle combiner fit on the whole training split (row with n_u = 0),
      * for each labeled-set size n_l: the oracle fit on the labeled points, unsupervised EM fit on
        the unlabeled points, and one semi-supervised EM fit per unsupervised weight.

    Parameters
    ----------
    out_fpath: string
        path of the CSV file results are appended to. The header row is appended on every call.
    """
    assert out_fpath is not None, 'Must specify output filepath'

    unsupervised_weights = [0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]
    # Placeholder cells for the per-weight columns in the two edge-case rows
    blank_per_weight = [None] * len(unsupervised_weights)

    def write_row(cells):
        # Every row is appended with a freshly opened handle
        with open(out_fpath, 'a', newline='') as csv_file:
            csv.writer(csv_file).writerow(cells)

    # Create CSV output file if needed, write header
    header = ['model_name', 'noise_level', 'epochs', 'trial', 'n_l', 'n_u',
              'acc_h_te', 'acc_m_te',
              'acc_comb_oracle_te', 'acc_comb_unsup_te']
    header += ['acc_comb_semisup_te_weight{}'.format(w) for w in unsupervised_weights]
    header += ['ece_m_te', 'ece_m_calibrated_oracle_te', 'ece_m_calibrated_unsup_te']
    header += ['ece_m_calibrated_semisup_te_weight{}'.format(w) for w in unsupervised_weights]
    header += ['ece_combo_oracle_te', 'ece_combo_unsup_te']
    header += ['ece_combo_semisup_te_weight{}'.format(w) for w in unsupervised_weights]
    write_row(header)

    # Experiment parameters
    model_names = ['densenet161']
    noise_levels = [80, 95, 110, 125]
    epochs = [None, 0, 10]
    noise_type = 'phase'
    n_runs = 10
    n_l_sizes = [10, 50, 100, 250, 500, 1000, 2500, 4500]  # Amount of labeled data to use

    for model_name in tqdm(model_names, desc='Models', position=0, leave=True):
        for noise_level in tqdm(noise_levels, position=1, leave=False, desc='Noise Levels'):
            for epoch in tqdm(epochs, position=2, leave=False, desc='Epochs'):
                _human_ids, y_h, y_true, model_probs = load_old_noisy_imagenet(
                    noise_level, model_name, n_epochs=epoch, noise_type=noise_type)
                for trial in tqdm(range(n_runs), position=3, leave=False, desc='Runs'):
                    # Train/test split 70/30, seeded by the trial index
                    split = train_test_split(y_h, model_probs, y_true, test_size=0.3, random_state=trial)
                    y_h_tr, y_h_te, probs_tr, probs_te, y_true_tr, y_true_te = split

                    n_tr = y_h_tr.size  # Number of training points

                    # Quantities that do not depend on the labeled-set size
                    acc_h_te = np.mean(y_h_te == y_true_te)
                    acc_m_te = np.mean(np.argmax(probs_te, axis=1) == y_true_te)
                    ece_m_te = cal.get_ece(probs_te, y_true_te)
                    row_prefix = [model_name, noise_level, epoch, trial]

                    # ----- Edge case n_l = 0: unsupervised EM on all training data
                    em_all = UnsupervisedEMCombiner()
                    em_all.fit(probs_tr, y_h_tr)
                    em_all_proba_te = em_all.combine_proba(probs_te, y_h_te)
                    em_all_pred_te = em_all.combine(probs_te, y_h_te)

                    acc_em_all = np.mean(em_all_pred_te == y_true_te)
                    ece_m_em_all = cal.get_ece(em_all.calibrate(probs_te), y_true_te)
                    ece_combo_em_all = cal.get_ece(em_all_proba_te, y_true_te)

                    write_row(row_prefix + [0, n_tr, acc_h_te, acc_m_te, None, acc_em_all]
                              + blank_per_weight
                              + [ece_m_te, None, ece_m_em_all]
                              + blank_per_weight
                              + [None, ece_combo_em_all]
                              + blank_per_weight)

                    # ----- Edge case n_l = all: fully supervised (oracle) combo
                    oracle_all = OracleCombiner()
                    oracle_all.fit(probs_tr, y_h_tr, y_true_tr)
                    oracle_all_proba_te = oracle_all.combine_proba(probs_te, y_h_te)
                    oracle_all_pred_te = oracle_all.combine(probs_te, y_h_te)

                    acc_oracle_all = np.mean(oracle_all_pred_te == y_true_te)
                    ece_m_oracle_all = cal.get_ece(oracle_all.calibrate(probs_te), y_true_te)
                    ece_combo_oracle_all = cal.get_ece(oracle_all_proba_te, y_true_te)

                    write_row(row_prefix + [n_tr, 0, acc_h_te, acc_m_te, acc_oracle_all, None]
                              + blank_per_weight
                              + [ece_m_te, ece_m_oracle_all, None]
                              + blank_per_weight
                              + [ece_combo_oracle_all, None]
                              + blank_per_weight)

                    for n_l in tqdm(n_l_sizes, leave=False, desc='Num. Labels'):
                        # First n_l training points are labeled, the rest unlabeled
                        n_u = n_tr - n_l
                        y_h_lab, y_h_unl = y_h_tr[:n_l], y_h_tr[n_l:]
                        probs_lab, probs_unl = probs_tr[:n_l], probs_tr[n_l:]
                        y_true_lab = y_true_tr[:n_l]

                        # ----- Labeled data only
                        oracle = OracleCombiner()
                        oracle.fit(probs_lab, y_h_lab, y_true_lab)
                        oracle_proba_te = oracle.combine_proba(probs_te, y_h_te)
                        oracle_pred_te = oracle.combine(probs_te, y_h_te)
                        oracle_calibrated_te = oracle.calibrate(probs_te)

                        # ----- Semi-Supervised EM, one fit per unsupervised weight
                        semisup_proba_te = []
                        semisup_pred_te = []
                        semisup_calibrated_te = []
                        for weight in tqdm(unsupervised_weights, leave=False, desc='Weights'):
                            semisup = SemiSupervisedEMCombiner(unsupervised_weight=weight)
                            semisup.fit(probs_unl, y_h_unl, probs_lab, y_h_lab, y_true_lab)
                            semisup_proba_te.append(semisup.combine_proba(probs_te, y_h_te))
                            semisup_pred_te.append(semisup.combine(probs_te, y_h_te))
                            semisup_calibrated_te.append(semisup.calibrate(probs_te))

                        # ----- Unsupervised EM on the unlabeled points
                        unsup = UnsupervisedEMCombiner()
                        unsup.fit(probs_unl, y_h_unl)
                        unsup_proba_te = unsup.combine_proba(probs_te, y_h_te)
                        unsup_pred_te = unsup.combine(probs_te, y_h_te)
                        unsup_calibrated_te = unsup.calibrate(probs_te)

                        # ----- Evaluate accuracies
                        acc_oracle = np.mean(oracle_pred_te == y_true_te)
                        acc_semisup = [np.mean(pred == y_true_te) for pred in semisup_pred_te]
                        acc_unsup = np.mean(unsup_pred_te == y_true_te)

                        # ----- Evaluate ECE of just the (calibrated) model
                        ece_m_oracle = cal.get_ece(oracle_calibrated_te, y_true_te)
                        ece_m_unsup = cal.get_ece(unsup_calibrated_te, y_true_te)
                        ece_m_semisup = [cal.get_ece(calibrated, y_true_te)
                                         for calibrated in semisup_calibrated_te]

                        # ----- Evaluate ECE of the combination
                        ece_combo_oracle = cal.get_ece(oracle_proba_te, y_true_te)
                        ece_combo_unsup = cal.get_ece(unsup_proba_te, y_true_te)
                        ece_combo_semisup = [cal.get_ece(proba, y_true_te) for proba in semisup_proba_te]

                        # Write results to CSV
                        write_row(row_prefix + [n_l, n_u, acc_h_te, acc_m_te, acc_oracle, acc_unsup]
                                  + acc_semisup
                                  + [ece_m_te, ece_m_oracle, ece_m_unsup]
                                  + ece_m_semisup
                                  + [ece_combo_oracle, ece_combo_unsup]
                                  + ece_combo_semisup)
def learn_calibrator(cfg, logits, logodds, y, feats=None, **kwargs):
    """
    Use this function to access all calibrators (binning).
    Inputs are the raw network logits and one-hot labels.
    The kwargs can be used to send other arguments which some calibrators might need.

    Parameters
    ----------
    cfg: io.AttrDict
        config dictionary containing all information.
    logits: numpy ndarray
        raw network logits
    logodds: numpy ndarray
        raw network logodds. use utils.quick_logits_to_logodds(logits) to get them
    y: numpy ndarray
        one-hot target labels
    feats:
        accepted for interface compatibility; not used and not forwarded.
    kwargs: dict
        extra arguments which some calibrators require
    Returns
    -------

    cal_obj: calibrators_*.BaseCalibrator
        calibrator object. can be used given logits as input
    """
    return learn_binning(cfg, logits, logodds, y, **kwargs)


def learn_binning(cfg, logits, logodds, y, **kwargs):
    """
    Same as learn_calibrator() but this func specifically learns the logodds binning methods.

    The calibrator class is selected from the config:
      * cfg.Q_method is None                    -> scalers_np.Raw (no calibration)
      * cfg.Q_method in imax / eqmass / eqsize  -> a histogram binner picked by
        cfg.cal_setting ("CW", "top1" or "sCW")

    numpy's global random seed is set to cfg.Q_rnd_seed before fitting.

    Raises
    ------
    Exception
        if cfg.Q_method is not one of the known methods.
    ValueError
        if cfg.Q_method is a binning method but cfg.cal_setting is not one of
        "CW", "top1", "sCW".
    """
    # set all seeds
    np.random.seed(cfg.Q_rnd_seed)

    if cfg.Q_method is None:
        calibrator_cls = scalers_np.Raw
    elif cfg.Q_method in ("imax", "eqmass", "eqsize"):
        if cfg.cal_setting == "CW":
            calibrator_cls = binners.HistogramBinninerCW
        elif cfg.cal_setting == "top1":
            calibrator_cls = binners.HistogramBinninerTop1
        elif cfg.cal_setting == "sCW":
            calibrator_cls = binners.HistogramBinninerSharedCW
        else:
            # Without this branch the class variable stayed unbound and the
            # caller got an opaque UnboundLocalError further down.
            raise ValueError("Calibration setting unknown: %r (expected 'CW', 'top1' or 'sCW')"
                             % (cfg.cal_setting,))
    else:
        raise Exception("Quantization method unknown!")

    cal_obj = calibrator_cls(cfg)
    cal_obj.fit(logits, logodds, y, **kwargs)
    return cal_obj
class BaseCalibrator():
    """
    A generic base class.

    Subclasses list the names of the attributes that make up their learned
    state in self.parameter_list; save_params / load_params persist exactly
    those attributes.
    """
    def __init__(self):
        self.parameter_list = []

    def fit(self, logits, logodds, y, **kwargs):
        """
        Function to learn the model parameters using the input data X and labels y.

        Parameters
        ----------
        logits: numpy ndarray
            input data to the calibrator.
        logodds: numpy ndarray
            input data to the calibrator.
        y: numpy ndarray
            target labels
        Returns
        -------

        """
        raise NotImplementedError("Subclass must implement this method.")

    def calibrate(self, logits, logodds, **kwargs):
        """
        Calibrate the data using the learned parameters after fit was already called.
        """
        raise NotImplementedError("Subclass must implement this method.")

    def __call__(self, *args, **kwargs):
        # Calling the object is shorthand for calibrate()
        return self.calibrate(*args, **kwargs)

    def save_params(self, fpath):
        """
        Save the parameters of the model. The parameters which need to be saved are determined by self.parameter_list.
        Saves a single hdf5 file with keys being the parameter names.
        Does nothing when self.parameter_list is empty.

        Parameters
        ----------
        fpath: string
            filepath to save the hdf5 file with model parameters
        Returns
        -------
        """
        if self.parameter_list:
            data_to_save = io.AttrDict()
            for key in self.parameter_list:
                data_to_save[key] = getattr(self, key)
            io.deepdish_write(fpath, data_to_save)
            print(io.pc._OKGREEN("Parameters written to fpath: %s"%(fpath)))

    def load_params(self, fpath):
        """
        Load the parameters of the model. The parameters which need to be loaded are determined by self.parameter_list.
        Loads a single hdf5 file and assigns the attributes to the object using keys as the parameter names.
        Does nothing when self.parameter_list is empty.

        Parameters
        ----------
        fpath: string
            filepath of the hdf5 file to load the model parameters from
        Returns
        -------
        """
        if self.parameter_list:
            data_to_load = io.deepdish_read(fpath)
            for key in self.parameter_list:
                setattr(self, key, data_to_load[key])
            print(io.pc._OKGREEN("Parameters loaded and updated from fpath: %s"%(fpath)))


class Raw(BaseCalibrator):
    """
    The raw outputs without any calibration. Identity function.
    """
    def __init__(self, cfg=None):
        # `cfg` is accepted so Raw can be constructed like the other calibrators; it is not used.
        # The one-argument form super(Raw) is an *unbound* super object, so the
        # previous `super(Raw).__init__()` never ran BaseCalibrator.__init__ and
        # left the instance without `parameter_list`.
        super().__init__()

    def fit(self, logits, logodds, y, **kwargs):
        # Nothing to learn
        return self

    def calibrate(self, logits, logodds, **kwargs):
        """
        Returns the inputs unchanged together with utils.to_sigmoid(logodds) as probabilities.
        """
        probs = utils.to_sigmoid(logodds)
        return logits, logodds, probs

    def load_params(self, fpath):
        # No parameters to load
        return None
def _bernoulli_prob(logit):
    # Sigmoid written via tanh: 0.5 + 0.5 * tanh(x / 2)
    return 0.5 + 0.5*np.tanh(logit/2.)


def CE_mtx(logits_p_in, logits_q_in):
    """Pairwise matrix between two logit vectors; entry [i, j] pairs logits_p_in[i] with logits_q_in[j].

    Each entry is  -q * sigmoid(p) + max(0, q) + log(1 + exp(-|q|)).
    The result has shape (len(logits_p_in), len(logits_q_in)) and dtype float64.
    """
    col_p = logits_p_in.astype(np.float64).reshape(logits_p_in.shape[0], 1)
    row_q = logits_q_in.astype(np.float64).reshape(1, logits_q_in.shape[0])
    # Terms are accumulated left to right, in the same order as the closed-form expression
    out = - row_q * _bernoulli_prob(col_p)
    out = out + np.maximum(0., row_q)
    out = out + np.log(1. + np.exp(-abs(row_q)))
    return out


def KL_mtx(logits_p_in, logits_q_in):
    """Pairwise matrix between two logit vectors; entry [i, j] pairs logits_p_in[i] with logits_q_in[j].

    Each entry is
        (p - q) * sigmoid(p) + max(0, q) + log(1 + exp(-|q|)) - max(0, p) - log(1 + exp(-|p|)).
    The result has shape (len(logits_p_in), len(logits_q_in)) and dtype float64.
    """
    col_p = logits_p_in.astype(np.float64).reshape(logits_p_in.shape[0], 1)
    row_q = logits_q_in.astype(np.float64).reshape(1, logits_q_in.shape[0])
    out = (col_p - row_q) * _bernoulli_prob(col_p)
    out = out + np.maximum(0., row_q)
    out = out + np.log(1. + np.exp(-abs(row_q)))
    out = out - np.maximum(0., col_p)
    out = out - np.log(1. + np.exp(-abs(col_p)))
    return out


def JSD_mtx(logits_p, logits_q):
    """Pairwise symmetrised matrix between two logit vectors.

    For every pair (p_i, q_j) the midpoint logit m = 0.5 * p_i + 0.5 * q_j is formed and the
    result is the average of the KL-style term from p_i to m and the one from q_j to m.
    The result has shape (len(logits_p), len(logits_q)) and dtype float64.
    """
    # p -> midpoint, laid out as (n_p, n_q)
    src_a = logits_p.astype(np.float64).reshape(logits_p.shape[0], 1)
    mid_a = logits_q.astype(np.float64).reshape(1, logits_q.shape[0])
    mid_a = mid_a * 0.5 + 0.5 * src_a
    half_a = (src_a - mid_a) * _bernoulli_prob(src_a)
    half_a = half_a + np.maximum(0., mid_a)
    half_a = half_a + np.log(1. + np.exp(-abs(mid_a)))
    half_a = half_a - np.maximum(0., src_a)
    half_a = half_a - np.log(1. + np.exp(-abs(src_a)))

    # q -> midpoint, laid out as (n_q, n_p) and transposed at the end
    mid_b = logits_p.astype(np.float64).reshape(1, logits_p.shape[0])
    src_b = logits_q.astype(np.float64).reshape(logits_q.shape[0], 1)
    mid_b = src_b * 0.5 + 0.5 * mid_b
    half_b = (src_b - mid_b) * _bernoulli_prob(src_b)
    half_b = half_b + np.maximum(0., mid_b)
    half_b = half_b + np.log(1. + np.exp(-abs(mid_b)))
    half_b = half_b - np.maximum(0., src_b)
    half_b = half_b - np.log(1. + np.exp(-abs(src_b)))

    return half_a * 0.5 + half_b.transpose()*0.5
def kmeans_pp_init(X, n_clusters, random_state, n_local_trials=None, mode='jsd'):
    """Init n_clusters seeds according to k-means++

    Parameters
    ----------
    X : array or sparse matrix, shape (n_samples, n_features)
        The data to pick seeds for. To avoid memory copy, the input data
        should be double precision (dtype=np.float64).
        In the 'kl', 'ce' and 'jsd' modes only the first column X[:, 0] is
        used when computing distances.

    n_clusters : integer
        The number of seeds to choose

    random_state : int, None or RandomState instance
        The generator used to initialize the centers. Use an int to make the
        randomness deterministic. A RandomState instance is used as-is;
        anything else is passed to np.random.RandomState as the seed.

    n_local_trials : integer, optional
        The number of seeding trials for each center (except the first),
        of which the one reducing inertia the most is greedily chosen.
        Set to None to make the number of trials depend logarithmically
        on the number of seeds (2+log(k)); this is the default.

    mode : {'euclidean', 'kl', 'ce', 'jsd'}, default 'jsd'
        Distance used for seeding: squared Euclidean distance, or the pairwise
        matrices produced by KL_mtx, CE_mtx and JSD_mtx.

    Returns
    -------
    centers : array, shape (n_clusters, n_features)
        The chosen seeds (rows of X).
    center_ids : array of int64, shape (n_clusters,)
        Row indices into X of the chosen seeds.

    Raises
    ------
    ValueError
        If `mode` is not one of the supported distances.

    Notes
    -----
    Selects initial cluster centers for k-mean clustering in a smart way
    to speed up convergence. see: Arthur, D. and Vassilvitskii, S.
    "k-means++: the advantages of careful seeding". ACM-SIAM symposium
    on Discrete algorithms. 2007

    Version ported from http://www.stanford.edu/~darthur/kMeansppTest.zip,
    which is the implementation used in the aforementioned paper.
    """
    logit_distances = {'kl': KL_mtx, 'ce': CE_mtx, 'jsd': JSD_mtx}
    if mode != 'euclidean' and mode not in logit_distances:
        raise ValueError("Unknown distance in Kmeans++ initialization")

    n_samples, n_features = X.shape
    # The docstring has always promised that a RandomState instance is accepted,
    # but RandomState(...) does not take one as a seed, so reuse it directly.
    if not isinstance(random_state, np.random.RandomState):
        random_state = np.random.RandomState(random_state)
    centers = np.empty((n_clusters, n_features), dtype=X.dtype)
    center_ids = np.empty((n_clusters,), dtype=np.int64)

    x_squared_norms = row_norms(X, squared=True)
    # Set the number of local seeding trials if none is given
    if n_local_trials is None:
        # This is what Arthur/Vassilvitskii tried, but did not report
        # specific results for other than mentioning in the conclusion
        # that it helped.
        n_local_trials = 2 + int(np.log(n_clusters))

    # Pick first center randomly
    center_id = random_state.randint(n_samples)
    center_ids[0] = center_id
    if sp.issparse(X):
        centers[0] = X[center_id].toarray()
    else:
        centers[0] = X[center_id]

    # Initialize list of closest distances and calculate current potential
    if mode == 'euclidean':
        closest_dist_sq = euclidean_distances(centers[0, np.newaxis], X, Y_norm_squared=x_squared_norms, squared=True)
    else:
        closest_dist_sq = logit_distances[mode](X[:, 0], centers[0]).transpose()

    current_pot = closest_dist_sq.sum()

    # Pick the remaining n_clusters-1 points
    for c in range(1, n_clusters):
        # Choose center candidates by sampling with probability proportional
        # to the squared distance to the closest existing center
        rnd_samples = random_state.random_sample(n_local_trials)
        # Second draw is only a sanity check that consecutive draws differ. It is
        # kept because it advances the generator: dropping it would change which
        # centers a given seed produces.
        test1 = random_state.random_sample(n_local_trials)
        rand_vals = rnd_samples * current_pot
        assert np.any(abs(test1 - rnd_samples) > 1e-4)

        candidate_ids = np.searchsorted(stable_cumsum(closest_dist_sq), rand_vals)
        # XXX: numerical imprecision can result in a candidate_id out of range
        np.clip(candidate_ids, None, closest_dist_sq.size - 1, out=candidate_ids)

        # Compute distances to center candidates
        if mode == 'euclidean':
            distance_to_candidates = euclidean_distances(X[candidate_ids], X, Y_norm_squared=x_squared_norms, squared=True)
        else:
            distance_to_candidates = logit_distances[mode](X[:, 0], X[candidate_ids, 0]).transpose()
        # update closest distances squared and potential for each candidate
        np.minimum(closest_dist_sq, distance_to_candidates, out=distance_to_candidates)
        candidates_pot = distance_to_candidates.sum(axis=1)

        # Decide which candidate is the best
        best_trial = np.argmin(candidates_pot)
        current_pot = candidates_pot[best_trial]
        closest_dist_sq = distance_to_candidates[best_trial]
        best_candidate = candidate_ids[best_trial]
        center_ids[c] = best_candidate
        # Permanently add best center candidate found in local tries
        if sp.issparse(X):
            centers[c] = X[best_candidate].toarray()
        else:
            centers[c] = X[best_candidate]

    return centers, center_ids
# Maximization-based Binning" accepted at ICLR 2021.
# All rights reserved.
###
# The paper "Multi-Class Uncertainty Calibration via Mutual Information Maximization-based Binning" accepted at ICLR 2021.
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see .
#
# Author: Kanil Patel
# -*- coding: utf-8 -*-
'''
calibration_metrics.py
evaluations

Created by Kanil Patel on 07/27/20.
Copyright 2020. Kanil Patel. All rights reserved.
'''
import numpy as np
import imax_calib.hb_utils as hb_utils
import imax_calib.utils as utils
import imax_calib.io as io
from scipy.cluster.vq import kmeans,vq
import scipy.cluster.vq
import os
import contextlib

from imax_calib.calibrators.binners import run_imax


def _resolve_ece_measure(ece_approx):
    """
    Look up the module-level function named `measure_<ece_approx>_calibration`.

    This is a plain namespace lookup; it replaces an `eval` on a formatted string.

    Parameters
    ----------
    ece_approx: string
        Name of the ECE approximator, e.g. "dECE", "mECE", "iECE" or "kECE".

    Returns
    -------
    measure_fn: callable
        The matching `measure_*_calibration` function of this module.

    Raises
    ------
    ValueError
        If this module defines no callable with the derived name.
    """
    measure_fn = globals().get("measure_%s_calibration"%(ece_approx))
    if not callable(measure_fn):
        raise ValueError("Unknown ECE approximator: %r"%(ece_approx,))
    return measure_fn


def compute_top_1_and_CW_ECEs(multi_cls_probs, multi_cls_labels, list_approximators=["dECE", "mECE", "iECE", "kECE"], num_bins=100, threshold_mode='class'):
    """
    Given the multi-class predictions and labels, this function computes the top1 and CW ECEs. Will compute it by calling the other functions in this script.

    Parameters:
    -----------
    multi_cls_probs: 2D ndarray
        predicted probabilities
    multi_cls_labels: 1D or 2D ndarray
        label indices or one-hot labels. Will be converted to one-hot
    list_approximators: list of strings
        ECE approximators to evaluate. Each entry X is resolved to the function `measure_X_calibration` of this module.
        The default list is only read, never modified.
    num_bins: int
        number of bins forwarded to every approximator
    threshold_mode: 'class' or None
        'class': the class-wise ECEs only use samples whose class probability is above 1/n_classes.
        None: the class-wise ECEs only use samples whose class probability is above 0.
        The top-1 ECE is always computed with the default threshold of the approximator.

    Return:
    -------
    ece_dict: dict
        Dictionary with all the ECE estimates, stored under the keys "top_1_<approximator>" and "cw_<approximator>"

    Raises:
    -------
    ValueError
        If `threshold_mode` is neither 'class' nor None, or if an approximator name is unknown.
    """
    assert len(multi_cls_probs.shape)==2
    if len(multi_cls_labels.shape)==1: # not one-hot. so convert to one-hot
        multi_cls_labels = np.eye(multi_cls_probs.shape[1])[multi_cls_labels]

    n_classes = multi_cls_probs.shape[1]

    # Resolve the threshold once. Any other mode used to leave `threshold` unbound
    # and only failed later, inside the class-wise loop.
    if threshold_mode == 'class':
        threshold = 1.0/n_classes
    elif threshold_mode is None:
        threshold = 0.
    else:
        raise ValueError("Unknown threshold_mode: %r (expected 'class' or None)"%(threshold_mode,))

    ece_evals_dict = io.AttrDict({})
    for ece_approx in list_approximators:
        measure_fn = _resolve_ece_measure(ece_approx)

        top_1_preds = multi_cls_probs.max(axis=-1)
        top_1_correct = multi_cls_probs.argmax(axis=-1) == multi_cls_labels.argmax(axis=-1)
        top_1_ECE = measure_fn(pred_probs=top_1_preds, correct=top_1_correct, num_bins=num_bins)["ece"]

        cw_ECEs = []
        for class_idx in range(n_classes):
            cw_ECE = measure_fn(pred_probs=multi_cls_probs[:, class_idx],
                                correct=multi_cls_labels[:, class_idx],
                                num_bins=num_bins, threshold=threshold)["ece"]
            cw_ECEs.append(cw_ECE)
        mean_cw_ECE = np.mean(cw_ECEs)

        ece_evals_dict["top_1_%s"%(ece_approx)] = top_1_ECE
        ece_evals_dict["cw_%s"%(ece_approx)] = mean_cw_ECE

    return ece_evals_dict


def _ece(avg_confs, avg_accs, counts):
    """
    Helper function to compute the Expected Calibration Error.

    Parameters
    ----------
    avg_confs: Averaged probability of predictions per bin (confidence)
    avg_accs: Averaged true accuracy of predictions per bin
    counts: Number of predictions per bin

    Returns
    -------
    ece: float - calibration error (count-weighted mean of |confidence - accuracy| over the bins)
    """
    return np.sum((counts / counts.sum()) * np.absolute(avg_confs- avg_accs))


def measure_iECE_calibration(pred_probs, correct, num_bins, threshold=-1):
    """
    Compute the calibration curve using I-Max binning scheme. This will run the I-Max algorithm on the TEST set and get the bin boundaries.

    Parameters
    ----------
    correct: numpy binary array
        label indicating if sample is positive or negative

    for rest see calibration_error_and_curve()

    Returns
    -------
    see calibration_error_and_curve()

    """
    logodds = utils.to_logodds(pred_probs)
    # silence everything I-Max writes to stdout/stderr while it fits the bins
    with open(os.devnull, "w") as f, contextlib.redirect_stdout(f), contextlib.redirect_stderr(f):
        logdata = run_imax(logodds, correct, num_bins, log_every_steps=None, logfpath=None )
    bin_boundaries = logdata.bin_boundaries[-1] # last logged set of boundaries
    assigned = hb_utils.bin_data(logodds, bin_boundaries)
    return calibration_error_and_curve(pred_probs, correct, assigned, num_bins, threshold)


def measure_dECE_calibration(pred_probs, correct, num_bins=100, threshold=-1):
    """
    Compute the calibration curve using the equal size binning scheme (i.e. equal size bins)and computes the calibration error given this binning scheme (i.e. dECE).

    Parameters
    ----------
    see calibration_error_and_curve()
    Returns
    -------
    see calibration_error_and_curve()

    """
    assert len(pred_probs.shape)==1
    # boundaries come back in logodds space; map them to probability space before binning the probabilities
    bin_boundaries_prob = utils.to_sigmoid( hb_utils.nolearn_bin_boundaries(num_bins, binning_scheme="eqsize") )
    assigned = hb_utils.bin_data(pred_probs, bin_boundaries_prob)
    return calibration_error_and_curve(pred_probs, correct, assigned, num_bins, threshold)


def measure_mECE_calibration(pred_probs, correct, num_bins=100, threshold=-1):
    """
    Compute the calibration curve using the equal mass binning scheme (i.e. equal mass bins)and computes the calibration error given this binning scheme (i.e. mECE).

    Parameters
    ----------
    see calibration_error_and_curve()
    Returns
    -------
    see calibration_error_and_curve()
    """
    assert len(pred_probs.shape)==1
    logodds = utils.to_logodds(pred_probs)
    # percentile boundaries are taken on the logodds and then mapped back to probability space
    bin_boundaries_prob = utils.to_sigmoid( hb_utils.nolearn_bin_boundaries(num_bins, binning_scheme="eqmass", x=logodds) )
    assigned = hb_utils.bin_data(pred_probs, bin_boundaries_prob)
    return calibration_error_and_curve(pred_probs, correct, assigned, num_bins, threshold)


def measure_kECE_calibration(pred_probs, correct, num_bins=100, threshold=-1):
    """
    Compute the calibration curve using the kmeans binning scheme (i.e. use kmeans to cluster the data and then determine the bin assignments) and computes the calibration error given this binning scheme (i.e. kECE).

    Parameters
    ----------
    see calibration_error_and_curve()
    Returns
    -------
    see calibration_error_and_curve()
    """
    assert len(pred_probs.shape)==1
    centroids,_ = scipy.cluster.vq.kmeans(pred_probs, num_bins)
    cluster_ids, _ = scipy.cluster.vq.vq(pred_probs, centroids)
    # `np.int` was only an alias of the builtin `int` and no longer exists in NumPy >= 1.24
    cluster_ids = cluster_ids.astype(int)
    return calibration_error_and_curve(pred_probs, correct, cluster_ids, num_bins, threshold)


def measure_quantized_calibration(pred_probs, correct, assigned, num_bins=100, threshold=-1):
    """
    Compute the calibration curve given the bin assignments (i.e. quantized values).
    """
    assert len(pred_probs.shape)==1
    return calibration_error_and_curve(pred_probs, correct, assigned, num_bins, threshold)


def calibration_error_and_curve(pred_probs, correct, assigned, num_bins=100, threshold=-1):
    """
    Compute the calibration curve and calibration error. The threshold float will determine which samples to ignore because its confidence is very low.

    Parameters
    ----------
    threshold: float
        only samples with `pred_probs > threshold` are used
    for rest see calibration_curve_quantized()

    Returns
    -------
    results: dict
        dictionary with calibration information. Keys: "ece", "prob_pred", "prob_true", "counts", "counts_unfilt", "threshold" and "cov" (fraction of samples above the threshold)
    """
    assert len(pred_probs.shape)==1
    mask = pred_probs>threshold
    pred_probs, correct, assigned = pred_probs[mask], correct[mask], assigned[mask]
    cov = mask.mean()
    prob_pred, prob_true, counts, counts_unfilt = calibration_curve_quantized(pred_probs, correct, assigned=assigned, num_bins=num_bins)
    ece = _ece(prob_pred, prob_true, counts)
    return {"ece": ece, "prob_pred":prob_pred, "prob_true":prob_true, "counts":counts, "counts_unfilt":counts_unfilt, "threshold":threshold, "cov":cov}


def calibration_curve_quantized(pred_probs, correct, assigned, num_bins=100):
    """
    Get the calibration curve given the bin assignments, samples and sample-correctness.

    Parameters
    ----------
    pred_probs: numpy ndarray
        numpy array with predicted probabilities (i.e. confidences)
    correct: numpy ndarray
        0/1 indicating if the sample was correctly classified or not
    assigned: int numpy ndarray
        bin id (0-indexed) of each sample
    num_bins: int
        number of bins for quantization
    Returns
    -------
    prob_pred: for each non-empty bin the avg. confidence
    prob_true: for each non-empty bin the avg. accuracy
    counts: number of samples in each non-empty bin
    counts_unfilt: same as `counts` but also including zero bins
    """
    assert len(pred_probs.shape)==1
    bin_sums_pred = np.bincount(assigned, weights=pred_probs, minlength=num_bins)
    bin_sums_true = np.bincount(assigned, weights=correct, minlength=num_bins)
    counts = np.bincount(assigned, minlength=num_bins)
    filt = counts > 0 # drop empty bins so the averages never divide by zero
    prob_pred = (bin_sums_pred[filt] / counts[filt])
    prob_true = (bin_sums_true[filt] / counts[filt])
    counts_unfilt = counts
    counts = counts[filt]
    return prob_pred, prob_true, counts, counts_unfilt


# --------------------------------------------------------------------------------
# /imax_calib/hb_utils.py:
# --------------------------------------------------------------------------------
#!/usr/local/bin/python3
# Copyright (c) 2021 Robert Bosch GmbH Copyright holder of the paper "Multi-Class Uncertainty Calibration via Mutual Information Maximization-based Binning" accepted at ICLR 2021.
# All rights reserved.
##
# The paper "Multi-Class Uncertainty Calibration via Mutual Information Maximization-based Binning" accepted at ICLR 2021.
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see .
#
# Author: Kanil Patel
# -*- coding: utf-8 -*-
'''
hb_utils.py
imax_calib

Contains all util functions for any histogram binning (hb) operations.

Created by Kanil Patel on 07/27/20.
Copyright 2020. Kanil Patel. All rights reserved.
'''
import numpy as np
import scipy
import scipy.stats
import scipy.integrate as integrate
import imax_calib.utils as utils
import imax_calib.io as io

##################
# Binning utils
#################
def _percentile_lower(x, q):
    """
    `np.percentile` with the 'lower' rule, working with both the new and the old keyword name.

    NumPy 1.22 renamed the keyword `interpolation` to `method` and deprecated the old name.
    """
    try:
        return np.percentile(x, q, method='lower')
    except TypeError:
        # older NumPy releases do not know the `method` keyword yet
        return np.percentile(x, q, interpolation='lower')


def nolearn_bin_boundaries(num_bins, binning_scheme, x=None):
    """
    Get the bin boundaries (in logit space) of the bins. This function returns only the bin boundaries which do not include any type of learning.
    For example: equal mass bins, equal size bins or overlap bins.

    Parameters
    ----------
    num_bins: int
        Number of bins
    binning_scheme: string
        The way the bins should be placed.
            'eqmass': each bin has the same portion of samples assigned to it. Requires that `x is not None`.
            'eqsize': equal spaced bins in `probability` space. Will get equal spaced bins in range [0,1] and then convert to logodds.
            'custom_range[min_lambda,max_lambda]': equal spaced bins in `logit` space given some custom range.
    x: numpy array (1D,)
        array with the 1D data to determine the eqmass bins.

    Returns
    -------
    bins: numpy array (num_bins-1,)
        Returns the bin boundaries. It will return num_bins-1 bin boundaries in logit space. Open ended range on both sides.

    Raises
    ------
    ValueError
        If `binning_scheme` is none of the schemes listed above.
    """
    if binning_scheme=="eqmass":
        assert x is not None and len(x.shape)==1
        bins = np.linspace(1.0/num_bins, 1 - 1.0 / num_bins, num_bins-1) # num_bins-1 boundaries for open ended sides
        bins = _percentile_lower(x, bins * 100) # data will ensure its in Logit space
    elif binning_scheme=="eqsize": # equal spacing in logit space is not the same in prob space because of sigmoid non-linear transformation
        bins = utils.to_logodds( np.linspace(1.0/num_bins, 1 - 1.0 / num_bins, num_bins-1) ) # num_bins-1 boundaries for open ended sides
    elif "custom_range" in binning_scheme: # used for example when you want bins at overlap regions. then custom_range should be [ min p(y=1), max p(y=0) ]. e.g. custom_range[-5,8]
        # NOTE(review): `eval` executes whatever follows "custom_range". Only pass trusted
        # strings here; consider ast.literal_eval if the scheme can come from external input.
        custom_range = eval(binning_scheme.replace("custom_range", ""))
        assert isinstance(custom_range, list) and (custom_range[0] <= custom_range[1])
        bins = np.linspace(custom_range[0], custom_range[1], num_bins-1) # num_bins-1 boundaries for open ended sides
    else:
        # previously fell through and failed on the unbound `bins` below
        raise ValueError("Unknown binning_scheme: %r"%(binning_scheme,))
    return bins

def bin_data(x, bins):
    """
    Given bin boundaries quantize the data (x). When ndims(x)>1 it will flatten the data, quantize and then reshape back to orig shape.
    Returns the following quantized values for num_bins=5 and bins = [2.5, 5.0, 7.5, 10.0]\n
    quantize: \n
              (-inf, 2.5) -> 0\n
              [2.5, 5.0) -> 1\n
              [5.0, 7.5) -> 2\n
              [7.5, 10.0) -> 3\n
              [10.0, inf) -> 4\n

    Parameters
    ----------
    x: numpy ndarray
        Network logits as numpy array
    bins: numpy ndarray
        location of the (num_bins-1) bin boundaries

    Returns
    -------
    assigned: int numpy ndarray
        For each sample, this contains the bin id (0-indexed) to which the sample belongs.
    """
    orig_shape = x.shape
    # if not 1D data. so need to reshape data, then quantize, then reshape back
    needs_reshape = len(orig_shape)>1 or orig_shape[-1]!=1
    if needs_reshape: x = x.flatten()
    assigned = np.digitize(x, bins) # bin each input in data. np.digitize will always return a valid index between 0 and num_bins-1 whenever bins has length (num_bins-1) to cater for the open range on both sides
    if needs_reshape: assigned = np.reshape(assigned, orig_shape)
    # `np.int` was only an alias of the builtin `int` and no longer exists in NumPy >= 1.24
    return assigned.astype(int)



######### Quantize data
def quantize_logodds(x, bins, bin_reprs, return_probs=True):
    """
    Quantize logodds (x) using bin boundaries (bins) and reprs in logit space and then convert to prob space if `return_probs=True`.

    Parameters
    ----------
    x: numpy ndarray
        Network logits as numpy array
    bins: numpy ndarray
        Location of the (num_bins-1) bin boundaries
    bin_reprs: numpy ndarray
        Bin representations in logodds space. Indexed with the bin ids, so it needs len(bins)+1 entries.
    return_probs: boolean (default: True)
        All operations take place in logodds space. Setting this to true will ensure that the values returned are in probability space (i.e. it will convert the quantized values from logodds to sigmoid before returning them)

    Returns
    -------
    quant_output: numpy ndarray
        The output of the quantization based on the bins and bin_reprs. Either the output will be in logodds space (i.e. return_probs=False) or in probability space.
    assigned: int numpy ndarray
        The bin assignment integers for each sample.
    """
    assigned = bin_data(x, bins) # log space
    quant_output = bin_reprs[assigned] # fill up representations based on assignments
    if return_probs: quant_output = utils.to_sigmoid(quant_output) # prob space
    # the original returned the undefined name `pred_probs` here, which raised a NameError on every call
    return quant_output, assigned


########### Bin boundary update
def bin_boundary_update_closed_form(representations):
    """
    Closed form update of boundaries. stationary point when log(p(y=1|lambda)) - log(p(y=0|lambda)) = log(log(xxx)/log(xxx)) term. LHS side is logodds/boundaries when p(y|lambda) modelled with sigmoid (e.g. PPB )

    Parameters
    ----------
    representations: numpy ndarray
        bin representations in logodds space

    Returns
    -------
    bin_boundaries: numpy ndarray
        sorted boundaries, one fewer than there are representations
    """
    temp_log = 1. + np.exp(-1*np.abs(representations))
    temp_log[temp_log==0] = utils.EPS
    # max(0, r) + log(1 + exp(-|r|)) is the overflow-safe form of log(1 + exp(r))
    logphi_a = np.maximum(0., representations) + np.log(temp_log)
    logphi_b = np.maximum(0., -1*representations) + np.log(temp_log)
    assert np.any(np.sign(logphi_a[1:]-logphi_a[:-1])*np.sign(logphi_b[:-1]-logphi_b[1:])>=0.)
    temp_log1 = np.abs( logphi_a[1:] - logphi_a[:-1] )
    temp_log2 = np.abs( logphi_b[:-1] - logphi_b[1:] )
    temp_log1[temp_log1==0] = utils.EPS # avoid log(0) below
    temp_log2[temp_log2==0] = utils.EPS
    bin_boundaries = np.log(temp_log1) - np.log(temp_log2)
    bin_boundaries = np.sort(bin_boundaries)
    return bin_boundaries




######### Bin representation code
def bin_representation_calculation(x, y, num_bins, bin_repr_scheme="sample_based", bin_boundaries=None, assigned=None, return_probs=False):
    """
    Bin representations: frequency based: num_positive_samples/num_total_samples in each bin.
        or pred_prob based: average of the sigmoid of lambda
    Function gets the bin representation which can be used during the MI maximization.

    Parameters
    ----------
    x: numpy ndarray
        logodds data which needs to be binned using bin_boundaries. Only needed if assigned not given.
    y: numpy ndarray
        Binary label for each sample
    num_bins: int
        number of bins
    bin_repr_scheme: string
        scheme to use to determine bin reprs. options: 'sample_based' and 'pred_prob_based'
    bin_boundaries: numpy array
        logodds bin boundaries. Only needed when assigned is not given.
    assigned: int numpy array
        bin id assignments for each sample
    return_probs: boolean (default: False)
        forwarded to bin_repr_unknown_LLR(): False returns the reprs in logodds space, True in probability space

    Returns
    -------
    quant_reprs: numpy array
        one representation per bin (num_bins entries)

    """
    assert (bin_boundaries is None) != (assigned is None), "Cant have or not have both arguments. Need exactly one of them."
    if assigned is None: assigned = bin_data(x, bin_boundaries)

    if bin_repr_scheme=="sample_based":
        quant_reprs = bin_repr_unknown_LLR(y, assigned, num_bins, return_probs) # frequency estimate of correct/incorrect
    elif bin_repr_scheme=="pred_prob_based":
        quant_reprs = bin_repr_unknown_LLR(utils.to_sigmoid(x), assigned, num_bins, return_probs) # softmax probability for bin reprs
    else:
        raise Exception("bin_repr_scheme=%s is not valid."%(bin_repr_scheme))
    return quant_reprs

def bin_repr_unknown_LLR(sample_weights, assigned, num_bins, return_probs=False):
    """
    Unknown Bin reprs. Will take the average of the either the pred_probs or the binary labels.
    Determines the bin reprs by taking average of sample weights in each bin.
    For example for sample-based repr: sample_weights should be 0 or 1 indicating correctly classified or not.
    or for pred-probs-based repr: sample_weights should be the softmax output probabilities.
    Handles reshaping if sample_weights or assigned has more than 1 dim.

    Parameters
    ----------
    sample_weights: numpy ndarray
        array with the weight of each sample. These weights are used to calculate the bin representation by taking the averages of samples grouped together.
    assigned: int numpy array
        array with the bin ids of each sample
    num_bins: int
        number of bins
    return_probs: boolean (default: False)
        The averages are computed in probability space. When False they are converted to logodds before being returned, when True they are returned as probabilities.

    Returns
    -------
    representations: numpy ndarray
        one representation per bin (num_bins entries); bins without samples get the overall mean of sample_weights
    """
    orig_shape = sample_weights.shape
    assert np.all(orig_shape==assigned.shape)
    assert sample_weights.max()<=1.0 and sample_weights.min()>=0.0, "make sure sample weights are probabilities"
    if len(orig_shape)>1:
        sample_weights = sample_weights.flatten()
        assigned = assigned.flatten()

    bin_sums_pos = np.bincount(assigned, weights=sample_weights, minlength=num_bins) # sum up all positive samples
    counts = np.bincount(assigned, minlength=num_bins) # sum up all samples in bin
    filt = counts>0
    prob_pos = np.ones(num_bins)*sample_weights.mean() # NOTE: important change: when no samples at all fall into any bin then default should be the prior
    prob_pos[filt] = bin_sums_pos[filt] / counts[filt] # get safe prob of pos samples over all samples
    representations = prob_pos
    if return_probs==False: representations = utils.to_logodds( representations)#NOTE: converting to logit domain again
    return representations

def bin_repr_known_LLR(bin_boundaries, prior_y_pos, distr_kde_dict):
    """
    Known Bin reprs (i.e. density based representation). Will get the bin representations based on the density estimated by KDE.
    Much slower than unknown LLR. so only used when calculating the MI.

    Parameters
    ----------
    bin_boundaries: numpy ndarray
        logodds bin boundaries. The outermost bins are closed off at -100 and 100.
    prior_y_pos: float
        p(y=1) prior
    distr_kde_dict: dict
        density estimates under the keys 'pos' and 'neg'; each must offer `integrate_box_1d(low, high)` (e.g. scipy.stats.gaussian_kde of the positive/negative logodds)

    Returns
    -------
    new_reprs: numpy ndarray
        one representation per bin (len(bin_boundaries)+1 entries) obtained from utils.safe_log_diff of the two bin masses below; non-finite and zero entries are replaced by utils.EPS
    p_ypos_given_lam: numpy ndarray
        per bin: p(y=1) times the mass of the positive density inside the bin
    p_yneg_given_lam: numpy ndarray
        per bin: p(y=0) times the mass of the negative density inside the bin
    """
    distr_pos = distr_kde_dict["pos"] # scipy.stats.gaussian_kde(logodds[y==1])
    distr_neg = distr_kde_dict["neg"] # scipy.stats.gaussian_kde(logodds[y==0])
    prior_y_neg = 1 - prior_y_pos
    new_boundaries = np.hstack([-100, bin_boundaries , 100])
    new_reprs = np.zeros(len(bin_boundaries)+1)

    p_ypos_given_lam = np.zeros( len(bin_boundaries)+1 )
    p_yneg_given_lam = np.zeros( len(bin_boundaries)+1 )
    for idx in range( len(bin_boundaries) + 1):
        numer = prior_y_pos*distr_pos.integrate_box_1d(new_boundaries[idx], new_boundaries[idx+1]) # p(lam|y=1)*p(y=1)
        denom = prior_y_neg*distr_neg.integrate_box_1d(new_boundaries[idx], new_boundaries[idx+1]) # p(lam|y=0)*p(y=0)
        new_reprs[idx] = utils.safe_log_diff(numer, denom, np.log)
        p_ypos_given_lam[idx] = numer
        p_yneg_given_lam[idx] = denom
    new_reprs[~np.isfinite(new_reprs)] = utils.EPS
    new_reprs[new_reprs==0] = utils.EPS
    return new_reprs, p_ypos_given_lam, p_yneg_given_lam






def MI_unknown_LLR(p_y_pos, logodds, bin_boundaries, representations):
    """logodds => the logodds which were used to bin. rewrote MI loss: sum_Y sum_B p(y'|lambda)p(lambda) for term outside log. Before it was p(lambda|y')p(y')

    Parameters
    ----------
    p_y_pos: float
        p(y=1) prior
    logodds: numpy ndarray
        the logodds which are binned with `bin_boundaries`
    bin_boundaries: numpy ndarray
        logodds bin boundaries
    representations: numpy ndarray
        one logodds representation per bin (len(bin_boundaries)+1 entries)

    Returns
    -------
    float
        the negated sum of the per-class, per-bin terms accumulated below
    """
    # NOTE: checked and matches impl of Dan: -1*MI_eval(**kwargs) => all good
    pred_probs = utils.to_sigmoid(logodds)
    prior_y = io.AttrDict( dict(pos=p_y_pos, neg=1-p_y_pos) )
    num_bins = len(bin_boundaries)+1
    # get p(y|lambda)p(lambda).... first get mean pred. prob. per bin
    assigned = bin_data(logodds, bin_boundaries)
    bin_sums_pred_probs_pos = np.bincount( assigned, weights=pred_probs, minlength=num_bins) # get the reprs in prob space because of mean.
    p_y_pos_given_lambda_per_bin = bin_sums_pred_probs_pos / logodds.shape[0]
    bin_sums_pred_probs_neg = np.bincount( assigned, weights=1-pred_probs, minlength=num_bins) # get the reprs in prob space because of mean.
    p_y_neg_given_lambda_per_bin = bin_sums_pred_probs_neg / logodds.shape[0]
    p_y_given_lambda_dict = io.AttrDict(dict(pos=p_y_pos_given_lambda_per_bin, neg=p_y_neg_given_lambda_per_bin))
    mi_loss = 0.0
    for binary_class_str, binary_class in zip( ["neg","pos"], [0,1] ):
        terms_in_log = ( 1 + np.exp((1-2*binary_class) * representations) ) * prior_y[binary_class_str] # part 3
        bin_summation_term = np.sum( p_y_given_lambda_dict[binary_class_str] * np.log( terms_in_log ) )
        mi_loss += bin_summation_term
    return -1*mi_loss




def MI_known_LLR(bin_boundaries, p_y_pos, distr_kde_dict):
    """
    Calculate the MI(lambda_hat, y)(using the known LLR), where lambda_hat is the quantized lambdas.
    This will compute the MI in bits (log2).
    It uses a KDE to estimate the density of the positive and negative samples.
    At the end it will perform some basic checks to see if the computations were correct.
    In addition to the MI it will compute the bit rate (R) (i.e. MI(z, lambda) where z is quantized lambda)


    Parameters
    ----------
    bin_boundaries: numpy array
        bin boundaries
    p_y_pos: float
        p(y=1) prior
    distr_kde_dict: dict
        dictionary containing the KDE objects used to estimate the density in each bin with keys 'pos' and 'neg'.

    Returns
    -------
    MI: float
        MI(z, y) where z is quantized lambda. This is the mutual information between the quantizer output to the label.
    R: float
        bin rate. This is MI(z, lambda). Mutual Information between lambda and quantized lambda.
    """
    distr_pos, distr_neg = distr_kde_dict["pos"], distr_kde_dict["neg"]
    p_y_neg = 1 - p_y_pos


    new_boundaries = np.hstack([-100, bin_boundaries, 100])
    # lists for checks afterwards
    all_vs, all_intpos, all_intneg = [], [], []
    MI, R = 0.0, 0.0
    for idx in range( len(bin_boundaries) + 1):
        integral_pos = p_y_pos*distr_pos.integrate_box_1d(new_boundaries[idx], new_boundaries[idx+1]) # p(lam|y=1)*p(y=1)
        integral_neg = p_y_neg*distr_neg.integrate_box_1d(new_boundaries[idx], new_boundaries[idx+1]) # p(lam|y=0)*p(y=0)
        # named `bin_repr` so the builtin `repr` is not shadowed
        bin_repr = utils.safe_log_diff(integral_pos, integral_neg, np.log)

        p_ypos_given_z = max( utils.EPS, utils.to_sigmoid( bin_repr) )
        p_yneg_given_z = max( utils.EPS, utils.to_sigmoid(-1*bin_repr) )

        curr_MI_pos = integral_pos * ( utils.safe_log_diff( p_ypos_given_z, p_y_pos, np.log2 ) )
        curr_MI_neg = integral_neg * ( utils.safe_log_diff( p_yneg_given_z, p_y_neg, np.log2 ) )
        MI += curr_MI_pos + curr_MI_neg

        v = max( utils.EPS, (integral_pos + integral_neg) )
        curr_R = -1 * v * np.log2(v) # entropy of p(z) = p(z|y=1)p(y=1) + p(z|y=0)p(y=0)
        R += curr_R
        # gather for checks
        all_vs.append(v)
        all_intpos.append(integral_pos); all_intneg.append(integral_neg)
    # sanity checks: the bin masses have to add up to the total mass / the priors
    np.testing.assert_almost_equal( np.sum(all_vs), 1.0 , decimal=1)
    np.testing.assert_almost_equal( np.sum(all_intpos), p_y_pos, decimal=1)
    np.testing.assert_almost_equal( np.sum(all_intneg), p_y_neg, decimal=1)
    return MI, R


def MI_upper_bounds(p_y_pos, distr_kde_dict):
    """
    Calculate the MI upper bound of MI(z, y) <= MI(lambda, y). As z is the quantized version of lambda, MI(z, y) is upper bounded by MI(lambda, y).
    This is a tighter bound than H(y). This function will return both upper bounds.

    Bound 1: MI(z, y) <= H(y) - H(y|z) <= H(y)
    Bound 2: MI(z, y) <= MI(lambda, y)

    Parameters
    ----------
    p_y_pos: float
        p(y=1) prior
    distr_kde_dict: dict
        dictionary containing the KDE objects used to estimate the density in each bin with keys 'pos' and 'neg'.

    Returns
    -------
    H_y: float
        Loose upper bound which is H(y)
    MI_y_lambda: float
        Upper bound of MI(z, y) which is upper bounded by MI(lambda, y). Tighter bound than H(y)

    """
    # NOTE: the timer is reached through the `time` module that imax_calib.io imports
    tic = io.time.time()
    p_y_neg = 1 - p_y_pos

    # Bound 1
    H_y = -1*p_y_pos*np.log2(p_y_pos) + -1*p_y_neg*np.log2(p_y_neg)

    # Bound 2
    distr_pos, distr_neg = distr_kde_dict["pos"], distr_kde_dict["neg"]
    def get_logodd_lambda(lam):
        log_term_1 = p_y_pos * distr_pos.pdf(lam)
        log_term_2 = p_y_neg * distr_neg.pdf(lam)
        logodd_lambda = utils.safe_log_diff(log_term_1, log_term_2, np.log)
        return logodd_lambda

    def integral_pos(lam):
        logodd_lambda = get_logodd_lambda(lam)
        p_ypos_lambda = utils.to_sigmoid( logodd_lambda )
        return p_y_pos * distr_pos.pdf(lam) * utils.safe_log_diff( p_ypos_lambda, p_y_pos, np.log2) #np.log2( p_ypos_lambda / p_y_pos )

    def integral_neg(lam):
        logodd_lambda = get_logodd_lambda(lam)
        p_yneg_lambda = utils.to_sigmoid( -1* logodd_lambda )
        return p_y_neg * distr_neg.pdf(lam) * utils.safe_log_diff(p_yneg_lambda, p_y_neg, np.log2) #np.log2( p_yneg_lambda / p_y_neg )

    # numerical integration over the logodds range [-100, 100]
    term_pos = integrate.quad(integral_pos, -100, 100, limit=100)[0]
    term_neg = integrate.quad(integral_neg, -100, 100, limit=100)[0]
    MI_y_lambda = term_pos + term_neg


    toc = io.time.time()
    print("Time elapsed: upper bound computation: ", (toc-tic), " seconds!")
    return H_y, MI_y_lambda


# --------------------------------------------------------------------------------
# /imax_calib/io.py:
# --------------------------------------------------------------------------------
#!/usr/local/bin/python3
# Copyright (c) 2021 Robert Bosch GmbH Copyright holder of the paper "Multi-Class Uncertainty Calibration via Mutual Information Maximization-based Binning" accepted at ICLR 2021.
# All rights reserved.
##
# The paper "Multi-Class Uncertainty Calibration via Mutual Information Maximization-based Binning" accepted at ICLR 2021.
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see .
#
# Author: Kanil Patel
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 20 10:03:33 2018

@author: pak2rng
"""
import os
import numpy as np
from attrdict import AttrDict
import deepdish
import time

def create_dir_for_fpath(fpath):
    ''' Create the parent directory of `fpath` (including missing intermediate ones) if it does not exist yet. '''
    dirpath = os.path.dirname(fpath)
    if dirpath: # a bare file name has no directory part; nothing to create then
        os.makedirs(dirpath, exist_ok=True)

def deepdish_read(fpath, group=None):
    ''' Read all data inside the hdf5 file. A loaded dict is wrapped into an AttrDict, anything else is returned as loaded. '''
    data = deepdish.io.load(fpath, group=group)
    if isinstance(data, dict):
        data = AttrDict(data)
    return data

def deepdish_write(fpath, data):
    ''' Save a dictionary as a hdf5 file! The target directory is created first when needed. '''
    # `create_dir_for_fpath` was called here without being defined or imported in this module
    create_dir_for_fpath(fpath)
    deepdish.io.save(fpath, data, compression="None")




class Logger:
    ''' Collects values per key in lists and can write the collected data to `fpath` as a hdf5 file. '''

    def __init__(self, fpath):
        # fpath: file the log is written to by save_log()
        self.fpath = fpath
        # logdata: maps each key to the list of values logged for it
        self.logdata = AttrDict({})

    def log(self, key, value):
        ''' Append `value` to the list stored under `key`, creating the list on first use. '''
        if key not in self.logdata: self.logdata[key] = []
        self.logdata[key].append(value)

    def last(self, key):
        ''' Return the most recently logged value for `key`. '''
        return self.logdata[key][-1]

    def log_dict(self, dictionary, suffix=""):
        ''' Log every item of `dictionary` under `key + suffix`. A non-empty suffix gets a leading "_" unless it already starts with one. '''
        # logging each element in the dictionary
        suffix = "_%s"%(suffix) if (suffix != "" and suffix[0]!="_") else suffix
        for k,v in dictionary.items():
            self.log(k+suffix,v)


    def end_log(self):
        ''' Convert every logged list into a numpy array; other values are left untouched. '''
        for k,v in self.logdata.items():
            self.logdata[k] = np.array(v) if isinstance(v, list) else v

    def save_log(self):
        ''' Write the collected log data to `self.fpath` via deepdish_write(). '''
        deepdish_write(self.fpath, self.logdata)


# --------------------------------------------------------------------------------
# /imax_calib/utils.py:
# --------------------------------------------------------------------------------
#!/usr/local/bin/python3
# Copyright (c) 2021 Robert Bosch GmbH Copyright holder of the paper "Multi-Class Uncertainty Calibration via Mutual Information Maximization-based Binning" accepted at ICLR 2021.
# All rights reserved.
###
# The paper "Multi-Class Uncertainty Calibration via Mutual Information Maximization-based Binning" accepted at ICLR 2021.
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
# Author: Kanil Patel
# -*- coding: utf-8 -*-
'''
utils.py
imax_calib

Created by Kanil Patel on 07/06/20.
Copyright 2020. Kanil Patel. All rights reserved.
'''
import numpy as np
import sklearn.model_selection

#EPS = np.finfo(float).eps # used to avoid division by zero
EPS = 1e-50


def is_sorted(a):
    """
    Check whether a 1D array is in non-decreasing order.

    Parameters
    ----------
    a : numpy 1D-array
        Values to check.

    Returns
    -------
    bool
        False as soon as some element is strictly smaller than its
        predecessor, True otherwise (also for empty and single-element input).
    """
    # Same predicate as comparing every neighbouring pair in a Python loop,
    # but evaluated in one vectorised pass.
    return not np.any(a[1:] < a[:-1])


def to_softmax(x, axis=-1):
    """
    Stable softmax in numpy. Will be applied across last dimension by default.
    Takes care of numerical instabilities like division by zero or log(0).

    Parameters
    ----------
    x : numpy ndarray
        Logits of the network as numpy array.
    axis: int
        Dimension along which to apply the operation (default: last one)

    Returns
    -------
    softmax: numpy ndarray
        Softmax output
    """
    # Subtracting the per-row maximum keeps exp() from overflowing.
    z = x - np.max(x, axis=axis, keepdims=True)
    numerator = np.exp(z)
    denominator = np.sum(numerator, axis=axis, keepdims=True)
    softmax = numerator / denominator
    assert np.all(np.isfinite(softmax)), "Softmax output contains NaNs. Handle this."
    return softmax


def to_sigmoid(x):
    """
    Stable sigmoid in numpy. Uses tanh for a more stable sigmoid function.

    Parameters
    ----------
    x : numpy ndarray
        Logits of the network as numpy array.

    Returns
    -------
    sigmoid: numpy ndarray
        Sigmoid output
    """
    sigmoid = 0.5 + 0.5 * np.tanh(x/2)
    assert np.all(np.isfinite(sigmoid)), "Sigmoid output contains NaNs. Handle this."
    return sigmoid


def to_logodds(x):
    """

    Convert probabilities to logodds using:

    .. math::
        \\log\\frac{p}{1-p} ~ \\text{where} ~ p \\in [0,1]

    Natural log.

    Parameters
    ----------
    x : numpy ndarray
        Class probabilities as numpy array. Values are clipped to
        [1e-10, 1 - 1e-10] before the conversion.

    Returns
    -------
    logodds : numpy ndarray
        Logodds output

    """
    x = np.clip(x, 1e-10, 1 - 1e-10)
    # After clipping this can only fail when x contains NaNs.
    assert x.max() <= 1 and x.min() >= 0
    numerator = x
    denominator = 1-x
    logodds = safe_log_diff(numerator, denominator, np.log)  # log(numerator/denominator)
    assert np.all(np.isfinite(logodds)), "Logodds output contains NaNs. Handle this."
    return logodds


def safe_log_diff(A, B, log_func=np.log):
    """
    Numerically stable log difference function. Avoids log(0). Will compute log(A/B) safely where the log is determined by the log_func

    Parameters
    ----------
    A, B : scalar or numpy ndarray
        Numerator and denominator. The scalar branch is selected by
        ``np.isscalar(A)``.
    log_func : callable
        Logarithm to use (default: natural log).

    Returns
    -------
    scalar or numpy ndarray
        log_func(A) - log_func(B), with zeros replaced by EPS. In the array
        branch every entry where A or B is zero is set to log_func(EPS).
    """
    if np.isscalar(A):
        if A==0 and B==0:
            return log_func(EPS)
        elif A==0:
            return log_func( EPS ) - log_func(B)
        elif B==0:
            return log_func( A ) - log_func( EPS )
        else:
            return log_func(A) - log_func(B)
    else:
        # log(A) - log(B); np.where evaluates log_func on the zeros as well,
        # hence the suppressed divide warning.
        with np.errstate(divide='ignore'):
            output = np.where(A==0, log_func(EPS), log_func(A) ) - np.where(B==0, log_func(EPS), log_func(B))
        output[ np.logical_or(A==0, B==0)] = log_func(EPS)
        assert np.all(np.isfinite(output))
        return output


def quick_logits_to_logodds(logits, probs=None):
    """
    Using the log-sum-exp trick can be slow to convert from logits to logodds. This function will use the faster prob_to_logodds if n_classes is large.

    Parameters
    ----------
    logits : numpy ndarray
        Network logits; the last axis holds the classes.
    probs : numpy ndarray, optional
        Softmax of ``logits``; computed on demand when needed and not given.

    Returns
    -------
    logodds : numpy ndarray
        Logodds output
    """
    n_classes = logits.shape[-1]
    if n_classes <=100:  # few enough classes: use the slow but direct conversion
        logodds = logits_to_logodds(logits)
    else:  # imagenet case will always come here!
        if probs is None: probs = to_softmax(logits)
        logodds = probs_to_logodds(probs)
    return logodds


def probs_to_logodds(x):
    """
    Use probabilities to convert to logodds. Faster than logits_to_logodds.
    """
    assert x.max() <= 1 and x.min() >= 0
    logodds = to_logodds(x)
    assert np.all(np.isfinite(logodds))
    return logodds


def logits_to_logodds(x):
    """
    Convert network logits directly to logodds (without conversion to probabilities and then back to logodds) using:

    .. math::
        \\lambda_k=z_k-\\log\\sum\\nolimits_{k'\\not = k}e^{z_{k'}}

    Parameters
    ----------
    x: numpy ndarray
        Network logits as numpy array; the last axis holds the classes.

    Returns
    -------
    logodds : numpy ndarray
        Logodds output of shape (n_samples, n_classes), where leading axes
        of ``x`` are flattened into n_samples.
    """
    # Classes are indexed along the last axis below, so read the class count
    # from the same axis (identical to shape[1] for 2D input).
    n_classes = x.shape[-1]
    all_logodds = []
    for class_id in range(n_classes):
        logodds_c = x[...,class_id][..., np.newaxis] - custom_logsumexp( np.delete(x, class_id, axis=-1) , axis=-1)
        all_logodds.append(logodds_c.reshape(-1))
    logodds = np.stack( all_logodds, axis=1 )
    assert np.all(np.isfinite(logodds))
    return logodds


def custom_logsumexp(x, axis=-1):
    """
    Uses the log-sum-exp trick.

    Parameters
    ----------
    x: numpy ndarray
        Network logits as numpy array

    axis: int (default -1)
        axis along which to take the sum

    Returns
    -------
    out: numpy ndarray
        log-sum-exp of x along some axis (the reduced axis is kept with size 1)
    """
    x_max = np.amax(x, axis=axis, keepdims=True)
    # A non-finite maximum would poison the subtraction below.
    x_max[~np.isfinite(x_max)] = 0
    tmp = np.exp(x - x_max)
    s = np.sum(tmp, axis=axis, keepdims=True)
    s[s<=0] = EPS  # only add epsilon when argument is zero
    out = np.log(s)
    out += x_max
    return out


def to_onehot(y, num_classes):
    """
    Convert 1D targets to one-hot repr.

    Parameters
    ----------
    y: numpy 1D-array
        Array with sample target ids (i.e. 0 to num_classes-1)
    num_classes: int
        Number of classes

    Returns
    -------
    y_onehot: numpy ndarray
        One-hot representation of target labels
    """
    assert len(y.shape)==1
    y_onehot = np.eye(num_classes)[y]
    return y_onehot


def binary_convertor(logodds, y, cal_setting, class_idx):
    """
    Function to convert the logodds data (in multi-class setting) to binary setting. The following options are available:
    1) CW - slice out some class: cal_setting="CW" and class_idx is not None (int)
    2) top1 - max class for each sample: get the top1 prediction: cal_setting="top1" and class_idx is None
    3) sCW - merge marginal setting where data is combined: cal_setting="sCW" and class_idx is None

    ``y`` may be None, in which case the returned targets are None as well.

    Raises
    ------
    ValueError
        If ``cal_setting`` is none of the options above.
    """

    if cal_setting=="CW":
        assert class_idx is not None, "class_idx needs to be an integer to slice out class needed for CW calibration setting"
        logodds_c = logodds[..., class_idx]
        y_c = y[..., class_idx] if y is not None else None
    elif cal_setting=="top1":
        assert class_idx is None, "class_idx needs to be None - check"
        top1_indices = logodds.argmax(axis=-1)
        logodds_c = logodds[np.arange(top1_indices.shape[0]), top1_indices]
        y_c = y.argmax(axis=-1) == top1_indices if y is not None else None
    elif cal_setting=="sCW":
        assert class_idx is None, "class_idx needs to be None - check"
        logodds_c = np.concatenate(logodds.T)
        y_c = np.concatenate(y.T) if y is not None else None
    else:
        raise ValueError("Calibration setting (%s) not recognized!"%(cal_setting))

    return logodds_c, y_c
# --------------------------------------------------------------------------------
# /metrics.py:
# --------------------------------------------------------------------------------
import torch
import numpy as np
from torch import nn
from imax_calib.evaluations import calibration_metrics as cal_metrics  # Imax paper
import calibration as cal  # Kumar et al, Verified Uncertainty Calibration

# Implements various metrics.


def get_acc(y_pred, y_true):
    """ Computes the accuracy of predictions.
    If y_pred is 2D, it is assumed that it is a matrix of scores (e.g. probabilities) of shape (n_samples, n_classes)

    Raises:
        ValueError: if y_pred is neither 1D nor 2D.
    """
    if y_pred.ndim == 1:
        return np.mean(y_pred == y_true)
    elif y_pred.ndim == 2:
        # Compare the arg-max class with the labels; the labels were
        # previously passed as the `axis` argument of np.mean by mistake.
        return np.mean(np.argmax(y_pred, axis=1) == y_true)
    raise ValueError(f"y_pred must be 1D or 2D, got ndim={y_pred.ndim}")


def _ece_approximator(mode):
    """ Maps the binning scheme ('mass' or 'width') to the approximator name used by imax_calib.
    """
    if mode == 'mass':
        return 'mECE'
    elif mode == 'width':
        return 'dECE'
    raise ValueError(f"mode must be 'mass' or 'width', got {mode!r}")


def get_cw_ECE(probs, y_true, mode='mass', threshold_mode='class', num_bins=15):
    """ Estimates the class-wise ECE by binning.

    Args:
        probs: shape (n_samples, n_classes)
        y_true: shape (n_samples, )
        mode: Either 'mass' or 'width' -- determines binning scheme
        threshold_mode: Either 'class' or None -- determines if thresholding is used in estimation
        num_bins: Number of bins used in estimation

    Raises:
        ValueError: if mode is not 'mass' or 'width'.
    """
    _mode = _ece_approximator(mode)

    evals = cal_metrics.compute_top_1_and_CW_ECEs(probs, y_true, list_approximators=[_mode],
                                                  num_bins=num_bins, threshold_mode=threshold_mode)
    return evals[f'cw_{_mode}']


def get_ECE(probs, y_true, mode='mass', num_bins=15):
    """ Estimates the top-label ECE by binning.

    Args:
        probs: shape (n_samples, n_classes)
        y_true: shape (n_samples, )
        mode: Either 'mass' or 'width' -- determines binning scheme
        num_bins: Number of bins used in estimation

    Raises:
        ValueError: if mode is not 'mass' or 'width'.
    """
    _mode = _ece_approximator(mode)

    evals = cal_metrics.compute_top_1_and_CW_ECEs(probs, y_true, list_approximators=[_mode], num_bins=num_bins)
    return evals[f'top_1_{_mode}']


def get_MCE(probs, y_true):
    """ Estimates the marginal (class-wise) calibration error with p=1 and no debiasing,
    using the estimator of Kumar et al. Not recommended for use.
    """
    return cal.get_calibration_error(probs, y_true,
                                     p=1, debias=False, mode='marginal')


def get_NLL(probs, y_true):
    """ Computes the negative log likelihood.

    Probabilities are clipped to [1e-100, 1] before taking the log so that zeros do not yield -inf.
    NOTE(review): y_true is presumably an integer array of class indices, as nn.NLLLoss expects -- confirm.
    """
    nll = nn.NLLLoss()
    _probs = np.clip(probs, 1e-100, 1)
    logprobs = torch.from_numpy(np.log(_probs))
    return nll(logprobs, torch.from_numpy(y_true)).item()


# --------------------------------------------------------------------------------
# /utils.py:
# --------------------------------------------------------------------------------
import numpy as np
from torch import nn
from sklearn.metrics import confusion_matrix

# This file implements various utility functions.


def _shuffled_human_votes(human_counts, seed):
    """ Expands per-class vote counts into a shuffled list of votes per row.

    Every row is truncated to the smallest total number of votes over all rows,
    so the result has shape (n_rows, min_human_labels).
    """
    rng = np.random.default_rng(seed)

    human_labels_per_input = np.sum(human_counts, axis=1)
    min_human_labels = int(min(human_labels_per_input))
    n_rows = human_counts.shape[0]
    n_classes = human_counts.shape[1]

    human_labels = np.empty(shape=(n_rows, min_human_labels))
    for row in range(n_rows):
        votes = []
        for i in range(n_classes):
            votes += [i] * int(human_counts[row, i])
        # Shuffle a plain list, one row at a time, so results for a given seed stay reproducible.
        rng.shuffle(votes)
        human_labels[row, :] = votes[:min_human_labels]

    return human_labels


def get_human_labels_outcomes(human_counts, true_labels, seed=0):
    """ Converts from the counts to an ordered list of votes. Also computes the 0/1 Bernoulli outcomes.

    Args:
        human_counts: shape (n_rows, n_classes), number of votes per class
        true_labels: shape (n_rows, )
        seed: seed of the random generator used to order the votes

    Returns:
        human_labels: shape (n_rows, min_human_labels), the shuffled votes
        human_outcomes: same shape, 1 where the vote equals the true label and 0 otherwise
    """
    human_labels = _shuffled_human_votes(human_counts, seed)
    n_rows = human_counts.shape[0]

    # Broadcast each row's true label against that row's votes.
    labels = np.asarray(true_labels)[:n_rows].reshape(-1, 1)
    human_outcomes = (human_labels == labels).astype(human_labels.dtype)

    return human_labels, human_outcomes


def simulate_single_human(human_counts, seed=0):
    """ Simulates one human labeler by taking the first shuffled vote of every row.

    Args:
        human_counts: shape (n_rows, n_classes), number of votes per class
        seed: seed of the random generator used to order the votes

    Returns:
        Integer array of shape (n_rows, )
    """
    human_labels = _shuffled_human_votes(human_counts, seed)
    return human_labels[:, 0].astype(int)


class SoftLogLoss(nn.Module):
    # Implements the "soft-log-loss" for use with the EM algorithm
    def __init__(self):
        super(SoftLogLoss, self).__init__()

    def forward(self, input, target):
        # input is tensor of model logits (n_samples, n_cls)
        # target is tensor of weight matrix (n_samples, n_cls)
        # c.f. https://github.com/pytorch/pytorch/issues/7455
        log_probs = nn.functional.log_softmax(input, dim=-1)
        # Weighted negative log-probability, summed over classes and averaged over samples.
        loss = -1. * (log_probs * target).sum(dim=-1).mean()
        return loss


def _check_ratio_mode(mode):
    """ Rejects denominators other than 'max' and 'diff'.
    """
    if mode not in ('max', 'diff'):
        raise ValueError(f"mode must be 'max' or 'diff', got {mode!r}")


def get_model_confidence_ratio(model_probs, y_true, h=None, y=None, y_h=None, mode='diff'):
    """ Computes, per sample, the model's confidence in the true class divided by a denominator.

    Args:
        model_probs: shape (n_samples, n_classes)
        y_true: shape (n_samples, ), integer class labels
        h / y: condition on Y = y and/or h(X) = h
        y_h: shape (n_samples, ), human labels; needed when h is given
        mode: 'max' or 'diff' -- determines denominator.
            'max' uses the largest confidence among the other classes, 'diff' uses 1 - true class confidence.

    Returns:
        Array with one ratio per selected sample.

    Raises:
        ValueError: if mode is not 'max' or 'diff'.
    """
    _check_ratio_mode(mode)

    if (h is None) and (y is None):  # Unconditional
        idxs = np.ones(y_true.size, dtype=bool)
    elif h is None:  # Distribution conditioned on Y only
        idxs = (y_true == y)
    elif y is None:  # Distribution conditioned on h only
        idxs = (y_h == h)
    else:  # Distribution conditioned on y and h
        idxs = np.logical_and((y_true == y), (y_h == h))

    eps = 1e-16
    model_probs = model_probs.clip(eps, 1. - eps)

    _model_probs = model_probs[idxs]
    _y_true = y_true[idxs]
    rows = np.arange(_y_true.size)

    # Index with the *filtered* labels so that they line up with the filtered rows.
    true_class_conf = _model_probs[rows, _y_true]
    if mode == 'max':
        others = _model_probs.copy()
        others[rows, _y_true] = -np.inf  # exclude the true class from the max
        denom = others.max(axis=1)
    else:
        denom = 1. - true_class_conf

    return true_class_conf / denom


def get_human_confidence_ratio(y_h_tr, y_true_tr, y_h_te, y_true_te, n_cls, h=None, y=None, mode='diff'):
    """ Computes confidence ratios for the human labeler from an estimated confusion matrix.

    The confusion matrix P(h = i | Y = j) is estimated on (y_h_tr, y_true_tr) and evaluated
    on the test samples (y_h_te, y_true_te).

    Args:
        y_h_tr / y_true_tr: human and true labels used to estimate the confusion matrix
        y_h_te / y_true_te: human and true labels the ratios are computed for
        n_cls: number of classes
        h / y: condition on h(X) = h and/or Y = y
        mode: 'max' or 'diff' -- determines denominator (ignored when both h and y are given)

    Returns:
        A scalar when both h and y are given, otherwise an array with one ratio per selected sample.

    Raises:
        ValueError: if mode is used and is not 'max' or 'diff'.
    """
    # Estimate human confusion matrix
    # Entry [i, j] is #(Y = i and h = j)
    conf_h = 1. * confusion_matrix(y_true_tr, y_h_tr, labels=np.arange(n_cls))
    # Swap so entry [i, j] is #(h = i and Y = j)
    conf_h = conf_h.T
    eps = 1e-50
    conf_h = np.clip(conf_h, eps, None)
    normalizer = np.sum(conf_h, axis=0, keepdims=True)
    # Normalize columns so entry [i, j] is P(h = i | Y = j)
    conf_h /= normalizer

    if (h is None) and (y is None):  # Unconditional
        idxs = np.ones(y_true_te.size, dtype=bool)
    elif h is None:  # Distribution conditioned on Y only
        idxs = (y_true_te == y)
    elif y is None:  # Distribution conditioned on h only
        idxs = (y_h_te == h)
    else:  # Distribution conditioned on y and h
        return conf_h[h, y] / (1. - conf_h[h, y])

    _check_ratio_mode(mode)

    # Filter the human labels with the same mask as the true labels; using the
    # unfiltered human labels paired samples that do not belong together.
    _y_true = y_true_te[idxs]
    _y_h = y_h_te[idxs]
    rows = np.arange(_y_true.size)

    true_class_conf = conf_h[_y_h, _y_true]
    if mode == 'max':
        others = conf_h[_y_h, :]  # fancy indexing returns a copy
        others[rows, _y_true] = -np.inf  # exclude the true class from the max
        denom = others.max(axis=1)
    else:
        denom = 1. - true_class_conf

    return true_class_conf / denom


def get_dirichlet_params(acc, strength, n_cls):
    """ Computes the parameters of a Dirichlet prior over one row of a confusion matrix.

    Args:
        acc: desired prior mass on the correct class. Before scaling and the +1 offset,
            alpha / (alpha + (n_cls - 1) * beta) equals acc. Must be < 1.
        strength: strength of prior (multiplies both parameters)
        n_cls: number of classes

    Returns:
        alpha, beta where the prior is Dir((beta, beta, . . . , alpha, . . . beta))
        and the alpha appears for the correct class.
    """
    beta = 0.1
    alpha = beta * (n_cls - 1) * acc / (1. - acc)

    alpha *= strength
    beta *= strength

    alpha += 1
    beta += 1

    return alpha, beta


def diversity(y1, y2, y_t):
    """ Computes how often two predictors are jointly right or wrong.

    Args:
        y1 / y2: predictions of the two predictors, shape (n_samples, )
        y_t: true labels, shape (n_samples, )

    Returns:
        Fractions of samples where (both correct, both incorrect, only y1 correct, only y2 correct).
    """
    y1_correct = (y1 == y_t)
    y2_correct = (y2 == y_t)

    n = y_t.size
    both_correct = np.sum(y1_correct & y2_correct) / n
    both_incorrect = np.sum(~y1_correct & ~y2_correct) / n
    y1c_y2w = np.sum(y1_correct & ~y2_correct) / n
    y1w_y2c = np.sum(~y1_correct & y2_correct) / n

    return both_correct, both_incorrect, y1c_y2w, y1w_y2c
# --------------------------------------------------------------------------------