├── .gitignore ├── README.md ├── clm ├── __init__.py ├── components.py ├── scaling.py ├── scoring.py ├── uncertainty.py └── utils.py ├── notebooks ├── C_inner-dm.py ├── C_inner.py ├── C_outer-dm.py ├── C_outer.py ├── plot-components.ipynb ├── plot.ipynb ├── qualitative.py └── run_one_trial.py ├── scripts ├── cnndm_components.py ├── cnndm_data.py ├── cnndm_intra_rouge.py ├── cxr_components.py ├── cxr_data.py ├── run_cnndm.sh ├── run_cnndm_components.sh ├── run_component_trials.py ├── run_cxr.sh ├── run_cxr_components.sh ├── run_example.py ├── run_many_components.py ├── run_trials.py ├── run_triviaqa.sh └── triviaqa_data.py └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | clm.egg-info 3 | data/ 4 | .ipynb_checkpoints 5 | data 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Conformal Language Modeling 2 | 3 | Code for [Conformal Language Modeling](https://arxiv.org/abs/2306.10193) for details 4 | 5 | ## Abstract 6 | 7 | In this paper, we propose a novel approach to conformal prediction for language models (LMs) in which we produce prediction sets with performance guarantees. LM responses are typically sampled from a predicted distribution over the large, combinatorial output space of language. Translating this to conformal prediction, we calibrate a stopping rule for sampling LM outputs that get added to a growing set of candidates until we are confident that the set covers at least one acceptable response. Since some samples may be low-quality, we also simultaneously calibrate a rejection rule for removing candidates from the output set to reduce noise. Similar to conformal prediction, we can prove that the final output set obeys certain desirable distribution-free guarantees. Within these sets of candidate responses, we also show that we can also identify subsets of individual components---such as phrases or sentences---that are each independently correct (e.g., that are not "hallucinations"), again with guarantees. Our method can be applied to any LM API that supports sampling. Furthermore, we empirically demonstrate that we can achieve many desired coverage levels within a limited number of total samples when applying our method to multiple tasks in open-domain question answering, text summarization, and radiology report generation using different LM variants. 8 | 9 | ## Data 10 | 11 | Also see our [auxiliary repo](https://github.com/Varal7/clm_aux) for data prepocessing. 12 | 13 | ## Citation 14 | 15 | If you use this in your work please cite: 16 | 17 | ``` 18 | @misc{quach2023conformal, 19 | title={Conformal Language Modeling}, 20 | author={Victor Quach and Adam Fisch and Tal Schuster and Adam Yala and Jae Ho Sohn and Tommi S. 
Jaakkola and Regina Barzilay}, 21 | year={2023}, 22 | eprint={2306.10193}, 23 | archivePrefix={arXiv}, 24 | primaryClass={cs.CL} 25 | } 26 | ``` 27 | -------------------------------------------------------------------------------- /clm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Varal7/conformal-language-modeling/13fd7ab8bf18c9748535280d590d1f5707faeb76/clm/__init__.py -------------------------------------------------------------------------------- /clm/components.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import collections 3 | import tqdm 4 | 5 | from clm import scaling 6 | from clm import utils 7 | from scipy.stats import binom 8 | 9 | # Default risk levels epsilon to calibrate. 10 | DEFAULT_EPSILONS = np.linspace(0, 1, 51) 11 | 12 | NAME_TO_SCALER = { 13 | 'platt': scaling.PlattScaler, 14 | 'bin': scaling.BinningScaler, 15 | 'platt_bin': scaling.PlattBinningScaler, 16 | 'none': scaling.BaseScaler, 17 | } 18 | 19 | 20 | def get_preds(scores, row_generation_idx_to_row_idx, K): 21 | """ 22 | row_generation_idx_to_row_idx: list of list of list of int 23 | The first list is for each example 24 | The second list is for each generation 25 | The third list is for each sentence in the generation 26 | The int is the index of the sentence in the flattened list 27 | 28 | returns: np.array of shape (num_examples, max_num_sentences) 29 | """ 30 | N = len(row_generation_idx_to_row_idx) 31 | g = len(row_generation_idx_to_row_idx[0]) 32 | 33 | preds = -np.ones((N, K)) 34 | 35 | for i in range(N): 36 | for j in range(g): 37 | for k in range(len(row_generation_idx_to_row_idx[i][j])): 38 | idx = row_generation_idx_to_row_idx[i][j][k] 39 | preds[i, idx] = scores[i][j][k] 40 | 41 | return preds 42 | 43 | def get_random_preds(row_reference_idx_to_row_idx, K): 44 | """ 45 | row_reference_idx_to_row_idx: list of list of int 46 | The first list is for each example 47 | The second list is for each sentence in the reference 48 | The int is the index of the sentence in the flattened list 49 | 50 | returns: np.array of shape (num_examples, max_num_sentence_refs) 51 | """ 52 | N = len(row_reference_idx_to_row_idx) 53 | g = len(row_reference_idx_to_row_idx[0]) 54 | 55 | preds = -np.ones((N, K)) 56 | 57 | for i in range(N): 58 | for j in range(g): 59 | for k in range(len(row_reference_idx_to_row_idx[i][j])): 60 | idx = row_reference_idx_to_row_idx[i][j][k] 61 | preds[i, idx] = np.random.rand() 62 | 63 | return preds 64 | 65 | def get_first_k_preds(row_reference_idx_to_row_idx, K, max_num_sentences=50): 66 | N = len(row_reference_idx_to_row_idx) 67 | g = len(row_reference_idx_to_row_idx[0]) 68 | 69 | preds = -np.ones((N, K)) 70 | 71 | for i in range(N): 72 | for j in range(g): 73 | for k in range(min(K, len(row_reference_idx_to_row_idx[i][j]))): 74 | idx = row_reference_idx_to_row_idx[i][j][k] 75 | pred = max(1.0 - (k / max_num_sentences), 0.0) 76 | preds[i, idx] = max(pred, preds[i, idx]) 77 | 78 | return preds 79 | 80 | def get_rouge_with_refs(row_rouge_scores, row_reference_idx_to_row_idx): 81 | """ 82 | row_rouge_scores: list of np.array of shape (num_sentences, num_sentences) 83 | The first list is for each example 84 | The np.array is the rouge score matrix of each sentence pair for that row 85 | row_reference_idx_to_row_idx: list of list of int 86 | The first list is for each example 87 | The second list is for each sentence in the reference 88 | The int is the 
index of the sentence in the flattened list 89 | 90 | returns: np.array of shape (num_examples, max_num_sentence_refs, max_num_sentences) 91 | """ 92 | N = len(row_rouge_scores) 93 | R = max(len(row_reference_idx_to_row_idx[i]) for i in range(N)) 94 | K = max(len(row_rouge_scores[i]) for i in range(N)) 95 | 96 | rouge_refs = -np.ones((N, R, K)) 97 | 98 | 99 | for i in range(N): 100 | arr = np.array(row_rouge_scores[i]) 101 | refs = row_reference_idx_to_row_idx[i] 102 | rouge_refs[i, :len(refs), :len(arr)] = arr[refs, :] 103 | 104 | return rouge_refs 105 | 106 | def get_rouge_label_for_C_inner(rouge_with_refs): 107 | """ 108 | rouge_with_refs: np.array of shape (num_examples, max_num_sentence_refs, max_num_sentences) 109 | 110 | returns: np.array of shape (num_examples, max_num_sentences) 111 | """ 112 | return rouge_with_refs.max(axis=1) 113 | 114 | 115 | def get_oracle_size_for_C_inner(rouge_with_refs): 116 | """ 117 | rouge_with_refs: np.array of shape (num_examples, max_num_sentence_refs, max_num_sentences) 118 | 119 | returns: np.array of shape (num_examples) 120 | """ 121 | return (rouge_with_refs > 0).any(axis=2).sum(axis=1) 122 | 123 | 124 | def compute_values_for_C_inner_basic(preds, rouge_label, mask, oracle_size, rouge_threshold=0.7, taus=None, num_quantiles=1000, subset=None): 125 | """ 126 | preds: np.array of shape (num_examples, max_num_sentences) 127 | rouge_label: np.array of shape (num_examples, max_num_sentences) 128 | mask: np.array of shape (num_examples, max_num_sentences) 129 | mask = 0 if the sentence is padding 130 | oracle_size: np.array of shape (num_examples) denoting the 131 | number of sentences in the reference 132 | 133 | 134 | returns: np.array of shape (num_examples, max_num_sentences) 135 | """ 136 | if taus is None: 137 | quantiles = np.linspace(0, 1, num_quantiles) 138 | taus = np.unique(np.quantile(preds.reshape(-1), quantiles)) 139 | taus = np.concatenate([[-np.inf], taus, [np.inf]]) 140 | taus = np.flip(taus) 141 | 142 | if subset is not None: 143 | preds = preds[subset] 144 | rouge_label = rouge_label[subset] 145 | mask = mask[subset] 146 | oracle_size = oracle_size[subset] 147 | 148 | has_refs = oracle_size > 0 149 | 150 | # Discard examples without references 151 | preds = preds[has_refs] 152 | rouge_label = rouge_label[has_refs] 153 | mask = mask[has_refs] 154 | 155 | oracle_size = oracle_size[has_refs].reshape(1, -1) # 1, num_examples 156 | 157 | accepted = rouge_label > rouge_threshold # num_examples x num_sentences 158 | accepted = np.expand_dims(accepted, axis=0) # 1 x num_examples x num_sentences 159 | 160 | chosen = (preds > taus.reshape(-1, 1, 1)) # num_taus x num_examples x num_sentences 161 | 162 | # Ok if [chosen implies accepted] 163 | ok = (~chosen | accepted).all(axis=-1) # num_taus x num_examples 164 | C_size = chosen.sum(axis=-1) # num_taus x num_examples 165 | 166 | # Additional metrics 167 | # Fake recall is counting the number of chosen sentences that are accepted and dividing by size of the reference 168 | # Recall is counting the number of reference sentences that have a match and dividing by the size of the reference 169 | 170 | C_relative_size = C_size / oracle_size 171 | # fake_recall = (chosen & accepted).sum(axis=-1) / oracle_size 172 | # intersect = np.expand_dims(accepted, axis=0) & np.expand_dims(chosen, axis=2) # taus x num_examples x num_refs x num_sentences 173 | # intersect = intersect.any(axis=-1) # taus x num_examples x num_refs 174 | # recall = intersect.sum(axis=-1) / oracle_size # taus x num_examples 175 | # 
micro_recall = intersect.sum(axis=-1).sum(axis=-1) / oracle_size.sum(axis=-1) # taus 176 | 177 | # conditioned 178 | C_size_conditioned = (C_size * ok).sum(axis=1) / ok.sum(axis=1) # num_taus 179 | # micro_recall_conditioned = (intersect.sum(axis=-1) * ok).sum(axis=1) / ((oracle_size * ok).sum(axis=-1) + 1e-10) 180 | 181 | 182 | return { 183 | 'L_avg': (1 - ok).mean(axis=-1), 184 | 'C_size_avg': C_size.mean(axis=-1), 185 | 'C_relative_size': C_relative_size.mean(axis=-1), 186 | # 'fake_recall': fake_recall.mean(axis=-1), 187 | # 'recall': recall.mean(axis=-1), 188 | # 'micro_recall': micro_recall, 189 | 'C_size_conditioned': C_size_conditioned, 190 | # 'micro_recall_conditioned': micro_recall_conditioned, 191 | } 192 | 193 | def compute_values_for_C_inner(preds, rouge_with_refs, rouge_threshold=0.7, taus=None, num_quantiles=1000, subset=None): 194 | if taus is None: 195 | quantiles = np.linspace(0, 1, num_quantiles) 196 | taus = np.unique(np.quantile(preds.reshape(-1), quantiles)) 197 | taus = np.concatenate([[-np.inf], taus, [np.inf]]) 198 | taus = np.flip(taus) 199 | 200 | if subset is not None: 201 | preds = preds[subset] 202 | rouge_with_refs = rouge_with_refs[subset] 203 | 204 | oracle_size = (rouge_with_refs > 0).any(axis=-1).sum(axis=-1) # num_examples 205 | has_refs = oracle_size > 0 206 | 207 | # Discard examples without references 208 | preds = preds[has_refs] 209 | rouge_with_refs = rouge_with_refs[has_refs] 210 | 211 | oracle_size = oracle_size[has_refs].reshape(1, -1) # 1, num_examples 212 | 213 | accepted = rouge_with_refs > rouge_threshold # num_examples x num_refs x num_sentences 214 | 215 | any_accepted = accepted.any(axis=1) # num_examples x num_sentences 216 | any_accepted = np.expand_dims(any_accepted, axis=0) # 1 x num_examples x num_sentences 217 | 218 | chosen = (preds > taus.reshape(-1, 1, 1)) # num_taus x num_examples x num_sentences 219 | 220 | # Ok if [chosen implies accepted] 221 | ok = (~chosen | any_accepted).all(axis=-1) # num_taus x num_examples 222 | C_size = chosen.sum(axis=-1) # num_taus x num_examples 223 | 224 | 225 | # Additional metrics 226 | # Fake recall is counting the number of chosen sentences that are accepted and dividing by size of the reference 227 | # Recall is counting the number of reference sentences that have a match and dividing by the size of the reference 228 | 229 | C_relative_size = C_size / oracle_size 230 | # fake_recall = (chosen & any_accepted).sum(axis=-1) / oracle_size 231 | intersect = np.expand_dims(accepted, axis=0) & np.expand_dims(chosen, axis=2) # taus x num_examples x num_refs x num_sentences 232 | intersect = intersect.any(axis=-1) # taus x num_examples x num_refs 233 | recall = intersect.sum(axis=-1) / oracle_size # taus x num_examples 234 | micro_recall = intersect.sum(axis=-1).sum(axis=-1) / oracle_size.sum(axis=-1) # taus 235 | 236 | # conditioned 237 | # C_size_conditioned = (C_size * ok).sum(axis=1) / ok.sum(axis=1) # num_taus 238 | micro_recall_conditioned = (intersect.sum(axis=-1) * ok).sum(axis=1) / ((oracle_size * ok).sum(axis=-1) + 1e-10) 239 | 240 | 241 | return { 242 | 'taus': taus, 243 | 'L_avg': (1 - ok).mean(axis=-1), 244 | 'C_size_avg': C_size.mean(axis=-1), 245 | 'C_relative_size': C_relative_size.mean(axis=-1), 246 | # 'fake_recall': fake_recall.mean(axis=-1), 247 | 'recall': recall.mean(axis=-1), 248 | 'micro_recall': micro_recall, 249 | # 'C_size_conditioned': C_size_conditioned, 250 | 'micro_recall_conditioned': micro_recall_conditioned, 251 | } 252 | 253 | 254 | def 
compute_values_for_C_outer(preds, rouge_with_multi_refs, rouge_threshold=0.7, taus=None, num_quantiles=1000, subset=None): 255 | """ 256 | rouge_with_multi_refs: max_num_refs x num_examples x max_num_sentence_refs x num_sentences 257 | preds: num_examples x num_sentences 258 | """ 259 | 260 | if taus is None: 261 | quantiles = np.linspace(0, 1, num_quantiles) 262 | taus = np.unique(np.quantile(preds.reshape(-1), quantiles)) 263 | taus = np.concatenate([[0], taus, [np.inf]]) 264 | taus = np.flip(taus) 265 | 266 | if subset is not None: 267 | preds = preds[subset] 268 | rouge_with_multi_refs = rouge_with_multi_refs[:, subset] 269 | 270 | # Compute oracle size based on true ref 271 | oracle_size = (rouge_with_multi_refs[0] > 0).any(axis=-1).sum(axis=-1) # num_examples 272 | has_refs = oracle_size > 0 273 | 274 | # Discard examples without references 275 | preds = preds[has_refs] 276 | rouge_with_multi_refs = rouge_with_multi_refs[:, has_refs] 277 | 278 | oracle_size = oracle_size[has_refs].reshape(1, -1) # 1, num_examples 279 | 280 | # For each (example, ref), how many sentences are actually there 281 | ref_sentence_padding = (rouge_with_multi_refs < 0).all(axis=-1) # max_num_refs x num_examples x max_num_sentence_refs 282 | 283 | # For each example, how many refs are actually there 284 | ref_mask = 1 - (rouge_with_multi_refs < 0).all(axis=-1).all(axis=-1) # max_num_refs x num_examples 285 | 286 | 287 | # accept if the rouge score is above the threshold 288 | accepted = rouge_with_multi_refs > rouge_threshold # max_num_refs x num_examples x max_num_sentence_refs x num_sentences 289 | r_accepted = np.expand_dims(accepted, axis=0) # 1 x max_num_refs x num_examples x max_num_sentence_refs x num_sentences 290 | 291 | # choose if preds is above the tau 292 | chosen = (preds > taus.reshape(-1, 1, 1)) # num_taus x num_examples x num_sentences 293 | r_chosen = np.expand_dims(chosen, axis=1) # num_taus x 1 x num_examples x num_sentences 294 | r_chosen = np.expand_dims(r_chosen, axis=3) # num_taus x 1 x num_examples x 1 x num_sentences 295 | 296 | # For each reference_sentence, ok if at least one of the chosen sentences is accepted 297 | accepted_and_chosen = r_accepted & r_chosen # num_taus x max_num_refs x num_examples x max_num_sentence_refs x num_sentences 298 | 299 | ok_ref_sen = (accepted_and_chosen).any(axis=4) # num_taus x max_num_refs x num_examples x max_num_sentence_refs 300 | 301 | # Ok if padding 302 | ok_ref_sen = ok_ref_sen | np.expand_dims(ref_sentence_padding, axis=0) # num_taus x max_num_refs x num_examples x max_num_sentence_refs 303 | 304 | # (tau, example, ref) ok if all reference sentences are ok 305 | ok_ref = ok_ref_sen.all(axis=3) # num_taus x max_num_refs x num_examples 306 | 307 | # must be a real reference 308 | ok_ref = ok_ref & np.expand_dims(ref_mask, axis=0) # num_taus x max_num_refs x num_examples 309 | 310 | # (tau, example) ok if any of the references are ok 311 | ok = ok_ref.any(axis=1) # num_taus x num_examples 312 | 313 | C_size = chosen.sum(axis=-1) # num_taus x num_examples 314 | 315 | # additional metrics 316 | accepted_sen = accepted_and_chosen.any(axis=3).any(axis=1) # num_taus x num_examples x num_sentences 317 | precision = accepted_sen.sum(axis=2) / (chosen.sum(axis=2) + 1e-10) # num_taus x num_examples 318 | C_relative_size = C_size / oracle_size 319 | micro_precision = accepted_sen.sum(axis=2).sum(axis=1) / (C_size.sum(axis=1) + 1e-10) # num_taus 320 | 321 | # conditioned 322 | micro_precision_conditioned = (accepted_sen.sum(axis=2) * ok).sum(axis=1) / 
((C_size * ok).sum(axis=1) + 1e-10) # num_taus 323 | C_size_conditioned = (C_size * ok).sum(axis=1) / ok.sum(axis=1) # num_taus 324 | 325 | return { 326 | 'L_avg': (1 - ok).mean(axis=-1), 327 | 'C_size_avg': C_size.mean(axis=-1), 328 | 'C_relative_size': C_relative_size.mean(axis=-1), 329 | 'precision': precision.mean(axis=-1), 330 | 'micro_precision': micro_precision, 331 | 'micro_precision_conditioned': micro_precision_conditioned, 332 | 'C_size_conditioned': C_size_conditioned, 333 | 'oracle_size': oracle_size.repeat(len(taus), axis=0).mean(axis=-1), 334 | 'taus': taus, 335 | } 336 | 337 | 338 | def run_trials( 339 | train_data: utils.ComponentDataset, 340 | test_data: utils.ComponentDataset, 341 | scale_type="none", 342 | scale_kwargs=None, 343 | filter_for_answerable=False, 344 | rouge_threshold=0.4, 345 | num_quantiles=1000, 346 | epsilons=DEFAULT_EPSILONS, 347 | delta=0.05, 348 | p_cal=0.3, 349 | num_trials=100, 350 | ): 351 | """Run multiple random trials of CRC on Components 352 | 353 | Args: 354 | train_data: utils.ComponentDataset of (scores, rouge_with_refs). 355 | test_data: utils.ComponentDataset of (scores, rouge_with_refs). 356 | alphas: List of target risk levels to calibrate for. 357 | scaling: One of "platt", "bin", "platt_bin", or "none". 358 | methods: List of ((scale_type, scale_kwargs), score_type). 359 | If scale_type != 'none', then train_data is used to learn a scaler. 360 | Then the score_type maps to a function to compute set scores. 361 | p_cal: Percentage of test data to use for calibration in each trial. 362 | num_trials: Number of random trials to do. 363 | 364 | Returns: 365 | methods: List of evaluated methods. 366 | alphas: List of evaluated alphas. 367 | results: List of dict for each method mapping result type to a 368 | [num_trials, num_alphas] array of scores (see compute_values). 369 | """ 370 | # Compute scaled scores. 371 | 372 | scale_kwargs = scale_kwargs or {} 373 | scaler = NAME_TO_SCALER[scale_type](**scale_kwargs) 374 | scaler.fit(train_data.scores, train_data.rouge_with_refs) 375 | item_scores = (scaler.predict(test_data.scores)) 376 | 377 | # Initialize results. 378 | all_results = collections.defaultdict(list) 379 | 380 | # Calibrate and compute results across trials. 381 | N = len(test_data.scores) 382 | N_cal = int(p_cal * N) 383 | N_pcal = N_cal // 2 384 | for _ in tqdm.tqdm(range(num_trials), desc='running trials'): 385 | # Split into calibration and test sets. 
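# A fraction p_cal of the examples forms the calibration pool; only half of it
# (N_pcal) is actually used to select tau, since component calibration searches a
# single threshold and needs no separate Pareto-optimization split (opt_idx is
# dropped just below).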
386 | randperm = np.random.permutation(N) 387 | pareto_idx, test_idx = randperm[:N_cal], randperm[N_cal:] 388 | cal_idx, opt_idx = pareto_idx[:N_pcal], pareto_idx[N_pcal:] 389 | 390 | del opt_idx # We don't use pareto testing for components 391 | 392 | cal_item_scores = item_scores[cal_idx] 393 | cal_item_rouge = test_data.rouge_with_refs[cal_idx] 394 | 395 | test_item_scores = item_scores[test_idx] 396 | test_item_rouge = test_data.rouge_with_refs[test_idx] 397 | 398 | # If filtering for answerable, then only keep examples 399 | # where at least one generation is acceptable 400 | if filter_for_answerable: 401 | cal_labels = test_data.report_labels[cal_idx] 402 | test_labels = test_data.report_labels[test_idx] 403 | 404 | cal_oracle_subset = cal_labels.any(axis=1) 405 | test_oracle_subset = test_labels.any(axis=1) 406 | 407 | cal_item_scores = cal_item_scores[cal_oracle_subset] 408 | cal_item_rouge = cal_item_rouge[cal_oracle_subset] 409 | 410 | test_item_scores = test_item_scores[test_oracle_subset] 411 | test_item_rouge = test_item_rouge[test_oracle_subset] 412 | 413 | 414 | params = dict( 415 | rouge_threshold=rouge_threshold, 416 | num_quantiles=num_quantiles, 417 | ) 418 | 419 | values = compute_values_for_C_inner(cal_item_scores, cal_item_rouge, **params) 420 | 421 | # Choose best valid tau (already sorted in decreasing tau) 422 | best_valid_taus = [np.nan] * len(epsilons) 423 | is_stopped = [False] * len(epsilons) 424 | for (tau, loss) in zip(values['taus'], values['L_avg']): 425 | for j, epsilon in enumerate(epsilons): 426 | n = len(cal_idx) 427 | p_value = binom.cdf(n * loss, n, epsilon) 428 | if p_value <= delta and not is_stopped[j]: 429 | best_valid_taus[j] = tau 430 | else: 431 | is_stopped[j] = True 432 | best_valid_taus = np.array(best_valid_taus) 433 | 434 | # Compute test metrics. 435 | test_values = compute_values_for_C_inner(test_item_scores, test_item_rouge, taus=best_valid_taus, **params) 436 | 437 | # Record trial. 438 | for k, v in test_values.items(): 439 | all_results[k].append(v) 440 | 441 | # Aggregate results. 442 | combined = {} 443 | for k, v in all_results.items(): 444 | combined[k] = np.stack(v, axis=0) 445 | 446 | return epsilons, combined 447 | -------------------------------------------------------------------------------- /clm/scaling.py: -------------------------------------------------------------------------------- 1 | """Utilities for score scaling.""" 2 | 3 | import tqdm 4 | import numpy as np 5 | import torch 6 | import torch.optim as optim 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | 10 | from torch.utils.data import DataLoader, RandomSampler, TensorDataset 11 | from sklearn.linear_model import LogisticRegression 12 | 13 | 14 | class BaseScaler: 15 | """Straight through scaler.""" 16 | 17 | def fit(self, X, y): 18 | pass 19 | 20 | def predict(self, X): 21 | return X 22 | 23 | 24 | class PlattScaler(BaseScaler): 25 | """Parametric calibration using log. 
regression on uncalibrated scores.""" 26 | 27 | def __init__(self, *args, **kwargs): 28 | self.clf = LogisticRegression(*args, **kwargs) 29 | 30 | def fit(self, X, y): 31 | X = X.reshape(-1, 1) 32 | y = y.reshape(-1) 33 | self.clf.fit(X, y) 34 | 35 | def predict(self, X): 36 | scaled = self.clf.predict_proba(X.reshape(-1, 1))[:, 1] 37 | return scaled.reshape(X.shape) 38 | 39 | 40 | class BinningScaler(BaseScaler): 41 | """Non-parametric equal-mass histogram regression on uncalibrated scores.""" 42 | 43 | def __init__(self, n_bins=20): 44 | self.n_bins = n_bins 45 | 46 | def fit(self, X, y): 47 | X = X.reshape(-1) 48 | y = y.reshape(-1) 49 | 50 | # Split scores into equal mass bins. 51 | quantiles = np.linspace(0, 1, self.n_bins + 1) 52 | bins = np.percentile(X, quantiles * 100) 53 | bin_ids = np.searchsorted(bins[1:-1], X) 54 | 55 | # Count empirical accuracy in each bin. 56 | bin_true = np.bincount(bin_ids, weights=y, minlength=len(bins))[:-1] 57 | bin_count = np.bincount(bin_ids, minlength=len(bins))[:-1] 58 | bin_prob = bin_true / bin_count 59 | 60 | # Store bins and bin probs. 61 | self.bins = bins 62 | self.bin_prob = bin_prob 63 | 64 | def predict(self, X): 65 | bin_ids = np.searchsorted(self.bins[1:-1], X.reshape(-1)) 66 | return self.bin_prob[bin_ids].reshape(X.shape) 67 | 68 | 69 | class PlattBinningScaler(BaseScaler): 70 | """Combined parametric + non-parametric calibration (Kumar et. al., 2019)""" 71 | 72 | def __init__(self, *args, n_bins=20, **kwargs): 73 | self.platt = PlattScaler(*args, **kwargs) 74 | self.binning = BinningScaler(n_bins=n_bins) 75 | 76 | def fit(self, X, y): 77 | # Split calibration set in two for scaling and binning. 78 | N = len(X) // 2 79 | X_scale, y_scale = X[:N], y[:N] 80 | X_bin, y_bin = X[N:], y[N:] 81 | 82 | # Fit Platt scaler. 83 | self.platt.fit(X_scale, y_scale) 84 | 85 | # Fit binning scaler on Platt scaled scores. 86 | self.binning.fit(self.platt.predict(X_bin), y_bin) 87 | 88 | def predict(self, X): 89 | return self.binning.predict(self.platt.predict(X)) 90 | 91 | 92 | class RecurrentScaler(BaseScaler, nn.Module): 93 | """RNN calibration of C given sequential scores.""" 94 | 95 | def __init__( 96 | self, 97 | hidden_size=64, 98 | num_layers=2, 99 | num_iters=1000, 100 | batch_size=32, 101 | dropout=0.0, 102 | target='set', 103 | ): 104 | super(RecurrentScaler, self).__init__() 105 | assert(target in ['set', 'item']) 106 | self.num_iters = num_iters 107 | self.batch_size = batch_size 108 | self.target = target 109 | self.rnn = nn.RNN( 110 | input_size=1, 111 | hidden_size=hidden_size, 112 | num_layers=num_layers, 113 | dropout=dropout, 114 | batch_first=True) 115 | self.hidden_to_output = nn.Linear(hidden_size, 1) 116 | 117 | def forward(self, X): 118 | out = self.rnn(X.unsqueeze(-1))[0] 119 | out = self.hidden_to_output(out).squeeze(-1) 120 | return out 121 | 122 | def fit(self, X, y): 123 | # Convert dataset. 124 | X, y = torch.from_numpy(X).float(), torch.from_numpy(y).float() 125 | 126 | # If self.target = 'set', then we compute the set-based labels. 127 | if self.target == 'set': 128 | y = 1 - np.cumprod(1 - y, axis=-1) 129 | 130 | dataset = TensorDataset(X, y) 131 | sampler = RandomSampler( 132 | dataset, num_samples=self.num_iters * self.batch_size) 133 | loader = DataLoader( 134 | dataset, sampler=sampler, batch_size=self.batch_size) 135 | 136 | # Train loop. 
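# RandomSampler is configured to yield num_iters * batch_size examples in total,
# so a single pass over `loader` performs num_iters optimization steps.
# forward() returns raw logits, hence the BCE-with-logits loss below and the
# sigmoid applied later in predict().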
137 | optimizer = optim.Adam(self.parameters(), lr=1e-3) 138 | self.train() 139 | for X_batch, y_batch in tqdm.tqdm(loader, 'fitting rnn'): 140 | out = self.forward(X_batch) 141 | loss = F.binary_cross_entropy_with_logits(out, y_batch) 142 | optimizer.zero_grad() 143 | loss.backward() 144 | optimizer.step() 145 | 146 | def predict(self, X): 147 | X = torch.from_numpy(X).float() 148 | self.eval() 149 | with torch.no_grad(): 150 | out = torch.sigmoid(self.forward(X)) 151 | return out.numpy() 152 | -------------------------------------------------------------------------------- /clm/scoring.py: -------------------------------------------------------------------------------- 1 | """Implementation of different scoring variants.""" 2 | 3 | import numpy as np 4 | 5 | 6 | def geometric(p, mask=None): 7 | """Score of a set is based on a geometric distribution approximation: 8 | 9 | p(\exists y \in C : L(y) = 0) = 1 - \prod 1 - p(A(y_i) = 1) 10 | 11 | This is consistent with -sum \log (1 - p(A(y_i) = 1)). 12 | 13 | Args: 14 | p: Matrix of size [num_examples, max_size]. 15 | Each entry of p approximates p(A(y_ij) = 1). 16 | 17 | Returns: 18 | Log geometric scores. 19 | """ 20 | if mask is not None: 21 | p = p * mask 22 | p = np.maximum(1 - p, 1e-8) 23 | return -np.cumsum(np.log(p), axis=-1) 24 | 25 | 26 | def marginal(p, mask=None): 27 | """Similar to geometric, but with p(y_k is the only y with A(y) = 1).""" 28 | if mask is not None: 29 | p = p * mask 30 | p = np.maximum(1 - p, 1e-8) 31 | shifted = np.pad(p, ((0, 0), (1, 0)), constant_values=1)[:, :-1] 32 | return -np.log(1 - p) - np.cumsum(np.log(shifted), axis=-1) 33 | 34 | 35 | def first_k(X, mask=None): 36 | """Scores are equal to the number of draws.""" 37 | num_examples, max_generations = X.shape 38 | scores = np.ones((num_examples, max_generations)) 39 | if mask is not None: 40 | scores = scores * mask 41 | return np.cumsum(scores, axis=-1) 42 | 43 | def first_k_no_mask(X, mask=None): 44 | """Scores are equal to the number of draws.""" 45 | del mask 46 | num_examples, max_generations = X.shape 47 | scores = np.ones((num_examples, max_generations)) 48 | return np.cumsum(scores, axis=-1) 49 | 50 | 51 | def max(X, mask=None): 52 | if mask is not None: 53 | X = X * mask 54 | return np.maximum.accumulate(X, axis=-1) 55 | 56 | 57 | def sum(X, mask=None): 58 | if mask is not None: 59 | X = X * mask 60 | return np.cumsum(X, axis=-1) 61 | -------------------------------------------------------------------------------- /clm/uncertainty.py: -------------------------------------------------------------------------------- 1 | """Evaluate set uncertainty metrics.""" 2 | 3 | import collections 4 | import itertools 5 | import tqdm 6 | import multiprocessing 7 | import functools 8 | import numpy as np 9 | from scipy.stats import binom 10 | 11 | from clm import scaling 12 | from clm import scoring 13 | from clm import utils 14 | 15 | NAME_TO_SCALER = { 16 | 'platt': scaling.PlattScaler, 17 | 'bin': scaling.BinningScaler, 18 | 'platt_bin': scaling.PlattBinningScaler, 19 | 'rnn': scaling.RecurrentScaler, 20 | 'none': scaling.BaseScaler, 21 | } 22 | 23 | NAME_TO_SCORE = { 24 | 'geo': scoring.geometric, 25 | 'marginal': scoring.marginal, 26 | 'first_k': scoring.first_k, 27 | 'first_k_no_mask': scoring.first_k_no_mask, 28 | 'max': scoring.max, 29 | 'sum': scoring.sum, 30 | 'none': lambda x, m=None: x, 31 | } 32 | 33 | DEFAULT_METHODS = [ 34 | dict(scaling=('none', {}), scoring='first_k', rejection=False), 35 | dict(scaling=('none', {}), scoring='first_k', 
rejection=True), 36 | # dict(scaling=('none', {}), scoring='max', rejection=False), 37 | dict(scaling=('none', {}), scoring='max', rejection=True), 38 | dict(scaling=('none', {}), scoring='sum', rejection=True), 39 | dict(scaling=('none', {}), scoring='first_k_no_mask', rejection=True), 40 | # dict(scaling=('platt', {}), scoring='geo', rejection=False), 41 | # dict(scaling=('platt', {}), scoring='geo', rejection=True), 42 | ] 43 | 44 | # Default risk levels epsilon to calibrate. 45 | DEFAULT_EPSILONS = np.linspace(0, 1, 101) 46 | 47 | 48 | def compute_worst_case(values, losses, num_bins=20): 49 | """Compute worst case conditional loss. 50 | 51 | Values are binned by equal mass, and then we compute the loss over each 52 | bin. We then return the worst loss over all bins. 53 | 54 | Args: 55 | values: [num_examples] 56 | Array of example values (e.g., set size). 57 | losses: [num_examples] 58 | Array of set losses. 59 | 60 | Returns: 61 | [num_thresholds] array of worst-case average losses. 62 | """ 63 | bins = np.quantile(values, np.linspace(0, 1, num_bins)) 64 | binids = np.digitize(values, [0] + np.unique(bins)) 65 | 66 | L_worst_avg = -1 67 | for binid in np.unique(binids): 68 | kept = binids == binid 69 | num_kept_examples = np.maximum(np.sum(kept), 1) 70 | Ls_mask_avg = np.sum(losses * kept) / num_kept_examples 71 | L_worst_avg = max(L_worst_avg, Ls_mask_avg) 72 | 73 | return L_worst_avg 74 | 75 | 76 | def get_C_cutoff(set_scores, set_lambda): 77 | """Compute prediction sets C for given thresholds tau. 78 | 79 | Args: 80 | set_scores: [num_examples, max_generations] 81 | set_scores[i, j] = score of set j for example i. 82 | set_lambda: Threshold to use. 83 | 84 | Returns: 85 | C_indices: [num_examples] 86 | Indices of the selected sets C for each example. 87 | """ 88 | cummax_scores = np.maximum.accumulate(set_scores, axis=-1) 89 | mask = cummax_scores < set_lambda 90 | C_indices = np.sum(mask, axis=-1) 91 | C_indices = C_indices.clip(0, set_scores.shape[1] - 1) 92 | return C_indices 93 | 94 | 95 | def compute_values( 96 | config, 97 | item_scores, 98 | similarity_scores, 99 | item_labels, 100 | set_score_fn, 101 | do_rejection=True, 102 | return_indices=False, 103 | ): 104 | """Compute list of metrics for a given config. 105 | 106 | Args: 107 | set_scores: [num_examples, max_generations] 108 | set_scores[i, j] = score of set after sample j for example i. 109 | set_sizes: [num_examples, max_generations] 110 | set_sizes[i, j] = effective set size after sample j for example i. 111 | set_losses: [num_examples, max_generations] 112 | set_loss[i, j] = loss of set after sample j for example i. 113 | lambdas: [num_thresholds] 114 | Array of thresholds to test. 115 | 116 | Returns: 117 | Dictionary of metrics (per lambda). 118 | """ 119 | if np.any([np.isnan(l) for l in config]): 120 | return dict( 121 | L_avg=np.nan, 122 | L_worst_pred_avg=np.nan, 123 | C_size_avg=np.nan, 124 | C_samples_avg=np.nan, 125 | C_excess_avg=np.nan, 126 | C_relative_excess_avg=np.nan, 127 | C_obj_avg=np.nan, 128 | ) 129 | 130 | lambda_1, lambda_2, lambda_3 = config 131 | 132 | # If doing rejections, remove low quality and low diversity items. 133 | if do_rejection: 134 | # Reject low-quality examples. 135 | quality_mask = (item_scores >= lambda_2) 136 | 137 | # Reject examples that are too similar to previous items. 138 | similarity_mask = np.ones_like(quality_mask) 139 | 140 | # Make similarity score symmetric. 
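# similarity_scores has shape [num_examples, max_generations, max_generations];
# taking the elementwise max with its transpose makes sim(i, j) == sim(j, i)
# before thresholding. The loop below then keeps sample k only if its maximum
# similarity to any previously kept, quality-accepted sample is <= lambda_1.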
141 | similarity_scores = np.maximum(similarity_scores, similarity_scores.transpose((0, 2, 1))) 142 | 143 | # Low quality scores are rejected, so they don't count --- set similarity to 0. 144 | similarity_scores = similarity_scores * np.expand_dims(quality_mask, 1) 145 | 146 | for k in range(1, similarity_scores.shape[1]): 147 | # Anything that has been rejected up until this point also doesn't count. 148 | # Set those scores to 0 so we don't pick them up as well. 149 | similarity_scores = similarity_scores * np.expand_dims(similarity_mask, 1) 150 | 151 | # We only look at scores up until this point. 152 | max_similarity = np.max(similarity_scores[:, k, :k], axis=-1) 153 | similarity_mask[:, k] = max_similarity <= lambda_1 154 | 155 | # Combine rejection rules. 156 | kept_mask = quality_mask * similarity_mask 157 | else: 158 | kept_mask = np.ones_like(item_scores) 159 | 160 | # Get set losses, scores, and sizes. 161 | set_sizes = np.cumsum(kept_mask, axis=-1) 162 | set_losses = utils.set_losses_from_labels(item_labels * kept_mask) 163 | set_scores = set_score_fn(item_scores, kept_mask) 164 | 165 | # Compute set selections for all values of lambda. 166 | C_indices = get_C_cutoff(set_scores, lambda_3) 167 | 168 | # Compute selected set losses. 169 | num_examples = set_losses.shape[0] 170 | Ls = set_losses[np.arange(num_examples), C_indices] 171 | L_avg = np.mean(Ls) 172 | 173 | # Compute effective set sizes. 174 | C_sizes = set_sizes[np.arange(num_examples), C_indices] 175 | C_size_avg = np.mean(C_sizes) 176 | 177 | # Compute the oracle number of samples to draw. 178 | max_generations = set_losses.shape[1] 179 | set_losses_without_rejection = utils.set_losses_from_labels(item_labels) 180 | C_oracle_samples = np.sum(set_losses_without_rejection > 0, axis=-1) + 1 181 | C_oracle_samples = C_oracle_samples.clip(0, max_generations) 182 | 183 | # Compute number of sample metrics. 184 | C_samples = C_indices + 1 185 | C_samples_avg = np.mean(C_samples) 186 | C_excess_avg = np.mean(np.maximum(C_samples - C_oracle_samples, 0)) 187 | C_relative_excess_avg = np.mean( 188 | np.maximum(C_samples - C_oracle_samples, 0) / C_samples) 189 | 190 | # Compute worst-case set losses, conditioned on predicted size. 191 | L_worst_pred_avg = compute_worst_case(C_sizes, Ls) 192 | 193 | output = dict( 194 | # Average loss. 195 | L_avg=L_avg, 196 | # Worst case loss binned by set size. 197 | L_worst_pred_avg=L_worst_pred_avg, 198 | # Average set size. 199 | C_size_avg=C_size_avg / max_generations, 200 | # Average number of samples. 201 | C_samples_avg=C_samples_avg / max_generations, 202 | # Average excess samples. 203 | C_excess_avg=C_excess_avg, 204 | # Relative excess, 205 | C_relative_excess_avg=C_relative_excess_avg, 206 | # Combined metric. 
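# Efficiency objective used as the second cost when building the Pareto
# frontier: the sum of the average number of samples drawn and the average
# effective set size (the raw averages here, not the max_generations-normalized
# values reported in the entries above).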
207 | C_obj_avg=C_samples_avg + C_size_avg, 208 | ) 209 | 210 | if return_indices: 211 | output.update( 212 | dict( 213 | C_indices=C_indices, 214 | kept_mask=kept_mask, 215 | Ls=Ls, 216 | ) 217 | ) 218 | 219 | return output 220 | 221 | 222 | def get_pareto_frontier( 223 | item_scores, 224 | similarity_scores, 225 | item_labels, 226 | set_score_fn, 227 | do_rejection, 228 | ): 229 | """Compute a pareto frontier.""" 230 | lambda_1 = utils.select_lambdas(similarity_scores, max_lambdas=25) 231 | lambda_2 = utils.select_lambdas(item_scores, max_lambdas=25) 232 | lambda_3 = utils.select_lambdas(set_score_fn(item_scores), max_lambdas=25) 233 | 234 | configs = [] 235 | costs = [] 236 | for config in tqdm.tqdm(itertools.product(lambda_1, lambda_2, lambda_3)): 237 | configs.append(config) 238 | values = compute_values( 239 | config=config, 240 | item_scores=item_scores, 241 | similarity_scores=similarity_scores, 242 | item_labels=item_labels, 243 | set_score_fn=set_score_fn, 244 | do_rejection=do_rejection) 245 | costs.append((values['L_avg'], values['C_obj_avg'])) 246 | 247 | configs = np.array(configs) 248 | costs = np.array(costs) 249 | 250 | is_efficient = np.ones(costs.shape[0], dtype=bool) 251 | for i, c in enumerate(costs): 252 | if is_efficient[i]: 253 | is_efficient[is_efficient] = np.any(costs[is_efficient] < c, axis=1) 254 | is_efficient[i] = True 255 | 256 | pareto_configs = configs[is_efficient] 257 | pareto_costs = costs[is_efficient] 258 | sort_idx = np.argsort(pareto_costs[:, 0]) 259 | pareto_costs = pareto_costs[sort_idx] 260 | ordered_configs = pareto_configs[sort_idx] 261 | 262 | return ordered_configs 263 | 264 | 265 | def init(shared_data): 266 | global SHARED_DATA 267 | SHARED_DATA = shared_data 268 | 269 | 270 | def run(args, methods, epsilons, delta): 271 | global SHARED_DATA 272 | all_item_scores, test_data = SHARED_DATA 273 | opt_idx, cal_idx, test_idx = args 274 | 275 | # Compute results for different methods. 276 | all_trial_results = [] 277 | for i, item_scores in enumerate(all_item_scores): 278 | opt_item_scores = item_scores[opt_idx] 279 | opt_similarity_scores = test_data.similarity[opt_idx] 280 | opt_item_labels = test_data.labels[opt_idx] 281 | 282 | cal_item_scores = item_scores[cal_idx] 283 | cal_similarity_scores = test_data.similarity[cal_idx] 284 | cal_item_labels = test_data.labels[cal_idx] 285 | 286 | test_item_scores = item_scores[test_idx] 287 | test_similarity_scores = test_data.similarity[test_idx] 288 | test_item_labels = test_data.labels[test_idx] 289 | 290 | # Get scoring mechanism for this method. 291 | set_score_fn = NAME_TO_SCORE[methods[i].get('scoring', 'none')] 292 | do_rejection = methods[i].get('rejection', False) 293 | 294 | # Get pareto frontier for Pareto Testing. 295 | configs = get_pareto_frontier( 296 | item_scores=opt_item_scores, 297 | similarity_scores=opt_similarity_scores, 298 | item_labels=opt_item_labels, 299 | set_score_fn=set_score_fn, 300 | do_rejection=do_rejection) 301 | 302 | # Choose best valid configs (configs are already ordered). 
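# Fixed-sequence testing over the Pareto-ordered configs: for each epsilon, the
# binomial tail p-value P(Bin(n, epsilon) <= n * L_avg) is computed on the
# calibration split; while p <= delta the current config is recorded as the best
# valid one, and the first failure stops the search for that epsilon.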
303 | best_valid_configs = [[np.nan] * 3] * len(epsilons) 304 | is_stopped = [False] * len(epsilons) 305 | for config in configs: 306 | values = compute_values( 307 | config=config, 308 | item_scores=cal_item_scores, 309 | similarity_scores=cal_similarity_scores, 310 | item_labels=cal_item_labels, 311 | set_score_fn=set_score_fn, 312 | do_rejection=do_rejection) 313 | 314 | for j, epsilon in enumerate(epsilons): 315 | loss = values['L_avg'] 316 | n = len(cal_idx) 317 | p_value = binom.cdf(n * loss, n, epsilon) 318 | if p_value <= delta and not is_stopped[j]: 319 | best_valid_configs[j] = config 320 | else: 321 | is_stopped[j] = True 322 | 323 | # Compute test metrics. 324 | trial_results = collections.defaultdict(list) 325 | trial_results['configs'] = np.array(best_valid_configs) 326 | for j, config in enumerate(best_valid_configs): 327 | values = compute_values( 328 | config=config, 329 | item_scores=test_item_scores, 330 | similarity_scores=test_similarity_scores, 331 | item_labels=test_item_labels, 332 | set_score_fn=set_score_fn, 333 | do_rejection=do_rejection) 334 | for k, v in values.items(): 335 | trial_results[k].append(v) 336 | all_trial_results.append(trial_results) 337 | 338 | return all_trial_results 339 | 340 | 341 | def run_trials( 342 | train_data, 343 | test_data, 344 | epsilons=DEFAULT_EPSILONS, 345 | methods=DEFAULT_METHODS, 346 | delta=0.05, 347 | p_cal=0.3, 348 | num_trials=100, 349 | num_processes=1, 350 | ): 351 | """Run multiple random trials of CRC. 352 | 353 | Args: 354 | train_data: utils.Dataset of (scores, labels). 355 | test_data: utils.Dataset of (scores, labels). 356 | alphas: List of target risk levels to calibrate for. 357 | methods: List of ((scale_type, scale_kwargs), score_type). 358 | If scale_type != 'none', then train_data is used to learn a scaler. 359 | Then the score_type maps to a function to compute set scores. 360 | p_cal: Percentage of test data to use for calibration in each trial. 361 | num_trials: Number of random trials to do. 362 | 363 | Returns: 364 | methods: List of evaluated methods. 365 | alphas: List of evaluated alphas. 366 | results: List of dict for each method mapping result type to a 367 | [num_trials, num_alphas] array of scores (see compute_values). 368 | """ 369 | # Compute scaled scores. 370 | all_item_scores = [] 371 | for method in tqdm.tqdm(methods, desc='scaling'): 372 | scale_type, scale_kwargs = method.get('scaling', ('none', {})) 373 | scaler = NAME_TO_SCALER[scale_type](**scale_kwargs) 374 | scaler.fit(train_data.scores, train_data.labels) 375 | all_item_scores.append(scaler.predict(test_data.scores)) 376 | 377 | # Initialize results. 378 | all_results = [collections.defaultdict(list) for _ in range(len(methods))] 379 | run_fn = functools.partial(run, methods=methods, delta=delta, epsilons=epsilons) 380 | if num_processes > 1: 381 | pool = multiprocessing.Pool( 382 | num_processes, 383 | initializer=init, 384 | initargs=((all_item_scores, test_data),)) 385 | map_fn = pool.imap_unordered 386 | else: 387 | global SHARED_DATA 388 | SHARED_DATA = (all_item_scores, test_data) 389 | map_fn = map 390 | 391 | # Calibrate and compute results across trials. 
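# Each trial re-splits the data: a p_cal fraction forms the calibration pool, of
# which 70% (cal_idx) feeds the binomial tests above and 30% (opt_idx) is used to
# build the Pareto frontier; the remaining examples are held out for test metrics.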
392 | N = len(test_data.scores) 393 | N_cal = int(p_cal * N) 394 | N_pcal = int(0.7 * N_cal) 395 | splits = [] 396 | for _ in range(num_trials): 397 | randperm = np.random.permutation(N) 398 | pareto_idx, test_idx = randperm[:N_cal], randperm[N_cal:] 399 | cal_idx, opt_idx = pareto_idx[:N_pcal], pareto_idx[N_pcal:] 400 | splits.append((opt_idx, cal_idx, test_idx)) 401 | 402 | with tqdm.tqdm(total=num_trials, desc='running trials') as pbar: 403 | for trial_results in map_fn(run_fn, splits): 404 | for i, method_results in enumerate(trial_results): 405 | for k, v in method_results.items(): 406 | all_results[i][k].append(np.array(v)) 407 | pbar.update() 408 | 409 | # Aggregate results. 410 | combined_results = [] 411 | for results in all_results: 412 | combined = {} 413 | for k, v in results.items(): 414 | combined[k] = np.stack(v, axis=0) 415 | combined_results.append(combined) 416 | 417 | return methods, epsilons, combined_results 418 | -------------------------------------------------------------------------------- /clm/utils.py: -------------------------------------------------------------------------------- 1 | """Utilities for experiments.""" 2 | 3 | import collections 4 | import random 5 | import torch 6 | import numpy as np 7 | import prettytable as pt 8 | import matplotlib.pyplot as plt 9 | import seaborn as sns 10 | 11 | sns.set_style('ticks') 12 | sns.set_context('paper', font_scale=1.15, rc={'figure.figsize': (5, 4)}) 13 | 14 | Dataset = collections.namedtuple('Dataset', ['scores', 'similarity', 'labels']) 15 | ComponentDataset = collections.namedtuple('ComponentDataset', ['scores', 'rouge_with_refs', 'report_labels']) 16 | 17 | # Throw out alpha if more than this percent of trials are 18 | # either uncalibratable or trivial (first example satisfies). 19 | MAX_INVIABLE_THRESHOLD = 0.75 20 | 21 | COLORS = [ 22 | u"#1f77b4", 23 | u"#2ca02c", 24 | u"#ff7f0e", 25 | u"#9467bd", 26 | u"#8c564b", 27 | u"#e377c2", 28 | u"#7f7f7f", 29 | u"#bcbd22", 30 | u"#17becf", 31 | ] 32 | 33 | 34 | def set_seed(seed=0): 35 | """Set random seed.""" 36 | np.random.seed(seed) 37 | torch.manual_seed(seed) 38 | random.seed(seed) 39 | 40 | 41 | def assert_monotone(array, axis=-1): 42 | """Check if array is monotone increasing along axis.""" 43 | assert(np.all(np.diff(array, axis=-1) <= 0)) 44 | 45 | 46 | def select_lambdas(values, max_lambdas=1000): 47 | """Select unique quantiles of the empirical distribution.""" 48 | quantiles = np.linspace(0, 1, max_lambdas) 49 | lambdas = np.unique(np.quantile(values, quantiles, method='nearest')) 50 | lambdas = np.concatenate([[-np.inf], lambdas, [np.inf]]) 51 | return lambdas 52 | 53 | 54 | def set_losses_from_labels(set_labels): 55 | """Given individual labels, compute set loss.""" 56 | return np.cumprod(1 - set_labels, axis=-1) 57 | 58 | 59 | def compute_auc(xs, ys): 60 | """Compute area under the curve, restricted to support.""" 61 | area = np.trapz(ys, xs) / (max(xs) - min(xs) + 0.02) 62 | return area 63 | 64 | 65 | def plot_results( 66 | results, 67 | epsilons, 68 | y_axis, 69 | y_axis_name=None, 70 | title=None, 71 | show_calibrated_only=True, 72 | add_diagonal=False, 73 | ylim_is_xlim=False, 74 | colors=None, 75 | ax=None, 76 | **kwargs, 77 | ): 78 | """Plot y_axis vs. 
alphas given a dict of {method: {y_axis: values}}.""" 79 | if ax is None: 80 | _, ax = plt.subplots(1, 1) 81 | 82 | mask = np.zeros_like(epsilons) 83 | for values in results.values(): 84 | mask += np.mean(np.all(np.isnan(values['configs']), -1), 0) > 0 85 | mask += np.maximum.accumulate(np.mean(np.all(np.isinf(values['configs']), -1), 0)) > 0.05 86 | 87 | mask = mask == 0 88 | if show_calibrated_only: 89 | epsilons = epsilons[mask] 90 | 91 | for i, (name, values) in enumerate(results.items()): 92 | values = values[y_axis] 93 | if show_calibrated_only: 94 | values = values[:, mask] 95 | 96 | middle = np.mean(values, axis=0) 97 | std = np.std(values, axis=0) 98 | lower = middle - std 99 | upper = middle + std 100 | 101 | auc = compute_auc(epsilons, middle) 102 | label = f'{name} ({auc:.2f})' 103 | color = colors[name] if colors else COLORS[i] 104 | ax.plot(epsilons, middle, label=label, color=color, **kwargs) 105 | ax.fill_between(epsilons, lower, upper, color=color, alpha=0.2) 106 | 107 | if add_diagonal: 108 | diag_auc = compute_auc(epsilons, epsilons) 109 | ax.plot(epsilons, epsilons, '--', label=f'diagonal ({diag_auc:.2f})', color='k') 110 | 111 | ax.legend() 112 | ax.set_xlim(min(epsilons) - 0.01, max(epsilons) + 0.01) 113 | if ylim_is_xlim: 114 | ax.set_ylim(min(epsilons) - 0.01, max(epsilons) + 0.01) 115 | 116 | ax.set_xlabel(r'Risk Level ($\epsilon$)') 117 | ax.set_ylabel(y_axis_name or y_axis) 118 | if title: 119 | ax.set_title(title) 120 | 121 | return ax 122 | 123 | 124 | def print_methods(methods): 125 | table = pt.PrettyTable(field_names=['id', 'scaling', 'scoring', 'rejection']) 126 | for i, method in enumerate(methods): 127 | table.add_row([i, method['scaling'], method['scoring'], method['rejection']]) 128 | print(table) 129 | -------------------------------------------------------------------------------- /notebooks/C_inner-dm.py: -------------------------------------------------------------------------------- 1 | # %cd .. 
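# Jupytext-style notebook script: loads cnndm validation sentence-level ROUGE
# matrices, generation/reference index maps, and NLI / probability scores, builds
# per-sentence predictions, and plots C_inner loss-vs-size and recall trade-offs.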
2 | # %load_ext autoreload 3 | # %autoreload 2 4 | 5 | import torch 6 | import os 7 | import numpy as np 8 | import json 9 | from sklearn.metrics import roc_auc_score 10 | from tqdm.auto import tqdm 11 | import matplotlib.pyplot as plt 12 | 13 | from clm.components import ( 14 | get_rouge_with_refs, 15 | get_preds, 16 | get_random_preds, 17 | get_first_k_preds, 18 | compute_values_for_C_inner, 19 | ) 20 | 21 | base = "data/cnndm/val" 22 | 23 | with open(os.path.join(base, "row_rouge.jsonl")) as f: 24 | row_rouge_scores = [json.loads(line) for line in tqdm(f)] 25 | with open(os.path.join(base, "row_generation.jsonl")) as f: 26 | row_generation_idx_to_row_idx = [json.loads(line) for line in tqdm(f)] 27 | with open(os.path.join(base, "row_reference.jsonl")) as f: 28 | row_reference_idx_to_row_idx = [json.loads(line) for line in tqdm(f)] 29 | 30 | # + 31 | with open(os.path.join(base, "probs_scores.jsonl")) as f: 32 | prob_scores = [json.loads(line) for line in tqdm(f)] 33 | 34 | with open(os.path.join(base, "nli_scores.jsonl")) as f: 35 | nli_scores = [json.loads(line) for line in tqdm(f)] 36 | 37 | # + 38 | rouge_with_refs = get_rouge_with_refs(row_rouge_scores, row_reference_idx_to_row_idx) 39 | _, _, K = rouge_with_refs.shape 40 | 41 | nli_preds = get_preds(nli_scores, row_generation_idx_to_row_idx, K) 42 | probs_preds = get_preds(prob_scores, row_generation_idx_to_row_idx, K) 43 | random_preds = get_random_preds(row_generation_idx_to_row_idx, K) 44 | firstk_preds = get_first_k_preds(row_generation_idx_to_row_idx, K) 45 | 46 | # + 47 | kwargs = { 48 | "rouge_threshold": 0.2, 49 | "num_quantiles": 1000, 50 | } 51 | 52 | results = { 53 | 'NLI Scores': compute_values_for_C_inner(nli_preds, rouge_with_refs, **kwargs), 54 | 'Prob Scores': compute_values_for_C_inner(probs_preds, rouge_with_refs, **kwargs), 55 | 'Random': compute_values_for_C_inner(random_preds, rouge_with_refs, **kwargs), 56 | 'First-K': compute_values_for_C_inner(firstk_preds, rouge_with_refs, **kwargs), 57 | } 58 | 59 | 60 | # - 61 | 62 | def plot_values(results, x_axis, y_axis, ax=None, **kwargs): 63 | """Plot y_axis vs. 
x_axis given a dict of {method: [(size, loss), ...]}.""" 64 | if ax is None: 65 | _, ax = plt.subplots(1, 1) 66 | 67 | for method, values in results.items(): 68 | x_values = values[x_axis] 69 | y_values = values[y_axis] 70 | ax.plot(x_values, y_values, label=method, **kwargs) 71 | 72 | ax.set_ylabel(y_axis) 73 | ax.set_xlabel(x_axis) 74 | ax.legend() 75 | return ax 76 | 77 | 78 | plt.rcParams['figure.dpi'] = 100 79 | fig, ax = plt.subplots(1, 1) 80 | plot_values(results, 'L_avg', 'C_size_avg', ax=ax, marker=".") 81 | ax.set_xlabel("$1 - \mathbb{P}(C_{inner} \subset C^*)$") 82 | ax.set_ylabel("$\mathbb{E}[|C_{inner}|]$") 83 | ax.set_xlim(0, 0.95) 84 | ax.set_ylim(0, 5) 85 | ax.set_title(f"C_inner, with rougeL_threshold = {kwargs['rouge_threshold']}") 86 | plt.show() 87 | 88 | plt.rcParams['figure.dpi'] = 100 89 | fig, ax = plt.subplots(1, 1) 90 | plot_values(results, 'L_avg', 'C_relative_size', ax=ax, marker=".") 91 | ax.set_xlabel("$1 - \mathbb{P}(C_{inner} \subset C^*)$") 92 | ax.set_ylabel("$\mathbb{E}[|C_{inner}| / |C^*|]$") 93 | ax.set_xlim(0, 0.95) 94 | ax.set_ylim(0, 4) 95 | ax.set_title(f"C_inner (relative size), with rougeL_threshold = {kwargs['rouge_threshold']}") 96 | plt.show() 97 | 98 | plt.rcParams['figure.dpi'] = 100 99 | fig, ax = plt.subplots(1, 1) 100 | plot_values(results, 'L_avg', 'recall', ax=ax, marker=".") 101 | ax.set_xlabel("$1 - \mathbb{P}(C_{inner} \subset C^*)$") 102 | ax.set_ylabel("$\mathbb{E}[|C_{inner}\cap C^*| / |C^*|]$") 103 | ax.set_xlim(0, 0.9) 104 | ax.set_ylim(0, 1) 105 | ax.set_title(f"C_inner (recall), with rougeL_threshold = {kwargs['rouge_threshold']}") 106 | plt.show() 107 | 108 | plt.rcParams['figure.dpi'] = 100 109 | fig, ax = plt.subplots(1, 1) 110 | plot_values(results, 'L_avg', 'micro_recall', ax=ax, marker=".") 111 | ax.set_xlabel("$1 - \mathbb{P}(C_{inner} \subset C^*)$") 112 | ax.set_ylabel("$\mathbb{E}[|C_{inner}\cap C^*|] / \mathbb{E}[|C^*|]$") 113 | ax.set_xlim(0, 0.9) 114 | ax.set_ylim(0, 1) 115 | ax.set_title(f"C_inner (micro recall), with rougeL_threshold = {kwargs['rouge_threshold']}") 116 | plt.show() 117 | 118 | plt.rcParams['figure.dpi'] = 100 119 | fig, ax = plt.subplots(1, 1) 120 | plot_values(results, 'L_avg', 'fake_recall', ax=ax, marker=".") 121 | ax.set_xlabel("$1 - \mathbb{P}(C_{inner} \subset C^*)$") 122 | ax.set_ylabel("$\mathbb{E}[|C_{inner}\cap C^*| / |C^*|]$") 123 | ax.set_xlim(0, 0.9) 124 | ax.set_ylim(0, 2) 125 | ax.set_title(f"C_inner (fake recall), with rougeL_threshold = {kwargs['rouge_threshold']}") 126 | plt.show() 127 | 128 | plt.rcParams['figure.dpi'] = 100 129 | fig, ax = plt.subplots(1, 1) 130 | plot_values(results, 'L_avg', 'C_size_conditioned', ax=ax, marker=".") 131 | ax.set_xlabel("$1 - \mathbb{P}(C_{inner} \subset C^*)$") 132 | ax.set_ylabel("$\mathbb{E}[|C_{inner}| | ok]$") 133 | ax.set_xlim(0, 0.95) 134 | ax.set_ylim(0, 8) 135 | ax.set_title(f"C_inner size conditioned, with rougeL_threshold = {kwargs['rouge_threshold']}") 136 | plt.show() 137 | 138 | plt.rcParams['figure.dpi'] = 100 139 | fig, ax = plt.subplots(1, 1) 140 | plot_values(results, 'L_avg', 'micro_recall_conditioned', ax=ax, marker=".") 141 | ax.set_xlabel("$1 - \mathbb{P}(C_{inner} \subset C^*)$") 142 | ax.set_ylabel("$\mathbb{E}[|C_{inner}\cap C^*| | ok] / \mathbb{E}[|C^*| | ok ]$") 143 | ax.set_xlim(0, 0.95) 144 | ax.set_ylim(0, 1) 145 | ax.set_title(f"C_inner, with rougeL_threshold = {kwargs['rouge_threshold']}") 146 | plt.show() 147 | 148 | 149 | 150 | 151 | 
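# A minimal sketch (not a calibrated procedure): among thresholds whose empirical
# loss stays below an assumed target level, pick the one with the smallest average
# C_inner size. clm.components.run_trials does the rigorous version of this with a
# held-out calibration split and binomial p-values.
target_loss = 0.3  # illustrative target, not a value from the paper
vals = results['NLI Scores']
ok_idx = np.where(vals['L_avg'] <= target_loss)[0]
if len(ok_idx) > 0:
    best = ok_idx[np.argmin(vals['C_size_avg'][ok_idx])]
    print(f"tau={vals['taus'][best]:.3f} "
          f"loss={vals['L_avg'][best]:.3f} "
          f"size={vals['C_size_avg'][best]:.2f}")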
-------------------------------------------------------------------------------- /notebooks/C_inner.py: -------------------------------------------------------------------------------- 1 | # %cd .. 2 | # %load_ext autoreload 3 | # %autoreload 2 4 | 5 | import os 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | 9 | from clm.components import ( 10 | compute_values_for_C_inner, 11 | ) 12 | 13 | 14 | base = "data/cxr/val" 15 | 16 | # + 17 | rouge_label = np.load(os.path.join(base, "components", "rouge_label.npy")) 18 | rouge_with_refs = np.load(os.path.join(base, "components", "rouge_with_refs.npy")) 19 | oracle_size = np.load(os.path.join(base, "components", "oracle_size.npy")) 20 | 21 | image_preds = np.load(os.path.join(base, "components", "image_preds.npy")) 22 | probs_preds = np.load(os.path.join(base, "components", "probs_preds.npy")) 23 | random_preds = np.load(os.path.join(base, "components", "random_preds.npy")) 24 | firstk_preds = np.load(os.path.join(base, "components", "firstk_preds.npy")) 25 | 26 | mask = np.load(os.path.join(base, "components", "mask.npy")) 27 | oracle_size = np.load(os.path.join(base, "components", "oracle_size.npy")) 28 | 29 | labels = np.load(os.path.join(base, "labels.npy")) 30 | oracle_subset = labels.any(axis=1) 31 | 32 | # + 33 | kwargs = { 34 | "rouge_threshold": 0.4, 35 | "num_quantiles": 1000, 36 | "subset": oracle_subset, 37 | } 38 | 39 | results = { 40 | 'Image-sentence Scores': compute_values_for_C_inner(image_preds, rouge_with_refs, **kwargs), 41 | 'Prob Scores': compute_values_for_C_inner(probs_preds, rouge_with_refs, **kwargs), 42 | 'Random': compute_values_for_C_inner(random_preds, rouge_with_refs, **kwargs), 43 | 'First-K': compute_values_for_C_inner(firstk_preds, rouge_with_refs, **kwargs), 44 | } 45 | 46 | 47 | # - 48 | 49 | def plot_values(results, x_axis, y_axis, ax=None, **kwargs): 50 | """Plot y_axis vs. 
x_axis given a dict of {method: [(size, loss), ...]}.""" 51 | if ax is None: 52 | _, ax = plt.subplots(1, 1) 53 | 54 | for method, values in results.items(): 55 | x_values = values[x_axis] 56 | y_values = values[y_axis] 57 | ax.plot(x_values, y_values, label=method, **kwargs) 58 | 59 | ax.set_ylabel(y_axis) 60 | ax.set_xlabel(x_axis) 61 | ax.legend() 62 | return ax 63 | 64 | 65 | plt.rcParams['figure.dpi'] = 100 66 | fig, ax = plt.subplots(1, 1) 67 | plot_values(results, 'L_avg', 'C_size_avg', ax=ax, marker=".") 68 | ax.set_xlabel("$1 - \mathbb{P}(C_{inner} \subset C^*)$") 69 | ax.set_ylabel("$\mathbb{E}[|C_{inner}|]$") 70 | ax.set_xlim(0, 0.95) 71 | ax.set_ylim(0, 20) 72 | ax.set_title(f"C_inner, with rougeL_threshold = {kwargs['rouge_threshold']}") 73 | plt.show() 74 | 75 | plt.rcParams['figure.dpi'] = 100 76 | fig, ax = plt.subplots(1, 1) 77 | plot_values(results, 'L_avg', 'C_relative_size', ax=ax, marker=".") 78 | ax.set_xlabel("$1 - \mathbb{P}(C_{inner} \subset C^*)$") 79 | ax.set_ylabel("$\mathbb{E}[|C_{inner}| / |C^*|]$") 80 | ax.set_xlim(0, 0.95) 81 | ax.set_ylim(0, 4) 82 | ax.set_title(f"C_inner (relative size), with rougeL_threshold = {kwargs['rouge_threshold']}") 83 | plt.show() 84 | 85 | plt.rcParams['figure.dpi'] = 100 86 | fig, ax = plt.subplots(1, 1) 87 | plot_values(results, 'L_avg', 'recall', ax=ax, marker=".") 88 | ax.set_xlabel("$1 - \mathbb{P}(C_{inner} \subset C^*)$") 89 | ax.set_ylabel("$\mathbb{E}[|C_{inner}\cap C^*| / |C^*|]$") 90 | ax.set_xlim(0, 0.9) 91 | ax.set_ylim(0, 1) 92 | ax.set_title(f"C_inner (recall), with rougeL_threshold = {kwargs['rouge_threshold']}") 93 | plt.show() 94 | 95 | plt.rcParams['figure.dpi'] = 100 96 | fig, ax = plt.subplots(1, 1) 97 | plot_values(results, 'L_avg', 'micro_recall', ax=ax, marker=".") 98 | ax.set_xlabel("$1 - \mathbb{P}(C_{inner} \subset C^*)$") 99 | ax.set_ylabel("$\mathbb{E}[|C_{inner}\cap C^*|] / \mathbb{E}[|C^*|]$") 100 | ax.set_xlim(0, 0.9) 101 | ax.set_ylim(0, 1) 102 | ax.set_title(f"C_inner (micro recall), with rougeL_threshold = {kwargs['rouge_threshold']}") 103 | plt.show() 104 | 105 | plt.rcParams['figure.dpi'] = 100 106 | fig, ax = plt.subplots(1, 1) 107 | plot_values(results, 'L_avg', 'fake_recall', ax=ax, marker=".") 108 | ax.set_xlabel("$1 - \mathbb{P}(C_{inner} \subset C^*)$") 109 | ax.set_ylabel("$\mathbb{E}[|C_{inner}\cap C^*| / |C^*|]$") 110 | ax.set_xlim(0, 0.9) 111 | ax.set_ylim(0, 2) 112 | ax.set_title(f"C_inner (fake recall), with rougeL_threshold = {kwargs['rouge_threshold']}") 113 | plt.show() 114 | 115 | plt.rcParams['figure.dpi'] = 100 116 | fig, ax = plt.subplots(1, 1) 117 | plot_values(results, 'L_avg', 'micro_recall_conditioned', ax=ax, marker=".") 118 | ax.set_xlabel("$1 - \mathbb{P}(C_{inner} \subset C^*)$") 119 | ax.set_ylabel("$\mathbb{E}[|C_{inner}\cap C^*| | ok] / \mathbb{E}[|C^*| | ok ]$") 120 | ax.set_xlim(0, 0.95) 121 | ax.set_ylim(0, 1) 122 | ax.set_title(f"C_inner, with rougeL_threshold = {kwargs['rouge_threshold']}") 123 | plt.show() 124 | 125 | 126 | 127 | -------------------------------------------------------------------------------- /notebooks/C_outer-dm.py: -------------------------------------------------------------------------------- 1 | # %cd .. 
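# Jupytext-style notebook script: same cnndm validation inputs as C_inner-dm.py,
# but evaluates C_outer (sets intended to cover every reference sentence),
# restricted to the answerable subset and using single- or multi-reference ROUGE.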
2 | # %load_ext autoreload 3 | # %autoreload 2 4 | 5 | import torch 6 | import os 7 | import numpy as np 8 | import json 9 | from sklearn.metrics import roc_auc_score 10 | from tqdm.auto import tqdm 11 | import matplotlib.pyplot as plt 12 | 13 | from clm.components import ( 14 | get_rouge_with_multi_refs, 15 | get_preds, 16 | get_random_preds, 17 | get_first_k_preds, 18 | compute_values_for_C_outer, 19 | ) 20 | 21 | base = "data/cnndm/val" 22 | 23 | with open(os.path.join(base, "row_rouge.jsonl")) as f: 24 | row_rouge_scores = [json.loads(line) for line in tqdm(f)] 25 | with open(os.path.join(base, "row_generation.jsonl")) as f: 26 | row_generation_idx_to_row_idx = [json.loads(line) for line in tqdm(f)] 27 | with open(os.path.join(base, "row_reference.jsonl")) as f: 28 | row_reference_idx_to_row_idx = [json.loads(line) for line in tqdm(f)] 29 | 30 | # + 31 | with open(os.path.join(base, "probs_scores.jsonl")) as f: 32 | prob_scores = [json.loads(line) for line in tqdm(f)] 33 | 34 | with open(os.path.join(base, "nli_scores.jsonl")) as f: 35 | nli_scores = [json.loads(line) for line in tqdm(f)] 36 | # - 37 | 38 | labels = np.load(os.path.join(base, "labels.npy")) 39 | oracle_subset = labels.any(axis=1) 40 | 41 | # + 42 | rouge_with_multi_refs = get_rouge_with_multi_refs(row_rouge_scores, row_reference_idx_to_row_idx, row_generation_idx_to_row_idx, labels) 43 | _, _, _, K = rouge_with_multi_refs.shape 44 | 45 | nli_preds = get_preds(nli_scores, row_generation_idx_to_row_idx, K) 46 | probs_preds = get_preds(prob_scores, row_generation_idx_to_row_idx, K) 47 | random_preds = get_random_preds(row_generation_idx_to_row_idx, K) 48 | firstk_preds = get_first_k_preds(row_generation_idx_to_row_idx, K) 49 | 50 | 51 | # + 52 | kwargs = { 53 | "rouge_threshold": 0.2, 54 | "num_quantiles": 100, 55 | "subset": oracle_subset, 56 | } 57 | 58 | # Multi-ref 59 | # results = { 60 | # 'NLI Scores': compute_values_for_C_outer(nli_preds, rouge_with_multi_refs, **kwargs), 61 | # 'Prob Scores': compute_values_for_C_outer(probs_preds, rouge_with_multi_refs, **kwargs), 62 | # 'Random': compute_values_for_C_outer(random_preds, rouge_with_multi_refs, **kwargs), 63 | # 'First-K': compute_values_for_C_outer(firstk_preds, rouge_with_multi_refs, **kwargs), 64 | # } 65 | # Single-ref 66 | results = { 67 | 'NLI Scores': compute_values_for_C_outer(nli_preds, rouge_with_multi_refs[:1], **kwargs), 68 | 'Prob Scores': compute_values_for_C_outer(probs_preds, rouge_with_multi_refs[:1], **kwargs), 69 | 'Random': compute_values_for_C_outer(random_preds, rouge_with_multi_refs[:1], **kwargs), 70 | 'First-K': compute_values_for_C_outer(firstk_preds, rouge_with_multi_refs[:1], **kwargs), 71 | } 72 | 73 | 74 | 75 | # - 76 | 77 | def plot_values(results, x_axis, y_axis, ax=None, **kwargs): 78 | """Plot y_axis vs. 
x_axis given a dict of {method: [(size, loss), ...]}.""" 79 | if ax is None: 80 | _, ax = plt.subplots(1, 1) 81 | 82 | for method, values in results.items(): 83 | empty = values["C_size_avg"] == 0 84 | x_values = values[x_axis][~empty] 85 | y_values = values[y_axis][~empty] 86 | ax.plot(x_values, y_values, label=method, **kwargs) 87 | 88 | ax.set_ylabel(y_axis) 89 | ax.set_xlabel(x_axis) 90 | ax.legend() 91 | return ax 92 | 93 | 94 | plt.rcParams['figure.dpi'] = 100 95 | fig, ax = plt.subplots(1, 1) 96 | plot_values(results, 'L_avg', 'C_size_avg', ax=ax, marker=".") 97 | ax.set_xlabel("$1 - \mathbb{P}(C^* \subset C_{outer})$") 98 | ax.set_ylabel("$\mathbb{E}[|C_{outer}|]$") 99 | # ax.set_xlim(0, 0.9) 100 | ax.set_ylim(0, 100) 101 | ax.set_title(f"C_outer, with rougeL_threshold = {kwargs['rouge_threshold']}") 102 | plt.show() 103 | 104 | plt.rcParams['figure.dpi'] = 100 105 | fig, ax = plt.subplots(1, 1) 106 | plot_values(results, 'L_avg', 'C_relative_size', ax=ax, marker=".") 107 | ax.set_xlabel("$1 - \mathbb{P}(C^* \subset C_{outer})$") 108 | ax.set_ylabel("$\mathbb{E}[|C_{outer}|]$") 109 | # ax.set_xlim(0, 0.9) 110 | # ax.set_ylim(0, 100) 111 | ax.set_title(f"C_outer (relative size), with rougeL_threshold = {kwargs['rouge_threshold']}") 112 | plt.show() 113 | 114 | plt.rcParams['figure.dpi'] = 100 115 | fig, ax = plt.subplots(1, 1) 116 | plot_values(results, 'L_avg', 'precision', ax=ax, marker=".") 117 | ax.set_xlabel("$1 - \mathbb{P}(C^* \subset C_{outer})$") 118 | ax.set_ylabel("$\mathbb{E}[ |C_{outer}\cap C^*|/ |C_{outer}|]$") 119 | # ax.set_xlim(0.25, 1) 120 | # ax.set_ylim(0, 1) 121 | ax.set_title(f"C_outer (precision), with rougeL_threshold = {kwargs['rouge_threshold']}") 122 | plt.show() 123 | 124 | plt.rcParams['figure.dpi'] = 100 125 | fig, ax = plt.subplots(1, 1) 126 | plot_values(results, 'L_avg', 'micro_precision', ax=ax, marker=".") 127 | ax.set_xlabel("$1 - \mathbb{P}(C^* \subset C_{outer})$") 128 | ax.set_ylabel("$\mathbb{E}[ |C_{outer}\cap C^*|]/ \mathbb{E}[|C_{outer}|]$") 129 | # ax.set_xlim(0.25, 1) 130 | # ax.set_ylim(0, 1) 131 | ax.set_title(f"C_outer (micro_precision), with rougeL_threshold = {kwargs['rouge_threshold']}") 132 | plt.show() 133 | 134 | plt.rcParams['figure.dpi'] = 100 135 | fig, ax = plt.subplots(1, 1) 136 | plot_values(results, 'L_avg', 'micro_precision_conditioned', ax=ax, marker=".") 137 | ax.set_xlabel("$1 - \mathbb{P}(C^* \subset C_{outer})$") 138 | ax.set_ylabel("$\mathbb{E}[ |C_{outer}\cap C^*| | ok ]/ \mathbb{E}[|C_{outer}| | ok ]$") 139 | # ax.set_xlim(0.25, 1) 140 | # ax.set_ylim(0, 1) 141 | ax.set_title(f"C_outer (micro_precision conditioned), with rougeL_threshold = {kwargs['rouge_threshold']}") 142 | plt.show() 143 | 144 | 145 | 146 | 147 | 148 | 149 | -------------------------------------------------------------------------------- /notebooks/C_outer.py: -------------------------------------------------------------------------------- 1 | # %cd .. 
2 | # %load_ext autoreload 3 | # %autoreload 2 4 | 5 | import torch 6 | import os 7 | import numpy as np 8 | import json 9 | from sklearn.metrics import roc_auc_score 10 | from tqdm.auto import tqdm 11 | import matplotlib.pyplot as plt 12 | 13 | from clm.components import ( 14 | get_rouge_with_multi_refs, 15 | get_preds, 16 | get_random_preds, 17 | get_first_k_preds, 18 | compute_values_for_C_outer, 19 | ) 20 | 21 | base = "data/cxr/val" 22 | 23 | with open(os.path.join(base, "row_rouge.jsonl")) as f: 24 | row_rouge_scores = [json.loads(line) for line in tqdm(f)] 25 | with open(os.path.join(base, "row_generation.jsonl")) as f: 26 | row_generation_idx_to_row_idx = [json.loads(line) for line in tqdm(f)] 27 | with open(os.path.join(base, "row_reference.jsonl")) as f: 28 | row_reference_idx_to_row_idx = [json.loads(line) for line in tqdm(f)] 29 | 30 | # + 31 | with open(os.path.join(base, "image_component_scores.jsonl")) as f: 32 | image_scores = [json.loads(line) for line in tqdm(f)] 33 | 34 | with open(os.path.join(base, "normprob_component_scores.jsonl")) as f: 35 | prob_scores = [json.loads(line) for line in tqdm(f)] 36 | # - 37 | 38 | labels = np.load(os.path.join(base, "labels.npy")) 39 | oracle_subset = labels.any(axis=1) 40 | 41 | # + 42 | rouge_with_multi_refs = get_rouge_with_multi_refs(row_rouge_scores, row_reference_idx_to_row_idx, row_generation_idx_to_row_idx, labels) 43 | _, _, _, K = rouge_with_multi_refs.shape 44 | 45 | image_preds = get_preds(image_scores, row_generation_idx_to_row_idx, K) 46 | probs_preds = get_preds(prob_scores, row_generation_idx_to_row_idx, K) 47 | random_preds = get_random_preds(row_generation_idx_to_row_idx, K) 48 | firstk_preds = get_first_k_preds(row_generation_idx_to_row_idx, K) 49 | 50 | # + 51 | kwargs = { 52 | "rouge_threshold": 0.4, 53 | "num_quantiles": 100, 54 | "subset": oracle_subset, 55 | } 56 | 57 | # Multi-ref 58 | # results = { 59 | # 'Image-sentence Scores': compute_values_for_C_outer(image_preds, rouge_with_multi_refs, **kwargs), 60 | # 'Prob Scores': compute_values_for_C_outer(probs_preds, rouge_with_multi_refs, **kwargs), 61 | # 'Random': compute_values_for_C_outer(random_preds, rouge_with_multi_refs, **kwargs), 62 | # 'First-K': compute_values_for_C_outer(firstk_preds, rouge_with_multi_refs, **kwargs), 63 | # } 64 | 65 | # Single-ref 66 | results = { 67 | 'Image-sentence Scores': compute_values_for_C_outer(image_preds, rouge_with_multi_refs[:1], **kwargs), 68 | 'Prob Scores': compute_values_for_C_outer(probs_preds, rouge_with_multi_refs[:1], **kwargs), 69 | 'Random': compute_values_for_C_outer(random_preds, rouge_with_multi_refs[:1], **kwargs), 70 | 'First-K': compute_values_for_C_outer(firstk_preds, rouge_with_multi_refs[:1], **kwargs), 71 | } 72 | 73 | 74 | # - 75 | 76 | def plot_values(results, x_axis, y_axis, ax=None, **kwargs): 77 | """Plot y_axis vs. 
x_axis given a dict of {method: [(size, loss), ...]}.""" 78 | if ax is None: 79 | _, ax = plt.subplots(1, 1) 80 | 81 | for method, values in results.items(): 82 | empty = values["C_size_avg"] == 0 83 | x_values = values[x_axis][~empty] 84 | y_values = values[y_axis][~empty] 85 | ax.plot(x_values, y_values, label=method, **kwargs) 86 | 87 | ax.set_ylabel(y_axis) 88 | ax.set_xlabel(x_axis) 89 | ax.legend() 90 | return ax 91 | 92 | 93 | plt.rcParams['figure.dpi'] = 100 94 | fig, ax = plt.subplots(1, 1) 95 | plot_values(results, 'L_avg', 'C_size_avg', ax=ax, marker=".") 96 | ax.set_xlabel("$1 - \mathbb{P}(C^* \subset C_{outer})$") 97 | ax.set_ylabel("$\mathbb{E}[|C_{outer}|]$") 98 | # ax.set_xlim(0, 0.9) 99 | ax.set_ylim(0, 100) 100 | ax.set_title(f"C_outer, with rougeL_threshold = {kwargs['rouge_threshold']}") 101 | plt.show() 102 | 103 | plt.rcParams['figure.dpi'] = 100 104 | fig, ax = plt.subplots(1, 1) 105 | plot_values(results, 'L_avg', 'C_relative_size', ax=ax, marker=".") 106 | ax.set_xlabel("$1 - \mathbb{P}(C^* \subset C_{outer})$") 107 | ax.set_ylabel("$\mathbb{E}[|C_{outer}|]$") 108 | # ax.set_xlim(0, 0.9) 109 | # ax.set_ylim(0, 100) 110 | ax.set_title(f"C_outer (relative size), with rougeL_threshold = {kwargs['rouge_threshold']}") 111 | plt.show() 112 | 113 | plt.rcParams['figure.dpi'] = 100 114 | fig, ax = plt.subplots(1, 1) 115 | plot_values(results, 'L_avg', 'precision', ax=ax, marker=".") 116 | ax.set_xlabel("$1 - \mathbb{P}(C^* \subset C_{outer})$") 117 | ax.set_ylabel("$\mathbb{E}[ |C_{outer}\cap C^*|/ |C_{outer}|]$") 118 | # ax.set_xlim(0.25, 1) 119 | # ax.set_ylim(0, 1) 120 | ax.set_title(f"C_outer (precision), with rougeL_threshold = {kwargs['rouge_threshold']}") 121 | plt.show() 122 | 123 | plt.rcParams['figure.dpi'] = 100 124 | fig, ax = plt.subplots(1, 1) 125 | plot_values(results, 'L_avg', 'micro_precision', ax=ax, marker=".") 126 | ax.set_xlabel("$1 - \mathbb{P}(C^* \subset C_{outer})$") 127 | ax.set_ylabel("$\mathbb{E}[ |C_{outer}\cap C^*|]/ \mathbb{E}[|C_{outer}|]$") 128 | # ax.set_xlim(0.25, 1) 129 | # ax.set_ylim(0, 1) 130 | ax.set_title(f"C_outer (micro_precision), with rougeL_threshold = {kwargs['rouge_threshold']}") 131 | plt.show() 132 | 133 | plt.rcParams['figure.dpi'] = 100 134 | fig, ax = plt.subplots(1, 1) 135 | plot_values(results, 'L_avg', 'micro_precision_conditioned', ax=ax, marker=".") 136 | ax.set_xlabel("$1 - \mathbb{P}(C^* \subset C_{outer})$") 137 | ax.set_ylabel("$\mathbb{E}[ |C_{outer}\cap C^*| | ok ]/ \mathbb{E}[|C_{outer}| | ok ]$") 138 | # ax.set_xlim(0.25, 1) 139 | # ax.set_ylim(0, 1) 140 | ax.set_title(f"C_outer (micro_precision conditioned), with rougeL_threshold = {kwargs['rouge_threshold']}") 141 | plt.show() 142 | 143 | 144 | -------------------------------------------------------------------------------- /notebooks/qualitative.py: -------------------------------------------------------------------------------- 1 | # %cd .. 
2 | # %load_ext autoreload 3 | # %autoreload 2 4 | 5 | import functools 6 | import os 7 | import numpy as np 8 | from clm import utils, uncertainty 9 | import tqdm 10 | from scipy.stats import binom 11 | import collections 12 | 13 | 14 | methods = [ 15 | dict(scaling=('none', {}), scoring='first_k', rejection=False), 16 | dict(scaling=('none', {}), scoring='max', rejection=False), 17 | dict(scaling=('none', {}), scoring='max', rejection=True), 18 | dict(scaling=('none', {}), scoring='sum', rejection=True), 19 | dict(scaling=('platt', {}), scoring='geo', rejection=False), 20 | dict(scaling=('platt', {}), scoring='geo', rejection=True), 21 | ] 22 | 23 | 24 | risk = 0.7 25 | 26 | base = "data/cxr" 27 | splits = np.load(os.path.join(base, "splits.npz")) 28 | 29 | train_scores = np.load(os.path.join(base, "train", "probs.npy")) 30 | train_labels = np.load(os.path.join(base, "train", "labels.npy")) 31 | train_similarity = np.load(os.path.join(base, "train", "diversity_rouge.npy")) 32 | train_data = utils.Dataset(train_scores, train_similarity, train_labels) 33 | 34 | test_scores = np.load(os.path.join(base, "test", "probs.npy")) 35 | test_labels = np.load(os.path.join(base, "test", "labels.npy")) 36 | test_similarity = np.load(os.path.join(base, "test", "diversity_rouge.npy")) 37 | test_data = utils.Dataset(test_scores, test_similarity, test_labels) 38 | 39 | all_item_scores = [] 40 | for method in tqdm.tqdm(methods, desc='scaling'): 41 | scale_type, scale_kwargs = method.get('scaling', ('none', {})) 42 | scaler = uncertainty.NAME_TO_SCALER[scale_type](**scale_kwargs) 43 | scaler.fit(train_data.scores, train_data.labels) 44 | all_item_scores.append(scaler.predict(test_data.scores)) 45 | 46 | delta=0.05, 47 | epsilons = uncertainty.DEFAULT_EPSILONS 48 | p_cal = 0.3 49 | 50 | N = len(test_data.scores) 51 | N_cal = int(p_cal * N) 52 | N_pcal = int(0.7 * N_cal) 53 | splits = [] 54 | randperm = np.random.permutation(N) 55 | pareto_idx, test_idx = randperm[:N_cal], randperm[N_cal:] 56 | cal_idx, opt_idx = pareto_idx[:N_pcal], pareto_idx[N_pcal:] 57 | 58 | 59 | # Compute results for different methods. 60 | all_trial_results = [] 61 | for i, item_scores in enumerate(all_item_scores): 62 | opt_item_scores = item_scores[opt_idx] 63 | opt_similarity_scores = test_data.similarity[opt_idx] 64 | opt_item_labels = test_data.labels[opt_idx] 65 | 66 | cal_item_scores = item_scores[cal_idx] 67 | cal_similarity_scores = test_data.similarity[cal_idx] 68 | cal_item_labels = test_data.labels[cal_idx] 69 | 70 | test_item_scores = item_scores[test_idx] 71 | test_similarity_scores = test_data.similarity[test_idx] 72 | test_item_labels = test_data.labels[test_idx] 73 | 74 | # Get scoring mechanism for this method. 75 | set_score_fn = uncertainty.NAME_TO_SCORE[methods[i].get('scoring', 'none')] 76 | do_rejection = methods[i].get('rejection', False) 77 | 78 | # Get pareto frontier for Pareto Testing. 79 | configs = uncertainty.get_pareto_frontier( 80 | item_scores=opt_item_scores, 81 | similarity_scores=opt_similarity_scores, 82 | item_labels=opt_item_labels, 83 | set_score_fn=set_score_fn, 84 | do_rejection=do_rejection) 85 | 86 | # Choose best valid configs (configs are already ordered). 
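# The loop below is the calibration step of Pareto Testing: configurations are
# visited in the pre-sorted Pareto order, and for every target risk level
# epsilon the empirical calibration loss L_avg is tested with a binomial tail
# p-value, p = P[Binomial(n, epsilon) <= n * L_avg], with n calibration points.
# While p <= delta, the current config keeps overwriting best_valid_configs[j];
# the first time the test fails for a given epsilon, that epsilon is marked as
# stopped and no later configuration is accepted for it (fixed-sequence testing).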
87 | best_valid_configs = [[np.nan] * 3] * len(epsilons) 88 | is_stopped = [False] * len(epsilons) 89 | for config in configs: 90 | values = uncertainty.compute_values( 91 | config=config, 92 | item_scores=cal_item_scores, 93 | similarity_scores=cal_similarity_scores, 94 | item_labels=cal_item_labels, 95 | set_score_fn=set_score_fn, 96 | do_rejection=do_rejection) 97 | 98 | for j, epsilon in enumerate(epsilons): 99 | loss = values['L_avg'] 100 | n = len(cal_idx) 101 | p_value = binom.cdf(n * loss, n, epsilon) 102 | if p_value <= delta and not is_stopped[j]: 103 | best_valid_configs[j] = config 104 | else: 105 | is_stopped[j] = True 106 | 107 | # Compute test metrics. 108 | trial_results = collections.defaultdict(list) 109 | trial_results['configs'] = np.array(best_valid_configs) 110 | for j, config in enumerate(best_valid_configs): 111 | values = uncertainty.compute_values( 112 | config=config, 113 | item_scores=test_item_scores, 114 | similarity_scores=test_similarity_scores, 115 | item_labels=test_item_labels, 116 | set_score_fn=set_score_fn, 117 | do_rejection=do_rejection) 118 | for k, v in values.items(): 119 | trial_results[k].append(v) 120 | all_trial_results.append(trial_results) 121 | 122 | 123 | len(all_trial_results) 124 | 125 | 126 | -------------------------------------------------------------------------------- /notebooks/run_one_trial.py: -------------------------------------------------------------------------------- 1 | # %cd .. 2 | # %load_ext autoreload 3 | # %autoreload 2 4 | 5 | import itertools 6 | import functools 7 | import os 8 | import numpy as np 9 | from clm import utils, uncertainty 10 | import tqdm 11 | from scipy.stats import binom 12 | import collections 13 | import p_tqdm 14 | 15 | method = dict(scaling=('none', {}), scoring='sum', rejection=True) 16 | 17 | risk = 0.7 18 | 19 | base = "data/cxr" 20 | splits = np.load(os.path.join(base, "splits.npz")) 21 | 22 | train_scores = np.load(os.path.join(base, "train", "probs.npy")) 23 | train_labels = np.load(os.path.join(base, "train", "labels.npy")) 24 | train_similarity = np.load(os.path.join(base, "train", "diversity_rouge.npy")) 25 | train_data = utils.Dataset(train_scores, train_similarity, train_labels) 26 | 27 | test_scores = np.load(os.path.join(base, "test", "probs.npy")) 28 | test_labels = np.load(os.path.join(base, "test", "labels.npy")) 29 | test_similarity = np.load(os.path.join(base, "test", "diversity_rouge.npy")) 30 | test_data = utils.Dataset(test_scores, test_similarity, test_labels) 31 | 32 | scale_type, scale_kwargs = method.get('scaling', ('none', {})) 33 | scaler = uncertainty.NAME_TO_SCALER[scale_type](**scale_kwargs) 34 | scaler.fit(train_data.scores, train_data.labels) 35 | item_scores = (scaler.predict(test_data.scores)) 36 | 37 | delta=0.05, 38 | epsilons = uncertainty.DEFAULT_EPSILONS 39 | p_cal = 0.3 40 | 41 | N = len(test_data.scores) 42 | N_cal = int(p_cal * N) 43 | N_pcal = int(0.7 * N_cal) 44 | splits = [] 45 | randperm = np.random.permutation(N) 46 | pareto_idx, test_idx = randperm[:N_cal], randperm[N_cal:] 47 | cal_idx, opt_idx = pareto_idx[:N_pcal], pareto_idx[N_pcal:] 48 | 49 | opt_item_scores = item_scores[opt_idx] 50 | opt_similarity_scores = test_data.similarity[opt_idx] 51 | opt_item_labels = test_data.labels[opt_idx] 52 | 53 | cal_item_scores = item_scores[cal_idx] 54 | cal_similarity_scores = test_data.similarity[cal_idx] 55 | cal_item_labels = test_data.labels[cal_idx] 56 | 57 | test_item_scores = item_scores[test_idx] 58 | test_similarity_scores = 
test_data.similarity[test_idx] 59 | test_item_labels = test_data.labels[test_idx] 60 | 61 | # Get scoring mechanism for this method. 62 | set_score_fn = uncertainty.NAME_TO_SCORE[method.get('scoring', 'none')] 63 | do_rejection = method.get('rejection', False) 64 | 65 | # Get pareto frontier for Pareto Testing. 66 | 67 | lambda_1 = utils.select_lambdas(opt_similarity_scores, max_lambdas=25) 68 | lambda_2 = utils.select_lambdas(opt_item_scores, max_lambdas=25) 69 | lambda_3 = utils.select_lambdas(set_score_fn(opt_item_scores), max_lambdas=25) 70 | 71 | def get_costs_for_lambdas(config): 72 | values = uncertainty.compute_values( 73 | config=config, 74 | item_scores=opt_item_scores, 75 | similarity_scores=opt_similarity_scores, 76 | item_labels=opt_item_labels, 77 | set_score_fn=set_score_fn, 78 | do_rejection=do_rejection) 79 | return (values['L_avg'], values['C_obj_avg']) 80 | 81 | configs = [] 82 | for config in (itertools.product(lambda_1, lambda_2, lambda_3)): 83 | configs.append(config) 84 | 85 | costs = p_tqdm.p_map(get_costs_for_lambdas, configs) 86 | 87 | configs = np.array(configs) 88 | costs = np.array(costs) 89 | 90 | is_efficient = np.ones(costs.shape[0], dtype=bool) 91 | for i, c in enumerate(costs): 92 | if is_efficient[i]: 93 | is_efficient[is_efficient] = np.any(costs[is_efficient] < c, axis=1) 94 | is_efficient[i] = True 95 | 96 | pareto_configs = configs[is_efficient] 97 | pareto_costs = costs[is_efficient] 98 | sort_idx = np.argsort(pareto_costs[:, 0]) 99 | pareto_costs = pareto_costs[sort_idx] 100 | ordered_configs = pareto_configs[sort_idx] 101 | 102 | configs = ordered_configs 103 | 104 | # Choose best valid configs (configs are already ordered). 105 | best_valid_configs = [[np.nan] * 3] * len(epsilons) 106 | is_stopped = [False] * len(epsilons) 107 | for config in configs: 108 | values = uncertainty.compute_values( 109 | config=config, 110 | item_scores=cal_item_scores, 111 | similarity_scores=cal_similarity_scores, 112 | item_labels=cal_item_labels, 113 | set_score_fn=set_score_fn, 114 | do_rejection=do_rejection) 115 | 116 | for j, epsilon in enumerate(epsilons): 117 | loss = values['L_avg'] 118 | n = len(cal_idx) 119 | p_value = binom.cdf(n * loss, n, epsilon) 120 | if p_value <= delta and not is_stopped[j]: 121 | best_valid_configs[j] = config 122 | else: 123 | is_stopped[j] = True 124 | 125 | 126 | output = dict( 127 | method=method, 128 | epsilons=epsilons, 129 | configs=np.array(best_valid_configs), 130 | cal_idx=cal_idx, 131 | opt_idx=opt_idx, 132 | test_idx=test_idx, 133 | ) 134 | 135 | np.savez("/tmp/results_fast.npz", **output) 136 | -------------------------------------------------------------------------------- /scripts/cnndm_components.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import json 4 | from tqdm.auto import tqdm 5 | 6 | from clm.components import ( 7 | get_oracle_size_for_C_inner, 8 | get_rouge_label_for_C_inner, 9 | get_rouge_with_refs, 10 | get_preds, 11 | get_random_preds, 12 | get_first_k_preds, 13 | ) 14 | 15 | for split in ["train", "val", "test"]: 16 | base = f"data/cnndm/{split}" 17 | out_dir = f"data/cnndm/{split}/components" 18 | 19 | with open(os.path.join(base, "row_rouge.jsonl")) as f: 20 | row_rouge_scores = [json.loads(line) for line in tqdm(f)] 21 | with open(os.path.join(base, "row_generation.jsonl")) as f: 22 | row_generation_idx_to_row_idx = [json.loads(line) for line in tqdm(f)] 23 | with open(os.path.join(base, "row_reference.jsonl")) as f: 24 | 
row_reference_idx_to_row_idx = [json.loads(line) for line in tqdm(f)] 25 | with open(os.path.join(base, "probs_scores.jsonl")) as f: 26 | prob_scores = [json.loads(line) for line in tqdm(f)] 27 | with open(os.path.join(base, "nli_scores.jsonl")) as f: 28 | nli_scores = [json.loads(line) for line in tqdm(f)] 29 | 30 | 31 | rouge_with_refs = get_rouge_with_refs(row_rouge_scores, row_reference_idx_to_row_idx) 32 | rouge_label = get_rouge_label_for_C_inner(rouge_with_refs) 33 | oracle_size = get_oracle_size_for_C_inner(rouge_with_refs) 34 | _, _, K = rouge_with_refs.shape 35 | nli_preds = get_preds(nli_scores, row_generation_idx_to_row_idx, K) 36 | probs_preds = get_preds(prob_scores, row_generation_idx_to_row_idx, K) 37 | random_preds = get_random_preds(row_generation_idx_to_row_idx, K) 38 | firstk_preds = get_first_k_preds(row_generation_idx_to_row_idx, K) 39 | 40 | mask = firstk_preds != -1 41 | assert (mask == (nli_preds != -1)).all() 42 | assert (mask == (probs_preds != -1)).all() 43 | assert (mask == (random_preds != -1)).all() 44 | 45 | os.makedirs(out_dir, exist_ok=True) 46 | np.save(os.path.join(out_dir, "rouge_with_refs.npy"), rouge_with_refs) 47 | np.save(os.path.join(out_dir, "rouge_label.npy"), rouge_label) 48 | np.save(os.path.join(out_dir, "oracle_size.npy"), oracle_size) 49 | np.save(os.path.join(out_dir, "nli_preds.npy"), nli_preds) 50 | np.save(os.path.join(out_dir, "probs_preds.npy"), probs_preds) 51 | np.save(os.path.join(out_dir, "random_preds.npy"), random_preds) 52 | np.save(os.path.join(out_dir, "firstk_preds.npy"), firstk_preds) 53 | np.save(os.path.join(out_dir, "mask.npy"), mask) 54 | -------------------------------------------------------------------------------- /scripts/cnndm_data.py: -------------------------------------------------------------------------------- 1 | """Split CNN/DM data.""" 2 | 3 | import argparse 4 | import os 5 | import json 6 | import numpy as np 7 | 8 | INPUT_DIR = '/Mounts/rbg-storage1/users/quach/outputs/uncertainty/xl_topp095_temp07' 9 | OUTPUT_DIR = 'data/cnndm' 10 | 11 | def read_jsonl(path, g_shuffle=None): 12 | """When g_shuffle is not None, permute along axis=1""" 13 | 14 | with open(path) as f: 15 | raw_data = [json.loads(line) for line in f] 16 | 17 | if g_shuffle is None: 18 | return raw_data 19 | 20 | data = [] 21 | for i, d in enumerate(raw_data): 22 | row = [] 23 | for j in range(len(d)): 24 | row.append(d[g_shuffle[i][j]]) 25 | data.append(row) 26 | 27 | return data 28 | 29 | def save_jsonl(dirname, name, data, indices): 30 | with open(os.path.join(dirname, name), 'w') as f: 31 | for i in indices: 32 | f.write(json.dumps(data[i]) + '\n') 33 | 34 | 35 | def main(args): 36 | np.random.seed(0) 37 | all_losses = np.load( 38 | os.path.join(args.input_dir, 'losses.npy'), allow_pickle=True) 39 | all_labels = all_losses <= args.loss_threshold 40 | all_probs = np.load( 41 | os.path.join(args.input_dir, 'scores.npy'), allow_pickle=True) 42 | all_diversity = np.load( 43 | os.path.join(args.input_dir, 'diversity_rouge_scores.npy'), allow_pickle=True) 44 | 45 | 46 | all_row_rouge = read_jsonl(os.path.join(args.input_dir, 'rouge_scores', 'row_rouge_scores.jsonl')) 47 | all_row_reference = read_jsonl(os.path.join(args.input_dir, 'rouge_scores', 'row_reference_idx_to_row_idx.jsonl')) 48 | 49 | # Shuffle examples. 
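# Examples are permuted once to define the train/val/test splits. Separately,
# g_shuffle (computed below) permutes the generations *within* each example:
# per-generation arrays are reindexed with take_along_axis, the pairwise
# diversity matrix is permuted along both of its generation axes so rows and
# columns stay aligned, and the JSONL component files are re-read with the
# same permutation via read_jsonl(..., g_shuffle).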
50 | e_shuffle = np.random.permutation(len(all_labels)) 51 | splits = { 52 | 'train': e_shuffle[:args.num_train], 53 | 'val': e_shuffle[args.num_train:args.num_train + args.num_val], 54 | 'test': e_shuffle[args.num_train + args.num_val:], 55 | } 56 | 57 | # Shuffle generations. 58 | g_shuffle = np.ones_like(all_labels).cumsum(axis=-1) - 1 59 | g_shuffle = g_shuffle.astype('int') 60 | g_shuffle = np.apply_along_axis( 61 | np.random.permutation, axis=1, arr=g_shuffle) 62 | 63 | all_labels = np.take_along_axis(all_labels, g_shuffle, axis=1) 64 | all_probs = np.take_along_axis(all_probs, g_shuffle, axis=1) 65 | 66 | # Permute in both dimensions. 67 | all_diversity = np.take_along_axis(all_diversity, g_shuffle[:, :, np.newaxis], axis=1) 68 | all_diversity = np.take_along_axis(all_diversity, g_shuffle[:, np.newaxis, :], axis=2) 69 | 70 | # Read and permute 71 | all_row_generation = read_jsonl(os.path.join(args.input_dir, 'rouge_scores', 'row_generation_idx_to_row_idx.jsonl'), g_shuffle) 72 | 73 | all_probs_scores = read_jsonl(os.path.join(args.input_dir, 'components', 'probs.jsonl'), g_shuffle) 74 | all_nli_scores = read_jsonl(os.path.join(args.input_dir, 'components', 'nli_nocontext.jsonl'), g_shuffle) 75 | 76 | os.makedirs(args.output_dir, exist_ok=True) 77 | np.save(os.path.join(args.output_dir, 'generation_idx.npy'), g_shuffle) 78 | np.savez(os.path.join(args.output_dir, 'splits.npz'), 79 | train=splits['train'], val=splits['val'], test=splits['test']) 80 | for split, idx in splits.items(): 81 | dirname = os.path.join(args.output_dir, split) 82 | os.makedirs(dirname, exist_ok=True) 83 | np.save(os.path.join(dirname, 'labels.npy'), all_labels[idx]) 84 | np.save(os.path.join(dirname, 'probs.npy'), all_probs[idx]) 85 | np.save(os.path.join(dirname, 'diversity.npy'), all_diversity[idx]) 86 | 87 | save_jsonl(dirname, 'row_rouge.jsonl', all_row_rouge, idx) 88 | save_jsonl(dirname, 'row_generation.jsonl', all_row_generation, idx) 89 | save_jsonl(dirname, 'row_reference.jsonl', all_row_reference, idx) 90 | save_jsonl(dirname, 'probs_scores.jsonl', all_probs_scores, idx) 91 | save_jsonl(dirname, 'nli_scores.jsonl', all_nli_scores, idx) 92 | 93 | 94 | if __name__ == '__main__': 95 | parser = argparse.ArgumentParser() 96 | parser.add_argument('--input_dir', type=str, default=INPUT_DIR) 97 | parser.add_argument('--output_dir', type=str, default=OUTPUT_DIR) 98 | parser.add_argument('--loss_threshold', type=float, default=0.65) 99 | parser.add_argument('--num_train', type=int, default=2000) 100 | parser.add_argument('--num_val', type=int, default=2000) 101 | args = parser.parse_args() 102 | main(args) 103 | -------------------------------------------------------------------------------- /scripts/cnndm_intra_rouge.py: -------------------------------------------------------------------------------- 1 | from p_tqdm import p_map 2 | 3 | import os 4 | import json 5 | import argparse 6 | import numpy as np 7 | from tqdm.auto import tqdm 8 | from rouge_score import rouge_scorer 9 | 10 | INPUT_DIR = '/Mounts/rbg-storage1/users/quach/outputs/uncertainty/xl_topp095_temp07' 11 | FILENAME = 'cnn_dailymail_v002-predict_with_aux_with_sent_splits_and_scores_and_nli.jsonl' 12 | 13 | parser = argparse.ArgumentParser(description='Compute ROUGE scores for sentences') 14 | parser.add_argument('--input_dir', type=str, default=INPUT_DIR) 15 | parser.add_argument('--filename', type=str, default=FILENAME) 16 | args = parser.parse_args() 17 | 18 | 19 | scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True) 20 | 21 | with 
open(os.path.join(args.input_dir, args.filename)) as f: 22 | # Must be global for p_map 23 | samples = [json.loads(line) for line in tqdm(f)] 24 | 25 | num_samples = len(samples) 26 | num_predictions = len(samples[0]['prediction']) 27 | 28 | print(f'num_samples: {num_samples}') 29 | print(f'num_predictions: {num_predictions}') 30 | 31 | def compute_rouge_scores(i): 32 | arr = np.zeros((num_predictions, num_predictions)) 33 | for k1 in range(num_predictions): 34 | for k2 in range(k1, num_predictions): 35 | score = scorer.score(samples[i]['prediction'][k1], samples[i]['prediction'][k2]) 36 | arr[k1, k2] = score['rougeL'].fmeasure 37 | arr[k2, k1] = arr[k1, k2] 38 | 39 | return arr 40 | 41 | all_scores = p_map(compute_rouge_scores, range(num_samples)) 42 | 43 | all_scores = np.array(all_scores) 44 | 45 | np.save(os.path.join(args.input_dir, "diversity_rouge_scores.npy"), all_scores) 46 | 47 | -------------------------------------------------------------------------------- /scripts/cxr_components.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import json 4 | from tqdm.auto import tqdm 5 | 6 | from clm.components import ( 7 | get_oracle_size_for_C_inner, 8 | get_rouge_label_for_C_inner, 9 | get_rouge_with_refs, 10 | get_preds, 11 | get_random_preds, 12 | get_first_k_preds, 13 | ) 14 | 15 | for split in ["train", "val", "test"]: 16 | base = f"data/cxr/{split}" 17 | out_dir = f"data/cxr/{split}/components" 18 | 19 | with open(os.path.join(base, "row_rouge.jsonl")) as f: 20 | row_rouge_scores = [json.loads(line) for line in tqdm(f)] 21 | with open(os.path.join(base, "row_generation.jsonl")) as f: 22 | row_generation_idx_to_row_idx = [json.loads(line) for line in tqdm(f)] 23 | with open(os.path.join(base, "row_reference.jsonl")) as f: 24 | row_reference_idx_to_row_idx = [json.loads(line) for line in tqdm(f)] 25 | with open(os.path.join(base, "image_component_scores.jsonl")) as f: 26 | image_scores = [json.loads(line) for line in tqdm(f)] 27 | with open(os.path.join(base, "normprob_component_scores.jsonl")) as f: 28 | prob_scores = [json.loads(line) for line in tqdm(f)] 29 | 30 | 31 | rouge_with_refs = get_rouge_with_refs(row_rouge_scores, row_reference_idx_to_row_idx) 32 | rouge_label = get_rouge_label_for_C_inner(rouge_with_refs) 33 | oracle_size = get_oracle_size_for_C_inner(rouge_with_refs) 34 | _, _, K = rouge_with_refs.shape 35 | 36 | image_preds = get_preds(image_scores, row_generation_idx_to_row_idx, K) 37 | probs_preds = get_preds(prob_scores, row_generation_idx_to_row_idx, K) 38 | random_preds = get_random_preds(row_generation_idx_to_row_idx, K) 39 | firstk_preds = get_first_k_preds(row_generation_idx_to_row_idx, K) 40 | 41 | mask = firstk_preds != -1 42 | assert (mask == (image_preds != -1)).all() 43 | assert (mask == (probs_preds != -1)).all() 44 | assert (mask == (random_preds != -1)).all() 45 | 46 | os.makedirs(out_dir, exist_ok=True) 47 | np.save(os.path.join(out_dir, "rouge_with_refs.npy"), rouge_with_refs) 48 | np.save(os.path.join(out_dir, "rouge_label.npy"), rouge_label) 49 | np.save(os.path.join(out_dir, "oracle_size.npy"), oracle_size) 50 | np.save(os.path.join(out_dir, "image_preds.npy"), image_preds) 51 | np.save(os.path.join(out_dir, "probs_preds.npy"), probs_preds) 52 | np.save(os.path.join(out_dir, "random_preds.npy"), random_preds) 53 | np.save(os.path.join(out_dir, "firstk_preds.npy"), firstk_preds) 54 | np.save(os.path.join(out_dir, "mask.npy"), mask) 55 | 
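A small optional check of the arrays written by the loop above (the paths below are assumed; adjust the split as needed): reload the saved component files and confirm that every score variant uses the same padding mask, mirroring the assertions made before saving.

```python
import os
import numpy as np

comp_dir = "data/cxr/test/components"  # assumed output directory from the loop above
mask = np.load(os.path.join(comp_dir, "mask.npy"))
for name in ["image_preds", "probs_preds", "random_preds", "firstk_preds"]:
    preds = np.load(os.path.join(comp_dir, name + ".npy"))
    # Entries equal to -1 are padding; every score variant should pad identically.
    assert ((preds != -1) == mask).all(), f"{name} disagrees with mask.npy"
```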
-------------------------------------------------------------------------------- /scripts/cxr_data.py: -------------------------------------------------------------------------------- 1 | """Split CXR data.""" 2 | 3 | import argparse 4 | import os 5 | import numpy as np 6 | import json 7 | 8 | INPUT_DIR = '/Mounts/rbg-storage1/users/quach/outputs/uncertainty/cxr2' 9 | OUTPUT_DIR = 'data/cxr' 10 | PREFIXES = [ 11 | 'calibration_2000_5000', 12 | 'valid_5000_8000', 13 | 'test_8000_13000', 14 | 'valid_scorer_13000_14000', 15 | ] 16 | 17 | 18 | def load_all(dirname, suffix): 19 | combined = [] 20 | for prefix in PREFIXES: 21 | filename = f'{prefix}_{suffix}.npy' 22 | combined.append(np.load(os.path.join(dirname, filename))) 23 | return np.concatenate(combined, axis=0) 24 | 25 | def load_diversity(dirname, name): 26 | combined = [] 27 | for prefix in PREFIXES: 28 | filename = f'diversity/{prefix}/{name}.npy' 29 | combined.append(np.load(os.path.join(dirname, filename))) 30 | return np.concatenate(combined, axis=0) 31 | 32 | def load_rouge(dirname, name): 33 | combined = [] 34 | for prefix in PREFIXES: 35 | filename = f'components/{prefix}/rouge_scores_42_62/{name}.jsonl' 36 | with open(os.path.join(dirname, filename)) as f: 37 | combined.extend([json.loads(line) for line in f]) 38 | return combined 39 | 40 | def load_component_scores(dirname, name): 41 | combined = [] 42 | for prefix in PREFIXES: 43 | filename = f'components/{prefix}/{name}.jsonl' 44 | with open(os.path.join(dirname, filename)) as f: 45 | combined.extend([json.loads(line) for line in f]) 46 | return combined 47 | 48 | def save_jsonl(dirname, name, data, indices): 49 | with open(os.path.join(dirname, name), 'w') as f: 50 | for i in indices: 51 | f.write(json.dumps(data[i]) + '\n') 52 | 53 | 54 | def main(args): 55 | np.random.seed(0) 56 | print("Loading data...") 57 | all_labels = load_all(args.input_dir, 'soft') <= args.loss_threshold 58 | all_probs = load_all(args.input_dir, 'normprob_scores') 59 | all_image = load_all(args.input_dir, 'image-report_scores') 60 | all_text = load_all(args.input_dir, 'text_scores') 61 | all_gnn = load_all(args.input_dir, 'gnn_scores') 62 | 63 | all_diversity_chexbert = load_diversity(args.input_dir, "chexbert_eq") 64 | all_diversity_rouge = load_diversity(args.input_dir, "rouge_scores") 65 | 66 | all_image_component_scores = load_component_scores(args.input_dir, "image_sentence_scores") 67 | all_normprob_component_scores = load_component_scores(args.input_dir, "normprob_sentence_scores") 68 | 69 | print("Reading rouge scores...") 70 | all_row_rouge = load_rouge(args.input_dir, "row_rouge_scores") 71 | all_row_generation = load_rouge(args.input_dir, "row_generation_idx_to_row_idx") 72 | all_row_reference = load_rouge(args.input_dir, "row_reference_idx_to_row_idx") 73 | 74 | if args.shuffle_file is not None: 75 | splits = np.load(args.shuffle_file) 76 | 77 | else: 78 | shuffle = np.random.permutation(len(all_labels)) 79 | splits = { 80 | 'train': shuffle[:args.num_train], 81 | 'val': shuffle[args.num_train:args.num_train + args.num_val], 82 | 'test': shuffle[args.num_train + args.num_val:], 83 | } 84 | 85 | os.makedirs(args.output_dir, exist_ok=True) 86 | np.savez(os.path.join(args.output_dir, 'splits.npz'), 87 | train=splits['train'], val=splits['val'], test=splits['test']) 88 | for split, idx in splits.items(): 89 | print(f"Saving {split}") 90 | dirname = os.path.join(args.output_dir, split) 91 | os.makedirs(dirname, exist_ok=True) 92 | np.save(os.path.join(dirname, 'idx.npy'), idx) 93 | 
np.save(os.path.join(dirname, 'labels.npy'), all_labels[idx]) 94 | np.save(os.path.join(dirname, 'probs.npy'), all_probs[idx]) 95 | np.save(os.path.join(dirname, 'image_report.npy'), all_image[idx]) 96 | np.save(os.path.join(dirname, 'report.npy'), all_text[idx]) 97 | np.save(os.path.join(dirname, 'gnn.npy'), all_gnn[idx]) 98 | np.save(os.path.join(dirname, 'diversity_chexbert.npy'), all_diversity_chexbert[idx]) 99 | np.save(os.path.join(dirname, 'diversity_rouge.npy'), all_diversity_rouge[idx]) 100 | save_jsonl(dirname, 'row_rouge.jsonl', all_row_rouge, idx) 101 | save_jsonl(dirname, 'row_generation.jsonl', all_row_generation, idx) 102 | save_jsonl(dirname, 'row_reference.jsonl', all_row_reference, idx) 103 | save_jsonl(dirname, 'image_component_scores.jsonl', all_image_component_scores, idx) 104 | save_jsonl(dirname, 'normprob_component_scores.jsonl', all_normprob_component_scores, idx) 105 | 106 | if __name__ == '__main__': 107 | parser = argparse.ArgumentParser() 108 | parser.add_argument('--input_dir', type=str, default=INPUT_DIR) 109 | parser.add_argument('--output_dir', type=str, default=OUTPUT_DIR) 110 | parser.add_argument('--shuffle_file', type=str, default=None) 111 | parser.add_argument('--loss_threshold', type=float, default=0.0) 112 | parser.add_argument('--num_train', type=int, default=2000) 113 | parser.add_argument('--num_val', type=int, default=2000) 114 | args = parser.parse_args() 115 | main(args) 116 | -------------------------------------------------------------------------------- /scripts/run_cnndm.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | SPLIT='test' 4 | 5 | for score in 'probs'; do 6 | python scripts/run_trials.py \ 7 | --train_score_file "data/cnndm/train/${score}.npy" \ 8 | --train_label_file "data/cnndm/train/labels.npy" \ 9 | --train_similarity_file "data/cnndm/train/diversity.npy" \ 10 | --test_score_file "data/cnndm/${SPLIT}/${score}.npy" \ 11 | --test_label_file "data/cnndm/${SPLIT}/labels.npy" \ 12 | --test_similarity_file "data/cnndm/${SPLIT}/diversity.npy" \ 13 | --output_file "results/cnndm/${SPLIT}/${score}_results.npz" 14 | done 15 | -------------------------------------------------------------------------------- /scripts/run_cnndm_components.sh: -------------------------------------------------------------------------------- 1 | python scripts/run_many_components.py \ 2 | --task cnndm \ 3 | --split test \ 4 | --scores nli probs random firstk \ 5 | --rouge_threshold 0.4 6 | 7 | -------------------------------------------------------------------------------- /scripts/run_component_trials.py: -------------------------------------------------------------------------------- 1 | """Script to generate outputs.""" 2 | 3 | import argparse 4 | import os 5 | import numpy as np 6 | 7 | from clm import components 8 | from clm import utils 9 | 10 | 11 | def main(args): 12 | utils.set_seed(0) 13 | 14 | # Load dataset.
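# The score, rouge, and report-label files passed in below are produced by
# scripts/cnndm_components.py and scripts/cxr_components.py; this script is
# normally launched with the matching paths by scripts/run_many_components.py.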
15 | train_scores = np.load(args.train_score_file) 16 | train_rouge = np.load(args.train_rouge_file) 17 | train_report_labels = np.load(args.train_report_labels_file) 18 | train_data = utils.ComponentDataset(scores=train_scores, rouge_with_refs=train_rouge, report_labels=train_report_labels) 19 | 20 | test_scores = np.load(args.test_score_file) 21 | test_rouge = np.load(args.test_rouge_file) 22 | test_report_labels = np.load(args.test_report_labels_file) 23 | test_data = utils.ComponentDataset(scores=test_scores, rouge_with_refs=test_rouge, report_labels=test_report_labels) 24 | 25 | epsilons, results = components.run_trials( 26 | train_data=train_data, 27 | test_data=test_data, 28 | p_cal=args.p_cal, 29 | num_trials=args.num_trials, 30 | filter_for_answerable=args.filter_for_answerable, 31 | rouge_threshold=args.rouge_threshold, 32 | scale_type=args.scale_type, 33 | ) 34 | 35 | os.makedirs(os.path.dirname(args.output_file), exist_ok=True) 36 | np.savez(args.output_file, epsilons=epsilons, results=results) 37 | 38 | 39 | if __name__ == '__main__': 40 | parser = argparse.ArgumentParser() 41 | parser.add_argument('--train_score_file', type=str) 42 | parser.add_argument('--train_rouge_file', type=str) 43 | parser.add_argument('--train_report_labels_file', type=str) 44 | 45 | parser.add_argument('--test_score_file', type=str) 46 | parser.add_argument('--test_rouge_file', type=str) 47 | parser.add_argument('--test_report_labels_file', type=str) 48 | 49 | parser.add_argument('--output_file', type=str) 50 | parser.add_argument('--p_cal', type=float, default=0.5) 51 | parser.add_argument('--delta', type=float, default=0.05) 52 | parser.add_argument('--num_trials', type=int, default=100) 53 | 54 | parser.add_argument('--filter_for_answerable', type=bool, default=False) 55 | parser.add_argument('--rouge_threshold', type=float, default=0.4) 56 | parser.add_argument('--scale_type', type=str, default='none') 57 | 58 | args = parser.parse_args() 59 | main(args) 60 | -------------------------------------------------------------------------------- /scripts/run_cxr.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | SPLIT='test' 4 | 5 | for score in 'probs'; do 6 | python scripts/run_trials.py \ 7 | --train_score_file "data/cxr/train/${score}.npy" \ 8 | --train_label_file "data/cxr/train/labels.npy" \ 9 | --train_similarity_file "data/cxr/train/diversity_rouge.npy" \ 10 | --test_score_file "data/cxr/${SPLIT}/${score}.npy" \ 11 | --test_label_file "data/cxr/${SPLIT}/labels.npy" \ 12 | --test_similarity_file "data/cxr/${SPLIT}/diversity_rouge.npy" \ 13 | --output_file "results/cxr/${SPLIT}/${score}_results.npz" 14 | done 15 | -------------------------------------------------------------------------------- /scripts/run_cxr_components.sh: -------------------------------------------------------------------------------- 1 | python scripts/run_many_components.py \ 2 | --task cxr \ 3 | --split test \ 4 | --scores image probs random firstk \ 5 | --rouge_threshold 0.4 6 | 7 | -------------------------------------------------------------------------------- /scripts/run_example.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import json 4 | from tqdm.auto import tqdm 5 | import matplotlib.pyplot as plt 6 | 7 | from clm.components import compute_values_for_C_inner, compute_values_for_C_inner_basic 8 | 9 | base = "data/cxr/val" 10 | 11 | rouge_label = np.load(os.path.join(base, "components", "rouge_label.npy")) 12 | rouge_with_refs = np.load(os.path.join(base, "components", "rouge_with_refs.npy")) 13 | image_preds = np.load(os.path.join(base, "components", "image_preds.npy")) 14 | mask = np.load(os.path.join(base, "components", "mask.npy")) 15 | oracle_size = np.load(os.path.join(base, "components", "oracle_size.npy")) 16 | 17 | # New version 18 | 19 | results = compute_values_for_C_inner_basic(image_preds, rouge_label, mask, oracle_size, rouge_threshold=0.4) 20 | 21 | fig, ax = plt.subplots(1, 1) 22 | ax.plot(results['L_avg'], results['fake_recall'], marker=".") 23 | ax.set_xlabel("$1 - \mathbb{P}(C_{inner} \subset C^*)$") 24 | ax.set_ylabel("$\mathbb{E}[|C_{inner}\cap C^*| / |C^*|]$") 25 | ax.set_xlim(0, 0.9) 26 | ax.set_ylim(0, 2) 27 | ax.set_title(f"C_inner (fake recall), with rougeL_threshold = 0.4") 28 | plt.show() 29 | 30 | # Old version 31 | 32 | results = compute_values_for_C_inner(image_preds, rouge_with_refs, rouge_threshold=0.4) 33 | 34 | fig, ax = plt.subplots(1, 1) 35 | ax.plot(results['L_avg'], results['fake_recall'], marker=".") 36 | ax.set_xlabel("$1 - \mathbb{P}(C_{inner} \subset C^*)$") 37 | ax.set_ylabel("$\mathbb{E}[|C_{inner}\cap C^*| / |C^*|]$") 38 | ax.set_xlim(0, 0.9) 39 | ax.set_ylim(0, 2) 40 | ax.set_title(f"C_inner (fake recall), with rougeL_threshold = 0.4") 41 | plt.show() 42 | -------------------------------------------------------------------------------- /scripts/run_many_components.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import os 3 | import multiprocessing 4 | import subprocess 5 | import numpy as np 6 | import datetime 7 | import argparse 8 | 9 | NUM_TRIALS = 100 10 | TEMP_DIR = '/tmp/clm/' + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S') 11 | 12 | def get_filename(task, score, split, filter_for_answerable, rouge_threshold, idx_trial): 13 | return os.path.join(TEMP_DIR, f'{task}_{score}_{split}_{filter_for_answerable}_{rouge_threshold}_{idx_trial}.npz') 14 | 15 | def run_component_trials(task, score, split, filter_for_answerable, rouge_threshold, idx_trial, batch_size): 16 | filename = 
get_filename(task, score, split, filter_for_answerable, rouge_threshold, idx_trial) 17 | 18 | command = [ 19 | 'python', 'scripts/run_component_trials.py', 20 | '--train_score_file', f'data/{task}/train/components/{score}_preds.npy', 21 | '--train_rouge_file', f'data/{task}/train/components/rouge_with_refs.npy', 22 | '--train_report_labels_file', f'data/{task}/train/labels.npy', 23 | '--test_score_file', f'data/{task}/{split}/components/{score}_preds.npy', 24 | '--test_rouge_file', f'data/{task}/{split}/components/rouge_with_refs.npy', 25 | '--test_report_labels_file', f'data/{task}/{split}/labels.npy', 26 | '--rouge_threshold', str(rouge_threshold), 27 | '--num_trials', str(batch_size), 28 | '--filter_for_answerable', str(filter_for_answerable), 29 | '--output_file', filename, 30 | ] 31 | subprocess.run(command) 32 | 33 | if __name__ == '__main__': 34 | parser = argparse.ArgumentParser() 35 | parser.add_argument('--task', type=str, default='cxr') 36 | parser.add_argument('--split', type=str, default='test') 37 | parser.add_argument('--scores', nargs='+', default=['image', 'probs', 'random', 'firstk']) 38 | parser.add_argument('--filter_for_answerable', nargs='+', type=int, default=[False, True]) 39 | parser.add_argument('--rouge_threshold', type=float, default=0.4) 40 | parser.add_argument('--batch_size', type=int, default=10) 41 | parser.add_argument('--num_trials', type=int, default=NUM_TRIALS) 42 | parser.add_argument('--results_base', type=str, default="results") 43 | args = parser.parse_args() 44 | 45 | num_runs = args.num_trials // args.batch_size 46 | 47 | pool = multiprocessing.Pool(processes=multiprocessing.cpu_count()) 48 | for score in args.scores: 49 | for filter_flag in args.filter_for_answerable: 50 | for idx_trial in range(num_runs): 51 | pool.apply_async(run_component_trials, args=(args.task, score, args.split, filter_flag, args.rouge_threshold, idx_trial, args.batch_size)) 52 | 53 | pool.close() 54 | pool.join() 55 | 56 | for score in args.scores: 57 | for filter_flag in args.filter_for_answerable: 58 | result_dir = f'{args.results_base}/{args.task}/{args.split}/filter_{filter_flag}/rouge_{args.rouge_threshold}/' 59 | os.makedirs(result_dir, exist_ok=True) 60 | 61 | combined = defaultdict(list) 62 | stacked_results = {} 63 | epsilons = None 64 | 65 | for idx_trial in range(num_runs): 66 | filename = get_filename(args.task, score, args.split, filter_flag, args.rouge_threshold, idx_trial) 67 | output = np.load(filename, allow_pickle=True) 68 | results = output['results'].item() 69 | epsilons = output['epsilons'] 70 | 71 | for k, v in results.items(): 72 | combined[k].append(v) 73 | 74 | for k, v in combined.items(): 75 | stacked_results[k] = np.concatenate(v, axis=0) 76 | 77 | np.savez(f'{result_dir}/{score}_components.npz', epsilons=epsilons, results=stacked_results) 78 | -------------------------------------------------------------------------------- /scripts/run_trials.py: -------------------------------------------------------------------------------- 1 | """Script to generate outputs.""" 2 | 3 | import argparse 4 | import os 5 | import numpy as np 6 | 7 | from clm import uncertainty 8 | from clm import utils 9 | 10 | 11 | def main(args): 12 | utils.set_seed(0) 13 | 14 | # Load dataset. 
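# The probs/labels/diversity files passed in below are written by the data
# splitting scripts (scripts/cnndm_data.py, scripts/cxr_data.py,
# scripts/triviaqa_data.py); the run_cnndm.sh, run_cxr.sh, and run_triviaqa.sh
# wrappers call this script with the corresponding paths.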
15 | train_scores = np.load(args.train_score_file) 16 | train_labels = np.load(args.train_label_file) 17 | train_similarity = np.load(args.train_similarity_file) 18 | train_data = utils.Dataset(train_scores, train_similarity, train_labels) 19 | 20 | test_scores = np.load(args.test_score_file) 21 | test_labels = np.load(args.test_label_file) 22 | test_similarity = np.load(args.test_similarity_file) 23 | test_data = utils.Dataset(test_scores, test_similarity, test_labels) 24 | 25 | methods, epsilons, results = uncertainty.run_trials( 26 | train_data=train_data, 27 | test_data=test_data, 28 | p_cal=args.p_cal, 29 | num_trials=args.num_trials, 30 | num_processes=args.num_processes) 31 | 32 | os.makedirs(os.path.dirname(args.output_file), exist_ok=True) 33 | np.savez( 34 | args.output_file, methods=methods, epsilons=epsilons, results=results) 35 | 36 | 37 | if __name__ == '__main__': 38 | parser = argparse.ArgumentParser() 39 | parser.add_argument('--train_score_file', type=str) 40 | parser.add_argument('--train_label_file', type=str) 41 | parser.add_argument('--train_similarity_file', type=str) 42 | parser.add_argument('--test_score_file', type=str) 43 | parser.add_argument('--test_label_file', type=str) 44 | parser.add_argument('--test_similarity_file', type=str) 45 | parser.add_argument('--output_file', type=str) 46 | parser.add_argument('--p_cal', type=float, default=0.5) 47 | parser.add_argument('--delta', type=float, default=0.05) 48 | parser.add_argument('--num_trials', type=int, default=100) 49 | parser.add_argument('--num_processes', type=int, default=40) 50 | args = parser.parse_args() 51 | main(args) 52 | -------------------------------------------------------------------------------- /scripts/run_triviaqa.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | SPLIT='test' 4 | 5 | for score in 'probs'; do 6 | python scripts/run_trials.py \ 7 | --train_score_file "data/triviaqa/train/${score}.npy" \ 8 | --train_label_file "data/triviaqa/train/labels.npy" \ 9 | --train_similarity_file "data/triviaqa/train/diversity.npy" \ 10 | --test_score_file "data/triviaqa/${SPLIT}/${score}.npy" \ 11 | --test_label_file "data/triviaqa/${SPLIT}/labels.npy" \ 12 | --test_similarity_file "data/triviaqa/${SPLIT}/diversity.npy" \ 13 | --output_file "results/triviaqa/${SPLIT}/${score}_results.npz" 14 | done 15 | -------------------------------------------------------------------------------- /scripts/triviaqa_data.py: -------------------------------------------------------------------------------- 1 | """Split TriviaQA data.""" 2 | 3 | import argparse 4 | import os 5 | import numpy as np 6 | 7 | INPUT_DIR = '/Mounts/rbg-storage1/users/quach/outputs/uncertainty/triviaqa' 8 | OUTPUT_DIR = 'data/triviaqa' 9 | 10 | 11 | def main(args): 12 | np.random.seed(0) 13 | all_labels = 1 - np.load(os.path.join(args.input_dir, 'all_losses.npy')) 14 | all_probs = np.load(os.path.join(args.input_dir, 'all_prob_scores.npy')) 15 | all_self_eval = np.load(os.path.join(args.input_dir, 'all_self_eval.npy')) 16 | diversity = np.load(os.path.join(args.input_dir, 'diversity.npy')) 17 | 18 | shuffle = np.random.permutation(len(all_labels)) 19 | splits = { 20 | 'train': shuffle[:args.num_train], 21 | 'val': shuffle[args.num_train:args.num_train + args.num_val], 22 | 'test': shuffle[args.num_train + args.num_val:], 23 | } 24 | 25 | os.makedirs(args.output_dir, exist_ok=True) 26 | np.savez(os.path.join(args.output_dir, 'splits.npz'), 27 | train=splits['train'], val=splits['val'], test=splits['test']) 28 | for split, idx in splits.items(): 29 | dirname = os.path.join(args.output_dir, split) 30 | os.makedirs(dirname, exist_ok=True) 31 | np.save(os.path.join(dirname, 'idx.npy'), idx) 32 | np.save(os.path.join(dirname, 'labels.npy'), all_labels[idx]) 33 | np.save(os.path.join(dirname, 'probs.npy'), all_probs[idx]) 34 | np.save(os.path.join(dirname, 'self_eval.npy'), all_self_eval[idx]) 35 | np.save(os.path.join(dirname, 'diversity.npy'), diversity[idx]) 36 | 37 | 38 | if __name__ == '__main__': 39 | parser = argparse.ArgumentParser() 40 | parser.add_argument('--input_dir', type=str, default=INPUT_DIR) 41 | parser.add_argument('--output_dir', type=str, default=OUTPUT_DIR) 42 | parser.add_argument('--num_train', type=int, default=2000) 43 | parser.add_argument('--num_val', type=int, default=2000) 44 | args = parser.parse_args() 45 | main(args) 46 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='clm', 5 | python_requires='>=3.7', 6 | packages=find_packages(exclude=('data', 'results')), 7 | ) 8 | --------------------------------------------------------------------------------
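Once the package is installed (e.g., `pip install -e .` against the setup.py above) and one of the shell scripts has run, the saved archives can be inspected directly. A minimal sketch, assuming scripts/run_cxr.sh has written results/cxr/test/probs_results.npz; the exact structure of the stored results object depends on clm.uncertainty.run_trials:

```python
import numpy as np

# Assumed path: produced by scripts/run_cxr.sh via scripts/run_trials.py.
output = np.load("results/cxr/test/probs_results.npz", allow_pickle=True)
print(output.files)          # e.g. ['methods', 'epsilons', 'results']
print(output["epsilons"])    # risk levels epsilon used during calibration
results = output["results"]  # per-method, per-trial metrics (object array)
```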