├── figures ├── trivia_qa │ ├── 30b_results.pdf │ ├── accuracy_versus_model_size.pdf │ ├── ln_predictive_entropy_auroc.pdf │ ├── temperature_comparisons_trivia_qa.pdf │ ├── ln_predictive_entropy_auroc_triviaqa.pdf │ └── ln_predictive_entropy_auroc_triviaqa_with_margin.pdf ├── auroc_as_function_of_n_samples.pdf └── coqa │ ├── accuracy_versus_model_size.pdf │ ├── temperature_comparisons_coqa.pdf │ ├── ln_predictive_entropy_auroc_coqa.pdf │ └── ln_predictive_entropy_auroc_coqa_with_margin.pdf ├── code ├── run_pipeline.sh ├── config.py ├── clean_generated_strings.py ├── parse_coqa.py ├── parse_triviaqa.py ├── get_prompting_based_uncertainty.py ├── environment.yml ├── get_semantic_similarities.py ├── get_likelihoods.py ├── compute_confidence_measure.py ├── generate.py └── analyze_results.py ├── LICENSE ├── .gitignore └── README.md /figures/trivia_qa/30b_results.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lorenzkuhn/semantic_uncertainty/HEAD/figures/trivia_qa/30b_results.pdf -------------------------------------------------------------------------------- /figures/auroc_as_function_of_n_samples.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lorenzkuhn/semantic_uncertainty/HEAD/figures/auroc_as_function_of_n_samples.pdf -------------------------------------------------------------------------------- /figures/coqa/accuracy_versus_model_size.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lorenzkuhn/semantic_uncertainty/HEAD/figures/coqa/accuracy_versus_model_size.pdf -------------------------------------------------------------------------------- /figures/coqa/temperature_comparisons_coqa.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lorenzkuhn/semantic_uncertainty/HEAD/figures/coqa/temperature_comparisons_coqa.pdf -------------------------------------------------------------------------------- /figures/trivia_qa/accuracy_versus_model_size.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lorenzkuhn/semantic_uncertainty/HEAD/figures/trivia_qa/accuracy_versus_model_size.pdf -------------------------------------------------------------------------------- /figures/coqa/ln_predictive_entropy_auroc_coqa.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lorenzkuhn/semantic_uncertainty/HEAD/figures/coqa/ln_predictive_entropy_auroc_coqa.pdf -------------------------------------------------------------------------------- /figures/trivia_qa/ln_predictive_entropy_auroc.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lorenzkuhn/semantic_uncertainty/HEAD/figures/trivia_qa/ln_predictive_entropy_auroc.pdf -------------------------------------------------------------------------------- /figures/trivia_qa/temperature_comparisons_trivia_qa.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lorenzkuhn/semantic_uncertainty/HEAD/figures/trivia_qa/temperature_comparisons_trivia_qa.pdf -------------------------------------------------------------------------------- /figures/trivia_qa/ln_predictive_entropy_auroc_triviaqa.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/lorenzkuhn/semantic_uncertainty/HEAD/figures/trivia_qa/ln_predictive_entropy_auroc_triviaqa.pdf -------------------------------------------------------------------------------- /figures/coqa/ln_predictive_entropy_auroc_coqa_with_margin.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lorenzkuhn/semantic_uncertainty/HEAD/figures/coqa/ln_predictive_entropy_auroc_coqa_with_margin.pdf -------------------------------------------------------------------------------- /figures/trivia_qa/ln_predictive_entropy_auroc_triviaqa_with_margin.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lorenzkuhn/semantic_uncertainty/HEAD/figures/trivia_qa/ln_predictive_entropy_auroc_triviaqa_with_margin.pdf -------------------------------------------------------------------------------- /code/run_pipeline.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --cpus-per-task=24 3 | #SBATCH --gres=gpu:a100:1 4 | #SBATCH --job-name="nlg_uncertainty" 5 | `` 6 | 7 | 8 | run_id=`python -c "import wandb; run_id = wandb.util.generate_id(); wandb.init(project='nlg_uncertainty', id=run_id); print(run_id)"` 9 | 10 | model='opt-350m' 11 | srun python generate.py --num_generations_per_prompt='5' --model=$model --fraction_of_data_to_use='0.02' --run_id=$run_id --temperature='0.5' --num_beams='1' --top_p='1.0'; srun python clean_generated_strings.py --generation_model=$model --run_id=$run_id; python get_semantic_similarities.py --generation_model=$model --run_id=$run_id; python get_likelihoods.py --evaluation_model=$model --generation_model=$model --run_id=$run_id; srun python get_prompting_based_uncertainty.py --run_id_for_few_shot_prompt=$run_id --run_id_for_evaluation=$run_id; python compute_confidence_measure.py --generation_model=$model --evaluation_model=$model --run_id=$run_id 12 | 13 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Lorenz Kuhn 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /code/config.py: -------------------------------------------------------------------------------- 1 | device_map = { 2 | 'model.decoder.embed_tokens': 0, 3 | 'model.decoder.embed_positions': 0, 4 | 'model.decoder.layers.0': 0, 5 | 'model.decoder.layers.1': 0, 6 | 'model.decoder.layers.2': 0, 7 | 'model.decoder.layers.3': 0, 8 | 'model.decoder.layers.4': 0, 9 | 'model.decoder.layers.5': 0, 10 | 'model.decoder.layers.6': 0, 11 | 'model.decoder.layers.7': 0, 12 | 'model.decoder.layers.8': 0, 13 | 'model.decoder.layers.9': 0, 14 | 'model.decoder.layers.10': 0, 15 | 'model.decoder.layers.11': 0, 16 | 'model.decoder.layers.12': 0, 17 | 'model.decoder.layers.13': 0, 18 | 'model.decoder.layers.14': 0, 19 | 'model.decoder.layers.15': 0, 20 | 'model.decoder.layers.16': 0, 21 | 'model.decoder.layers.17': 0, 22 | 'model.decoder.layers.18': 0, 23 | 'model.decoder.layers.19': 0, 24 | 'model.decoder.layers.20': 0, 25 | 'model.decoder.layers.21': 0, 26 | 'model.decoder.layers.22': 0, 27 | 'model.decoder.layers.23': 0, 28 | 'model.decoder.layers.24': 0, 29 | 'model.decoder.layers.25': 1, 30 | 'model.decoder.layers.26': 1, 31 | 'model.decoder.layers.27': 1, 32 | 'model.decoder.layers.28': 1, 33 | 'model.decoder.layers.29': 1, 34 | 'model.decoder.layers.30': 1, 35 | 'model.decoder.layers.31': 1, 36 | 'model.decoder.layers.32': 1, 37 | 'model.decoder.layers.33': 1, 38 | 'model.decoder.layers.34': 1, 39 | 'model.decoder.layers.35': 1, 40 | 'model.decoder.layers.36': 1, 41 | 'model.decoder.layers.37': 1, 42 | 'model.decoder.layers.38': 1, 43 | 'model.decoder.layers.39': 1, 44 | 'model.decoder.layers.40': 1, 45 | 'model.decoder.layers.41': 1, 46 | 'model.decoder.layers.42': 1, 47 | 'model.decoder.layers.43': 1, 48 | 'model.decoder.layers.44': 1, 49 | 'model.decoder.layers.45': 1, 50 | 'model.decoder.layers.46': 1, 51 | 'model.decoder.layers.47': 1, 52 | 'model.decoder.layers.48': 1, 53 | 'model.decoder.final_layer_norm': 1, 54 | 'lm_head': 1 55 | } 56 | 57 | data_dir = '' 58 | hf_datasets_cache = '' 59 | output_dir = '' 60 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | *.out 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # pipenv 89 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 90 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 91 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 92 | # install all needed dependencies. 93 | #Pipfile.lock 94 | 95 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 96 | __pypackages__/ 97 | 98 | # Celery stuff 99 | celerybeat-schedule 100 | celerybeat.pid 101 | 102 | # SageMath parsed files 103 | *.sage.py 104 | 105 | # Environments 106 | .env 107 | .venv 108 | env/ 109 | venv/ 110 | ENV/ 111 | env.bak/ 112 | venv.bak/ 113 | 114 | # Spyder project settings 115 | .spyderproject 116 | .spyproject 117 | 118 | # Rope project settings 119 | .ropeproject 120 | 121 | # mkdocs documentation 122 | /site 123 | 124 | # mypy 125 | .mypy_cache/ 126 | .dmypy.json 127 | dmypy.json 128 | 129 | # Pyre type checker 130 | .pyre/ 131 | -------------------------------------------------------------------------------- /code/clean_generated_strings.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import pickle 4 | import random 5 | 6 | import numpy as np 7 | import torch 8 | from tqdm import tqdm 9 | from transformers import AutoTokenizer 10 | 11 | import config 12 | import wandb 13 | 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument('--generation_model', type=str, default='opt-350m') 16 | parser.add_argument('--run_id', type=str, default='run_1') 17 | args = parser.parse_args() 18 | 19 | device = 'cuda' 20 | 21 | # Set a seed value 22 | seed_value = 10 23 | # 1. Set `PYTHONHASHSEED` environment variable at a fixed value 24 | 25 | os.environ['PYTHONHASHSEED'] = str(seed_value) 26 | # 2. Set `python` built-in pseudo-random generator at a fixed value 27 | 28 | random.seed(seed_value) 29 | # 3. 
Set `numpy` pseudo-random generator at a fixed value 30 | 31 | np.random.seed(seed_value) 32 | 33 | #Fix torch random seed 34 | torch.manual_seed(seed_value) 35 | 36 | os.environ["HF_DATASETS_CACHE"] = config.hf_datasets_cache 37 | 38 | generation_tokenizer = AutoTokenizer.from_pretrained(f"facebook/opt-350m", use_fast=False, cache_dir=config.data_dir) 39 | 40 | wandb.init(project='nlg_uncertainty', id=args.run_id, config=args, resume='allow') 41 | 42 | run_name = wandb.run.name 43 | 44 | tokenizer = AutoTokenizer.from_pretrained(f"facebook/opt-350m", use_fast=False, cache_dir=config.data_dir) 45 | 46 | with open(f'{config.output_dir}/{run_name}/{args.generation_model}_generations.pkl', 'rb') as infile: 47 | sequences = pickle.load(infile) 48 | 49 | cleaned_sequences = [] 50 | 51 | for sample in tqdm(sequences): 52 | cleaned_generations = torch.ones_like(sample['generations']) 53 | question = sample['question'] 54 | generated_texts = sample['generated_texts'] 55 | cleaned_generated_texts = [] 56 | 57 | max_len_of_generations = cleaned_generations.shape[-1] 58 | 59 | strings_to_filter_on = [ 60 | '.', '\n', 'Q:', 'A:', 'question:', 'answer:', 'Question:', 'Answer:', 'Questions:', 'questions:', 'QUESTION:', 61 | 'ANSWER:' 62 | ] 63 | 64 | for i, generated_text in enumerate(generated_texts): 65 | for string in strings_to_filter_on: 66 | if string in generated_text: 67 | generated_text = generated_text.split(string)[0] 68 | cleaned_generated_texts.append(generated_text) 69 | clean_ids = torch.cat( 70 | [sample['prompt'].to(device), 71 | torch.tensor(tokenizer(generated_text)['input_ids'][1:], device=device)]) 72 | cleaned_generations[i, :min(len(clean_ids), max_len_of_generations)] = clean_ids[:max_len_of_generations] 73 | 74 | sample['cleaned_generated_texts'] = cleaned_generated_texts 75 | sample['cleaned_generations'] = cleaned_generations 76 | cleaned_sequences.append(sample) 77 | 78 | with open(f'{config.output_dir}/{run_name}/{args.generation_model}_generations.pkl', 'wb') as outfile: 79 | pickle.dump(cleaned_sequences, outfile) 80 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | *** 2 | 3 | # June 2024 — [A new and improved implementation of Semantic Uncertainty is available](https://github.com/jlko/semantic_uncertainty), this repo is deprecated 4 | We're excited to share a [new implementation of semantic uncertainty](https://github.com/jlko/semantic_uncertainty) which corresponds to our [2024 Nature paper](https://www.nature.com/articles/s41586-024-07421-0) _Detecting Hallucinations in Large Language Models Using Semantic Entropy_. Please use the new and improved version; we are deprecating this repository. Thank you for your interest! 5 | 6 | This repository contains the code for our 2023 ICLR paper _Semantic Uncertainty: Linguistic Invariances for Uncertainty Estimation in Natural Language Generation_.
7 | 8 | *** 9 | 10 | 11 | ### Semantic Uncertainty: Linguistic Invariances for Uncertainty Estimation in Natural Language Generation 12 | 13 | ![image](https://user-images.githubusercontent.com/9898136/223775961-7f9525fc-9674-4bf4-b15f-d49487daddca.png) 14 | 15 | # Overview 16 | 17 | This repository contains the code used in Semantic Uncertainty: Linguistic Invariances for Uncertainty Estimation in Natural Language Generation ([arXiv](https://arxiv.org/abs/2302.09664)). 18 | 19 | `run_pipeline.sh` is a Slurm batch script that executes all steps of our pipeline. `sbatch run_pipeline.sh` submits the batch script. 20 | 21 | ### Preprocessing & Config 22 | 23 | `parse_triviaqa.py` and `parse_coqa.py` load TriviaQA and CoQA from HuggingFace, tokenize them, and store the data sets. These scripts only have to be run once. 24 | 25 | You'll also have to set the paths where you would like to store intermediate and final results of the pipeline in `config.py`. 26 | 27 | The `environment.yml` lists the dependencies of the conda environment we used for our experiments. 28 | 29 | ### Generating answers and computing uncertainty measures 30 | 31 | The components of our pipeline are: 32 | 33 | * `generate.py` generates a number of answers for a subset of questions of a given data set. This step also evaluates the question-answering accuracy of the generated answers. 34 | * `clean_generated_strings.py` post-processes the generations from the first step, mainly by removing any unwanted trailing text, e.g. in cases where the model first gives the answer to the given question and then generates an additional question. 35 | * `get_semantic_similarities.py` identifies semantic clusters in the generated set of answers from the previous step. 36 | * `get_prompting_based_uncertainty.py` computes the p(True) baseline. 37 | * `get_likelihoods.py` computes the likelihoods of the generated answers under the generating model. 38 | * `compute_confidence_measure.py` computes a range of different confidence/uncertainty measures, such as semantic entropy, predictive entropy, lexical similarity, and p(True); see the sketch at the end of this README. 39 | 40 | ### Analyzing results 41 | 42 | After running the pipeline, use `analyze_results.py` to compute performance metrics, such as the AUROC. 43 | 44 | ### Hardware requirements 45 | 46 | Most model runs should run with at most 40GB of GPU memory. The exceptions are the experiments on OPT-30B, which we run on two 80GB A100s. 47 | 48 | ### Dependencies 49 | 50 | Our implementation uses PyTorch and HuggingFace. We use `wandb` to track our runs.
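### Semantic entropy in a nutshell

As a quick orientation, below is a minimal, self-contained sketch of the idea behind the semantic entropy computed in `compute_confidence_measure.py`: answers are first grouped into semantic clusters (see `get_semantic_similarities.py`), the probability mass of answers within a cluster is pooled, and the entropy is then estimated over clusters rather than over individual strings. The function and variable names are illustrative only, and the sketch leaves out details of the real pipeline such as the constant log-likelihood shift and the handling of multiple evaluation models.

```python
import torch

def semantic_entropy(log_likelihoods: torch.Tensor, semantic_set_ids: torch.Tensor) -> torch.Tensor:
    """Illustrative Monte-Carlo estimate of the entropy over semantic clusters."""
    cluster_log_likelihoods = []
    for set_id in torch.unique(semantic_set_ids):
        # Pool the probability mass of all sampled answers that share a meaning.
        members = log_likelihoods[semantic_set_ids == set_id]
        cluster_log_likelihoods.append(torch.logsumexp(members, dim=0))
    cluster_log_likelihoods = torch.stack(cluster_log_likelihoods)
    # Average negative cluster log-likelihood across the observed clusters.
    return -cluster_log_likelihoods.mean()

# Three sampled answers; the first two are paraphrases and share cluster id 0.
log_likelihoods = torch.tensor([-1.2, -1.5, -4.0])   # log p(answer | question), length-normalised
semantic_set_ids = torch.tensor([0, 0, 1])           # cluster ids from the NLI step
print(semantic_entropy(log_likelihoods, semantic_set_ids))
```

Because paraphrases share a cluster, rephrasings of the same answer do not inflate the estimate, which is the main difference to the standard predictive entropy baseline.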
environment 51 | -------------------------------------------------------------------------------- /code/parse_coqa.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import evaluate 4 | import pandas as pd 5 | import torch 6 | from datasets import Dataset 7 | from transformers import AutoModelForSequenceClassification, AutoTokenizer 8 | 9 | import config 10 | 11 | with open(f'{config.data_dir}/coqa-dev-v1.0.json', 'r') as infile: 12 | data = json.load(infile)['data'] 13 | 14 | rouge = evaluate.load('rouge') 15 | 16 | tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-large-mnli") 17 | 18 | model = AutoModelForSequenceClassification.from_pretrained("microsoft/deberta-large-mnli").cuda() 19 | 20 | dataset = {} 21 | 22 | dataset['story'] = [] 23 | dataset['question'] = [] 24 | dataset['answer'] = [] 25 | dataset['additional_answers'] = [] 26 | dataset['rouge1'] = [] 27 | dataset['rouge2'] = [] 28 | dataset['rougeL'] = [] 29 | dataset['semantic_variability'] = [] 30 | dataset['id'] = [] 31 | 32 | for sample_id, sample in enumerate(data): 33 | story = sample['story'] 34 | questions = sample['questions'] 35 | answers = sample['answers'] 36 | additional_answers = sample['additional_answers'] 37 | for question_index, question in enumerate(questions): 38 | dataset['story'].append(story) 39 | dataset['question'].append(question['input_text']) 40 | dataset['answer'].append({ 41 | 'text': answers[question_index]['input_text'], 42 | 'answer_start': answers[question_index]['span_start'] 43 | }) 44 | dataset['id'].append(sample['id'] + '_' + str(question_index)) 45 | additional_answers_list = [] 46 | 47 | for i in range(3): 48 | additional_answers_list.append(additional_answers[str(i)][question_index]['input_text']) 49 | 50 | dataset['additional_answers'].append(additional_answers_list) 51 | story = story + ' Q: ' + question['input_text'] + ' A: ' + answers[question_index]['input_text'] 52 | if not story[-1] == '.': 53 | story = story + '.' 
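# The block below compares every ordered pair of reference answers for this question:
# each pair is run through the DeBERTa-MNLI classifier (a predicted label of 0, i.e.
# contradiction for this checkpoint, marks the question as having semantically different
# reference answers), and the same answer pairs are then scored with ROUGE as a measure
# of surface-level similarity.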
54 | all_answers = [answers[question_index]['input_text']] + additional_answers_list 55 | 56 | answer_list_1 = [] 57 | answer_list_2 = [] 58 | has_semantically_different_answers = False 59 | inputs = [] 60 | 61 | # This computes the syntactic similarity across the reference answers 62 | for i, reference_answer in enumerate(all_answers): 63 | for j in range(4): 64 | if i != j: 65 | answer_list_1.append(all_answers[i]) 66 | answer_list_2.append(all_answers[j]) 67 | 68 | qa_1 = question['input_text'] + ' ' + all_answers[i] 69 | qa_2 = question['input_text'] + ' ' + all_answers[j] 70 | 71 | input = qa_1 + ' [SEP] ' + qa_2 72 | 73 | inputs.append(input) 74 | #print(encoded_input) 75 | 76 | encoded_input = tokenizer.batch_encode_plus(inputs, padding=True) 77 | 78 | prediction = model(torch.tensor(encoded_input['input_ids'], device='cuda'))['logits'] 79 | 80 | predicted_label = torch.argmax(prediction, dim=1) 81 | if 0 in predicted_label: 82 | has_semantically_different_answers = True 83 | 84 | dataset['semantic_variability'].append(has_semantically_different_answers) 85 | 86 | results = rouge.compute(predictions=answer_list_1, references=answer_list_2) 87 | dataset['rouge1'].append(results['rouge1'].mid.fmeasure) 88 | dataset['rouge2'].append(results['rouge2'].mid.fmeasure) 89 | dataset['rougeL'].append(results['rougeL'].mid.fmeasure) 90 | 91 | dataset_df = pd.DataFrame.from_dict(dataset) 92 | 93 | dataset = Dataset.from_pandas(dataset_df) 94 | 95 | dataset.save_to_disk(f'{config.data_dir}/coqa_dataset') 96 | -------------------------------------------------------------------------------- /code/parse_triviaqa.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pathlib 3 | import pickle 4 | 5 | import accelerate 6 | import datasets 7 | import torch 8 | from transformers import AutoModelForCausalLM, AutoTokenizer 9 | 10 | import config 11 | 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('--type_of_question', type=str) 14 | parser.add_argument('--num_generations_per_prompt', type=int, default=5) 15 | parser.add_argument('--fraction_of_data_to_use', type=float, default=0.9) 16 | parser.add_argument('--model', type=str, default='opt-350m') 17 | parser.add_argument('--run_id', type=str, default='run_1') 18 | parser.add_argument('--temperature', type=float, default='1.0') 19 | parser.add_argument('--num_beams', type=int, default='5') 20 | parser.add_argument('--decoding_method', type=str, default='beam_search') 21 | parser.add_argument('--top_p', type=float, default=1.0) 22 | args = parser.parse_args() 23 | 24 | model = AutoModelForCausalLM.from_pretrained(f"facebook/{args.model}", 25 | torch_dtype=torch.float16, 26 | cache_dir=config.data_dir).cuda() 27 | tokenizer = AutoTokenizer.from_pretrained(f"facebook/opt-350m", use_fast=False, cache_dir=config.data_dir) 28 | 29 | if args.model == 'opt-30b': 30 | accelerate.dispatch_model(model, device_map=config.device_map) 31 | 32 | seed_value = 10 33 | 34 | if not pathlib.Path(f'{config.data_dir}/trivia_qa').exists(): 35 | 36 | print('Preprocessing dataset') 37 | val_data = datasets.load_dataset("trivia_qa", "rc.nocontext", split="validation") 38 | train_data = datasets.load_dataset("trivia_qa", "rc.nocontext", split="train") 39 | data_for_few_shot_prompt = train_data.select(range(0, 10)) 40 | 41 | few_shot_prompt = 'This is a bot that correctly answers questions. 
\n' 42 | for sample in data_for_few_shot_prompt: 43 | few_shot_prompt += 'Question: ' + sample['question'] + ' Answer: ' + sample['answer']['value'] + ' ' 44 | 45 | batch_size = 4 # change to 16 for full training 46 | encoder_max_length = 1024 47 | decoder_max_length = 128 48 | 49 | def process_data_to_model_inputs(batch): 50 | # tokenize the inputs and labels 51 | answers = [answer["value"] for answer in batch["answer"]] 52 | 53 | batch_with_prompt = [few_shot_prompt + "Question: " + question + " Answer:" for question in batch["question"]] 54 | inputs = tokenizer(batch_with_prompt, padding=False, truncation=False) 55 | outputs = tokenizer(answers, padding=False, truncation=False) 56 | 57 | batch["input_ids"] = inputs.input_ids 58 | batch["attention_mask"] = inputs.attention_mask 59 | batch["decoder_input_ids"] = outputs.input_ids 60 | batch["decoder_attention_mask"] = outputs.attention_mask 61 | batch["labels"] = outputs.input_ids.copy() 62 | batch['answer'] = answers 63 | 64 | # because BERT automatically shifts the labels, the labels correspond exactly to `decoder_input_ids`. 65 | # We have to make sure that the PAD token is ignored 66 | batch["labels"] = [ 67 | [-100 if token == tokenizer.pad_token_id else token for token in labels] for labels in batch["labels"] 68 | ] 69 | 70 | return batch 71 | 72 | val_data = val_data.map(process_data_to_model_inputs, 73 | batched=True, 74 | batch_size=batch_size, 75 | remove_columns=["search_results", "question_source", "entity_pages"]) 76 | val_data.set_format( 77 | type="torch", 78 | columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"], 79 | output_all_columns=True) 80 | 81 | val_data.save_to_disk(f'{config.data_dir}/trivia_qa') 82 | else: 83 | 84 | val_data = datasets.load_from_disk(f'{config.data_dir}/trivia_qa') 85 | -------------------------------------------------------------------------------- /code/get_prompting_based_uncertainty.py: -------------------------------------------------------------------------------- 1 | # Read generation results 2 | import argparse 3 | import os 4 | import pickle 5 | import random 6 | 7 | import accelerate 8 | import matplotlib.pyplot as plt 9 | import numpy as np 10 | import seaborn as sns 11 | import sklearn 12 | import torch 13 | from tqdm import tqdm 14 | from transformers import AutoModelForCausalLM, AutoTokenizer 15 | 16 | import config 17 | #sns.color_palette("pastel") 18 | import wandb 19 | from config import device_map 20 | 21 | # Set a seed value 22 | seed_value = 10 23 | # 1. Set `PYTHONHASHSEED` environment variable at a fixed value 24 | 25 | os.environ['PYTHONHASHSEED'] = str(seed_value) 26 | # 2. Set `python` built-in pseudo-random generator at a fixed value 27 | 28 | random.seed(seed_value) 29 | # 3. 
Set `numpy` pseudo-random generator at a fixed value 30 | 31 | np.random.seed(seed_value) 32 | 33 | device = torch.device('cuda') 34 | 35 | #Fix torch random seed 36 | torch.manual_seed(seed_value) 37 | 38 | os.environ["HF_DATASETS_CACHE"] = config.hf_datasets_cache 39 | 40 | parser = argparse.ArgumentParser() 41 | parser.add_argument('--generation_model', type=str, default='opt-1.3b') 42 | parser.add_argument('--run_id_for_few_shot_prompt', type=str, default='run_1') 43 | parser.add_argument('--run_id_for_evaluation', type=str, default='run_1') 44 | args = parser.parse_args() 45 | 46 | wandb.init(project='nlg_uncertainty', id=args.run_id_for_few_shot_prompt, config=args, resume='allow') 47 | model_name = wandb.config.model 48 | 49 | generation_tokenizer = AutoTokenizer.from_pretrained(f"facebook/opt-350m", use_fast=False, cache_dir=config.data_dir) 50 | model = AutoModelForCausalLM.from_pretrained(f"facebook/{model_name}", 51 | torch_dtype=torch.float16, 52 | cache_dir=config.data_dir).cuda() 53 | 54 | if model_name == 'opt-30b': 55 | accelerate.dispatch_model(model, device_map=device_map) 56 | print(model.hf_device_map) 57 | device = torch.device('cuda:1') 58 | 59 | run_name = wandb.run.name 60 | 61 | with open(f'{config.output_dir} /{run_name}/{model_name}_generations.pkl', 'rb') as infile: 62 | sequences_for_few_shot_prompt = pickle.load(infile) 63 | 64 | wandb.finish() 65 | 66 | # Build few shot prompt 67 | 68 | subset_of_sequences_for_few_shot_prompt = sequences_for_few_shot_prompt[-10:] 69 | number_of_few_shot_samples = 5 70 | 71 | prompt_template = 'Question: {} \n Here are some ideas that were brainstormed:{}\n Possible answer:{}\n Is the possible answer:\n (A) True\n (B) False\n The possible answer is:' 72 | few_shot_promopt = '' 73 | for sequence in subset_of_sequences_for_few_shot_prompt: 74 | question = sequence['question'] 75 | question = question.split('Question: ')[-1].split('Answer: ')[0] 76 | prompt = sequence['prompt'] 77 | generated_texts = '\n'.join(sequence['cleaned_generated_texts'][:number_of_few_shot_samples]) 78 | 79 | most_likely_answer = sequence['most_likely_generation'] 80 | correct = ' True' if sequence['rougeL_to_target'] > 0.3 else ' False' 81 | few_shot_promopt += prompt_template.format(question, generated_texts, most_likely_answer) + correct + '\n' 82 | 83 | # Build prompt for question 84 | labels_across_datasets = [] 85 | p_trues_across_datasets = [] 86 | 87 | n_samples_to_use = 2000 88 | 89 | with torch.no_grad(): 90 | 91 | aurocs = [] 92 | p_trues = [] 93 | corrects = [] 94 | for sequence in tqdm(sequences_for_few_shot_prompt[:n_samples_to_use]): 95 | 96 | question = sequence['question'] 97 | if 'Question: ' in question: 98 | question = question.split('Question: ')[-1].split('Answer: ')[0] 99 | else: 100 | question = question.split('Q: ')[-1].split('A: ')[0] 101 | 102 | generated_texts = '\n'.join(sequence['cleaned_generated_texts'][:number_of_few_shot_samples]) 103 | most_likely_answer = sequence['most_likely_generation'] 104 | correct = 1.0 if sequence['rougeL_to_target'] > 0.3 else 0.0 105 | base_prompt = prompt_template.format(question, generated_texts, most_likely_answer) 106 | prompt_true = few_shot_promopt + prompt_template.format(question, generated_texts, most_likely_answer) + ' True' 107 | 108 | # This computation of the negative log likelihoods follows this tutorial: https://huggingface.co/docs/transformers/perplexity 109 | tokenized_base_prompt = generation_tokenizer(base_prompt)['input_ids'] 110 | tokenized_prompt_true = 
torch.tensor(generation_tokenizer(prompt_true)['input_ids'], device=device) 111 | 112 | target_ids_true = tokenized_prompt_true.clone() 113 | target_ids_true[:len(tokenized_base_prompt)] = -100 114 | 115 | model_output_true = model(torch.reshape(tokenized_prompt_true, (1, -1)), labels=target_ids_true) 116 | loss_true = model_output_true.loss 117 | 118 | p_trues.append(loss_true.item()) 119 | corrects.append(correct) 120 | 121 | labels_across_datasets += corrects 122 | p_trues_across_datasets += p_trues 123 | 124 | p_true_auroc = sklearn.metrics.roc_auc_score(1 - torch.tensor(corrects), torch.tensor(p_trues)) 125 | 126 | # Store p_true aurocs in a pickle file 127 | with open(f'{config.output_dir}/{run_name}/{model_name}_p_true_aurocs.pkl', 'wb') as outfile: 128 | pickle.dump(p_true_auroc, outfile) 129 | -------------------------------------------------------------------------------- /code/environment.yml: -------------------------------------------------------------------------------- 1 | name: unanswerable 2 | channels: 3 | - pytorch 4 | - defaults 5 | - conda-forge 6 | dependencies: 7 | - _libgcc_mutex=0.1=main 8 | - _openmp_mutex=5.1=1_gnu 9 | - blas=1.0=mkl 10 | - brotlipy=0.7.0=py310h7f8727e_1002 11 | - bzip2=1.0.8=h7b6447c_0 12 | - ca-certificates=2022.4.26=h06a4308_0 13 | - cffi=1.15.0=py310hd667e15_1 14 | - cryptography=37.0.1=py310h9ce1e76_0 15 | - cudatoolkit=11.3.1=h2bc3f7f_2 16 | - debugpy=1.5.1=py310h295c915_0 17 | - entrypoints=0.4=py310h06a4308_0 18 | - ffmpeg=4.3=hf484d3e_0 19 | - freetype=2.11.0=h70c0345_0 20 | - giflib=5.2.1=h7b6447c_0 21 | - gmp=6.2.1=h295c915_3 22 | - gnutls=3.6.15=he1e5248_0 23 | - intel-openmp=2021.4.0=h06a4308_3561 24 | - ipykernel=6.9.1=py310h06a4308_0 25 | - ipython=8.3.0=py310h06a4308_0 26 | - jedi=0.18.1=py310h06a4308_1 27 | - jpeg=9e=h7f8727e_0 28 | - jupyter_client=7.2.2=py310h06a4308_0 29 | - jupyter_core=4.10.0=py310h06a4308_0 30 | - lame=3.100=h7b6447c_0 31 | - lcms2=2.12=h3be6417_0 32 | - ld_impl_linux-64=2.38=h1181459_1 33 | - libffi=3.3=he6710b0_2 34 | - libgcc-ng=11.2.0=h1234567_1 35 | - libgomp=11.2.0=h1234567_1 36 | - libiconv=1.16=h7f8727e_2 37 | - libidn2=2.3.2=h7f8727e_0 38 | - libpng=1.6.37=hbc83047_0 39 | - libsodium=1.0.18=h7b6447c_0 40 | - libstdcxx-ng=11.2.0=h1234567_1 41 | - libtasn1=4.16.0=h27cfd23_0 42 | - libtiff=4.2.0=h2818925_1 43 | - libunistring=0.9.10=h27cfd23_0 44 | - libuuid=1.0.3=h7f8727e_2 45 | - libuv=1.40.0=h7b6447c_0 46 | - libwebp=1.2.2=h55f646e_0 47 | - libwebp-base=1.2.2=h7f8727e_0 48 | - lz4-c=1.9.3=h295c915_1 49 | - mkl=2021.4.0=h06a4308_640 50 | - mkl-service=2.4.0=py310h7f8727e_0 51 | - mkl_fft=1.3.1=py310hd6ae3a3_0 52 | - mkl_random=1.2.2=py310h00e6091_0 53 | - ncurses=6.3=h7f8727e_2 54 | - nest-asyncio=1.5.5=py310h06a4308_0 55 | - nettle=3.7.3=hbbd107a_1 56 | - numpy=1.22.3=py310hfa59a62_0 57 | - numpy-base=1.22.3=py310h9585f30_0 58 | - openh264=2.1.1=h4ff587b_0 59 | - openssl=1.1.1o=h7f8727e_0 60 | - pillow=9.0.1=py310h22f2fdc_0 61 | - pure_eval=0.2.2=pyhd3eb1b0_0 62 | - pysocks=1.7.1=py310h06a4308_0 63 | - python=3.10.4=h12debd9_0 64 | - pytorch=1.11.0=py3.10_cuda11.3_cudnn8.2.0_0 65 | - pytorch-mutex=1.0=cuda 66 | - pyzmq=22.3.0=py310h295c915_2 67 | - readline=8.1.2=h7f8727e_1 68 | - setuptools=61.2.0=py310h06a4308_0 69 | - sqlite=3.38.5=hc218d9a_0 70 | - stack_data=0.2.0=pyhd3eb1b0_0 71 | - tk=8.6.12=h1ccaba5_0 72 | - torchaudio=0.11.0=py310_cu113 73 | - torchvision=0.12.0=py310_cu113 74 | - tornado=6.1=py310h7f8727e_0 75 | - typing_extensions=4.1.1=pyh06a4308_0 76 | - tzdata=2022a=hda174b7_0 77 | - 
urllib3=1.26.9=py310h06a4308_0 78 | - xz=5.2.5=h7f8727e_1 79 | - zeromq=4.3.4=h2531618_0 80 | - zlib=1.2.12=h7f8727e_2 81 | - zstd=1.5.2=ha4553b6_0 82 | - pip: 83 | - absl-py==1.2.0 84 | - accelerate==0.12.0 85 | - aiohttp==3.8.1 86 | - aiosignal==1.2.0 87 | - asttokens==2.0.5 88 | - async-timeout==4.0.2 89 | - attrs==21.4.0 90 | - backcall==0.2.0 91 | - bleach==5.0.1 92 | - certifi==2022.6.15 93 | - charset-normalizer==2.0.4 94 | - click==8.1.3 95 | - cloudpickle==2.2.0 96 | - cycler==0.11.0 97 | - dask==2022.10.2 98 | - datasets==2.3.2 99 | - decorator==5.1.1 100 | - dill==0.3.5.1 101 | - docker-pycreds==0.4.0 102 | - et-xmlfile==1.1.0 103 | - evaluate==0.1.2 104 | - executing==0.8.3 105 | - filelock==3.7.1 106 | - fonttools==4.34.0 107 | - frozenlist==1.3.0 108 | - fsspec==2022.5.0 109 | - gitdb==4.0.9 110 | - gitpython==3.1.27 111 | - huggingface-hub==0.8.1 112 | - idna==3.3 113 | - ipywidgets==8.0.2 114 | - joblib==1.1.0 115 | - jupyterlab-widgets==3.0.3 116 | - kiwisolver==1.4.3 117 | - locket==1.0.0 118 | - matplotlib==3.5.2 119 | - matplotlib-inline==0.1.2 120 | - multidict==6.0.2 121 | - multiprocess==0.70.13 122 | - nltk==3.7 123 | - openai==0.24.0 124 | - openpyxl==3.0.10 125 | - packaging==21.3 126 | - pandas==1.4.3 127 | - pandas-stubs==1.5.1.221024 128 | - parso==0.8.3 129 | - partd==1.3.0 130 | - pathtools==0.1.2 131 | - pexpect==4.8.0 132 | - pickleshare==0.7.5 133 | - pip==21.2.4 134 | - promise==2.3 135 | - prompt-toolkit==3.0.20 136 | - protobuf==3.20.1 137 | - psutil==5.9.1 138 | - ptyprocess==0.7.0 139 | - pure-eval==0.2.2 140 | - pyarrow==8.0.0 141 | - pycparser==2.21 142 | - pygments==2.11.2 143 | - pyopenssl==22.0.0 144 | - pyparsing==3.0.9 145 | - python-dateutil==2.8.2 146 | - pytz==2022.1 147 | - pyyaml==6.0 148 | - regex==2022.6.2 149 | - requests==2.27.1 150 | - responses==0.18.0 151 | - rouge-score==0.0.4 152 | - scikit-learn==1.1.1 153 | - scipy==1.8.1 154 | - seaborn==0.11.2 155 | - sentencepiece==0.1.96 156 | - sentry-sdk==1.9.7 157 | - setproctitle==1.3.2 158 | - shortuuid==1.0.9 159 | - six==1.16.0 160 | - sklearn==0.0 161 | - smmap==5.0.0 162 | - stack-data==0.2.0 163 | - swifter==1.3.4 164 | - threadpoolctl==3.1.0 165 | - tokenizers==0.12.1 166 | - toolz==0.12.0 167 | - torchmetrics==0.9.2 168 | - tqdm==4.64.0 169 | - traitlets==5.1.1 170 | - transformers==4.20.1 171 | - types-pytz==2022.5.0.0 172 | - typing-extensions==4.1.1 173 | - wandb==0.13.2 174 | - wcwidth==0.2.5 175 | - webencodings==0.5.1 176 | - wheel==0.37.1 177 | - widgetsnbextension==4.0.3 178 | - xxhash==3.0.0 179 | - yarl==1.7.2 180 | prefix: /users/loruhn/.conda/envs/unanswerable 181 | -------------------------------------------------------------------------------- /code/get_semantic_similarities.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import csv 3 | import os 4 | import pickle 5 | import random 6 | 7 | import evaluate 8 | import numpy as np 9 | import torch 10 | from tqdm import tqdm 11 | from transformers import AutoModelForSequenceClassification, AutoTokenizer 12 | 13 | import config 14 | import wandb 15 | 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument('--generation_model', type=str, default='opt-350m') 18 | parser.add_argument('--run_id', type=str, default='run_1') 19 | args = parser.parse_args() 20 | 21 | device = 'cuda' 22 | 23 | # Set a seed value 24 | seed_value = 10 25 | # 1. 
Set `PYTHONHASHSEED` environment variable at a fixed value 26 | 27 | os.environ['PYTHONHASHSEED'] = str(seed_value) 28 | # 2. Set `python` built-in pseudo-random generator at a fixed value 29 | 30 | random.seed(seed_value) 31 | # 3. Set `numpy` pseudo-random generator at a fixed value 32 | 33 | np.random.seed(seed_value) 34 | 35 | #Fix torch random seed 36 | torch.manual_seed(seed_value) 37 | 38 | os.environ["HF_DATASETS_CACHE"] = config.hf_datasets_cache 39 | 40 | generation_tokenizer = AutoTokenizer.from_pretrained(f"facebook/opt-350m", use_fast=False, cache_dir=config.data_dir) 41 | 42 | tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-large-mnli") 43 | model = AutoModelForSequenceClassification.from_pretrained("microsoft/deberta-large-mnli").cuda() 44 | 45 | wandb.init(project='nlg_uncertainty', id=args.run_id, config=args, resume='allow') 46 | 47 | run_name = wandb.run.name 48 | 49 | with open(f'{config.output_dir}/{run_name}/{args.generation_model}_generations.pkl', 'rb') as infile: 50 | sequences = pickle.load(infile) 51 | 52 | result_dict = {} 53 | 54 | meteor = evaluate.load('meteor') 55 | 56 | deberta_predictions = [] 57 | 58 | for sample in tqdm(sequences): 59 | question = sample['question'] 60 | if 'cleaned_generated_texts' in sample: 61 | generated_texts = sample['cleaned_generated_texts'] 62 | else: 63 | generated_texts = sample['generated_texts'] 64 | 65 | id_ = sample['id'][0] 66 | 67 | unique_generated_texts = list(set(generated_texts)) 68 | 69 | answer_list_1 = [] 70 | answer_list_2 = [] 71 | has_semantically_different_answers = False 72 | inputs = [] 73 | syntactic_similarities = {} 74 | rouge_types = ['rouge1', 'rouge2', 'rougeL'] 75 | for rouge_type in rouge_types: 76 | syntactic_similarities[rouge_type] = 0.0 77 | 78 | semantic_set_ids = {} 79 | for index, answer in enumerate(unique_generated_texts): 80 | semantic_set_ids[answer] = index 81 | 82 | print('Number of unique answers:', len(unique_generated_texts)) 83 | 84 | if len(unique_generated_texts) > 1: 85 | 86 | # Evalauate semantic similarity 87 | for i, reference_answer in enumerate(unique_generated_texts): 88 | for j in range(i + 1, len(unique_generated_texts)): 89 | 90 | answer_list_1.append(unique_generated_texts[i]) 91 | answer_list_2.append(unique_generated_texts[j]) 92 | 93 | qa_1 = question + ' ' + unique_generated_texts[i] 94 | qa_2 = question + ' ' + unique_generated_texts[j] 95 | 96 | input = qa_1 + ' [SEP] ' + qa_2 97 | inputs.append(input) 98 | encoded_input = tokenizer.encode(input, padding=True) 99 | prediction = model(torch.tensor(torch.tensor([encoded_input]), device='cuda'))['logits'] 100 | predicted_label = torch.argmax(prediction, dim=1) 101 | 102 | reverse_input = qa_2 + ' [SEP] ' + qa_1 103 | encoded_reverse_input = tokenizer.encode(reverse_input, padding=True) 104 | reverse_prediction = model(torch.tensor(torch.tensor([encoded_reverse_input]), device='cuda'))['logits'] 105 | reverse_predicted_label = torch.argmax(reverse_prediction, dim=1) 106 | 107 | deberta_prediction = 1 108 | print(qa_1, qa_2, predicted_label, reverse_predicted_label) 109 | if 0 in predicted_label or 0 in reverse_predicted_label: 110 | has_semantically_different_answers = True 111 | deberta_prediction = 0 112 | 113 | else: 114 | semantic_set_ids[unique_generated_texts[j]] = semantic_set_ids[unique_generated_texts[i]] 115 | 116 | deberta_predictions.append([unique_generated_texts[i], unique_generated_texts[j], deberta_prediction]) 117 | 118 | rouge = evaluate.load('rouge') 119 | 120 | # Evalauate syntactic 
similarity 121 | answer_list_1 = [] 122 | answer_list_2 = [] 123 | for i in generated_texts: 124 | for j in generated_texts: 125 | if i != j: 126 | answer_list_1.append(i) 127 | answer_list_2.append(j) 128 | 129 | results = rouge.compute(predictions=answer_list_1, references=answer_list_2) 130 | 131 | for rouge_type in rouge_types: 132 | syntactic_similarities[rouge_type] = results[rouge_type].mid.fmeasure 133 | 134 | result_dict[id_] = { 135 | 'syntactic_similarities': syntactic_similarities, 136 | 'has_semantically_different_answers': has_semantically_different_answers 137 | } 138 | list_of_semantic_set_ids = [semantic_set_ids[x] for x in generated_texts] 139 | result_dict[id_]['semantic_set_ids'] = list_of_semantic_set_ids 140 | 141 | with open('deberta_predictions_{}.csv'.format(args.run_id), 'w', encoding='UTF8', newline='') as f: 142 | writer = csv.writer(f) 143 | # write the header 144 | writer.writerow(['qa_1', 'qa_2', 'prediction']) 145 | writer.writerows(deberta_predictions) 146 | 147 | print(result_dict) 148 | 149 | with open(f'{config.output_dir}/{run_name}/{args.generation_model}_generations_similarities.pkl', 'wb') as outfile: 150 | pickle.dump(result_dict, outfile) 151 | -------------------------------------------------------------------------------- /code/get_likelihoods.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import pickle 4 | import random 5 | 6 | import numpy as np 7 | import torch 8 | from transformers import AutoModelForCausalLM, AutoTokenizer 9 | 10 | import wandb 11 | 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('--evaluation_model', type=str, default='opt-350m') 14 | parser.add_argument('--generation_model', type=str, default='opt-350m') 15 | parser.add_argument('--run_id', type=str, default='run_1') 16 | args = parser.parse_args() 17 | 18 | device = 'cuda' 19 | import config 20 | 21 | # Set a seed value 22 | seed_value = 10 23 | # 1. Set `PYTHONHASHSEED` environment variable at a fixed value 24 | 25 | os.environ['PYTHONHASHSEED'] = str(seed_value) 26 | # 2. Set `python` built-in pseudo-random generator at a fixed value 27 | 28 | random.seed(seed_value) 29 | # 3. 
Set `numpy` pseudo-random generator at a fixed value 30 | 31 | np.random.seed(seed_value) 32 | 33 | #Fix torch random seed 34 | torch.manual_seed(seed_value) 35 | 36 | os.environ["HF_DATASETS_CACHE"] = config.hf_datasets_cache 37 | 38 | model = AutoModelForCausalLM.from_pretrained(f"facebook/{args.evaluation_model}", 39 | torch_dtype=torch.float16, 40 | cache_dir=config.data_dir).cuda() 41 | tokenizer = AutoTokenizer.from_pretrained(f"facebook/{args.evaluation_model}", 42 | use_fast=False, 43 | cache_dir=config.data_dir) 44 | 45 | wandb.init(project='nlg_uncertainty', id=args.run_id, config=args, resume='allow') 46 | 47 | run_name = wandb.run.name 48 | 49 | opt_models = ['opt-125m', 'opt-350m', 'opt-1.3b', 'opt-2.7b', 'opt-6.7b', 'opt-13b', 'opt-30b'] 50 | 51 | with open(f'{config.output_dir}/{run_name}/{args.generation_model}_generations.pkl', 'rb') as infile: 52 | sequences = pickle.load(infile) 53 | 54 | with open(f'{config.output_dir}/{run_name}/{args.generation_model}_generations_similarities.pkl', 'rb') as infile: 55 | similarities_dict = pickle.load(infile) 56 | 57 | 58 | def get_neg_loglikelihoods(model, sequences): 59 | 60 | with torch.no_grad(): 61 | result = [] 62 | for sample in sequences: 63 | result_dict = {} 64 | prompt = sample['prompt'] 65 | if 'cleaned_generations' in sample: 66 | generations = sample['cleaned_generations'].to(device) 67 | else: 68 | generations = sample['generations'].to(device) 69 | id_ = sample['id'] 70 | 71 | average_neg_log_likelihoods = torch.zeros((generations.shape[0],)) 72 | average_unconditioned_neg_log_likelihoods = torch.zeros((generations.shape[0],)) 73 | neg_log_likelihoods = torch.zeros((generations.shape[0],)) 74 | neg_unconditioned_log_likelihoods = torch.zeros((generations.shape[0],)) 75 | pointwise_mutual_information = torch.zeros((generations.shape[0],)) 76 | sequence_embeddings = [] 77 | 78 | for generation_index in range(generations.shape[0]): 79 | prompt = prompt[prompt != tokenizer.pad_token_id] 80 | generation = generations[generation_index][generations[generation_index] != tokenizer.pad_token_id] 81 | 82 | # This computation of the negative log likelihoods follows this tutorial: https://huggingface.co/docs/transformers/perplexity 83 | target_ids = generation.clone() 84 | target_ids[:len(prompt)] = -100 85 | model_output = model(torch.reshape(generation, (1, -1)), labels=target_ids, output_hidden_states=True) 86 | generation_only = generation.clone()[(len(prompt) - 1):] 87 | unconditioned_model_output = model(torch.reshape(generation_only, (1, -1)), 88 | labels=generation_only, 89 | output_hidden_states=True) 90 | hidden_states = model_output['hidden_states'] 91 | average_neg_log_likelihood = model_output['loss'] 92 | 93 | average_unconditioned_neg_log_likelihood = unconditioned_model_output['loss'] 94 | average_neg_log_likelihoods[generation_index] = average_neg_log_likelihood 95 | average_unconditioned_neg_log_likelihoods[generation_index] = average_unconditioned_neg_log_likelihood 96 | neg_log_likelihoods[generation_index] = average_neg_log_likelihood * (len(generation) - len(prompt)) 97 | neg_unconditioned_log_likelihoods[generation_index] = average_unconditioned_neg_log_likelihood * ( 98 | len(generation) - len(prompt)) 99 | pointwise_mutual_information[generation_index] = -neg_log_likelihoods[ 100 | generation_index] + neg_unconditioned_log_likelihoods[generation_index] 101 | 102 | average_of_last_layer_token_embeddings = torch.mean(hidden_states[-1], dim=1) 103 | 
sequence_embeddings.append(average_of_last_layer_token_embeddings) 104 | 105 | most_likely_generation = sample['most_likely_generation_ids'].to(device) 106 | target_ids = most_likely_generation.clone() 107 | target_ids[:len(prompt)] = -100 108 | model_output = model(torch.reshape(most_likely_generation, (1, -1)), 109 | labels=target_ids, 110 | output_hidden_states=True) 111 | hidden_states = model_output['hidden_states'] 112 | average_neg_log_likelihood_of_most_likely_gen = model_output['loss'] 113 | most_likely_generation_embedding = torch.mean(hidden_states[-1], dim=1) 114 | 115 | second_most_likely_generation = sample['second_most_likely_generation_ids'].to(device) 116 | target_ids = second_most_likely_generation.clone() 117 | target_ids[:len(prompt)] = -100 118 | model_output = model(torch.reshape(second_most_likely_generation, (1, -1)), 119 | labels=target_ids, 120 | output_hidden_states=True) 121 | hidden_states = model_output['hidden_states'] 122 | average_neg_log_likelihood_of_second_most_likely_gen = model_output['loss'] 123 | second_most_likely_generation_embedding = torch.mean(hidden_states[-1], dim=1) 124 | 125 | neg_log_likelihood_of_most_likely_gen = average_neg_log_likelihood_of_most_likely_gen * ( 126 | len(most_likely_generation) - len(prompt)) 127 | 128 | sequence_embeddings = torch.stack(sequence_embeddings) 129 | result_dict['prompt'] = prompt 130 | result_dict['generations'] = generations 131 | result_dict['average_neg_log_likelihoods'] = average_neg_log_likelihoods 132 | result_dict['neg_log_likelihoods'] = neg_log_likelihoods 133 | result_dict['sequence_embeddings'] = most_likely_generation_embedding 134 | result_dict['most_likely_sequence_embedding'] = most_likely_generation 135 | result_dict['average_unconditioned_neg_log_likelihoods'] = average_unconditioned_neg_log_likelihoods 136 | result_dict['neg_unconditioned_log_likelihoods'] = neg_unconditioned_log_likelihoods 137 | result_dict['pointwise_mutual_information'] = pointwise_mutual_information 138 | result_dict['average_neg_log_likelihood_of_most_likely_gen'] = average_neg_log_likelihood_of_most_likely_gen 139 | result_dict[ 140 | 'average_neg_log_likelihood_of_second_most_likely_gen'] = average_neg_log_likelihood_of_second_most_likely_gen 141 | result_dict['neg_log_likelihood_of_most_likely_gen'] = neg_log_likelihood_of_most_likely_gen 142 | result_dict['semantic_set_ids'] = torch.tensor(similarities_dict[id_[0]]['semantic_set_ids'], device=device) 143 | result_dict['id'] = id_ 144 | result.append(result_dict) 145 | 146 | return result 147 | 148 | 149 | likelihoods = get_neg_loglikelihoods(model, sequences) 150 | 151 | with open(f'{config.data_dir}/{run_name}/{args.generation_model}_generations_{args.evaluation_model}_likelihoods.pkl', 152 | 'wb') as outfile: 153 | pickle.dump(likelihoods, outfile) 154 | -------------------------------------------------------------------------------- /code/compute_confidence_measure.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import pickle 4 | import random 5 | 6 | import config 7 | import numpy as np 8 | import torch 9 | import wandb 10 | 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('--generation_model', type=str, default='opt-350m') 13 | parser.add_argument('--evaluation_model', type=str, default='opt-350m') 14 | parser.add_argument('--run_id', type=str, default='run_1') 15 | parser.add_argument('--verbose', type=bool, default=True) 16 | args = parser.parse_args() 17 | 18 | 
device = 'cuda' 19 | 20 | # Set a seed value 21 | seed_value = 10 22 | # 1. Set `PYTHONHASHSEED` environment variable at a fixed value 23 | 24 | os.environ['PYTHONHASHSEED'] = str(seed_value) 25 | # 2. Set `python` built-in pseudo-random generator at a fixed value 26 | 27 | random.seed(seed_value) 28 | # 3. Set `numpy` pseudo-random generator at a fixed value 29 | 30 | np.random.seed(seed_value) 31 | 32 | #Fix torch random seed 33 | torch.manual_seed(seed_value) 34 | 35 | os.environ["HF_DATASETS_CACHE"] = config.hf_datasets_cache 36 | 37 | wandb.init(project='nlg_uncertainty', id=args.run_id, config=args, resume='allow') 38 | 39 | run_name = wandb.run.name 40 | 41 | llh_shift = torch.tensor(5.0) 42 | 43 | 44 | def get_overall_log_likelihoods(list_of_results): 45 | """Compute log likelihood of all generations under their given context. 46 | 47 | list_of_results: list of dictionaries with keys: 48 | 49 | returns: dictionary with keys: 'neg_log_likelihoods', 'average_neg_log_likelihoods' 50 | that contains tensors of shape (num_models, num_generations, num_samples_per_generation) 51 | """ 52 | 53 | result_dict = {} 54 | 55 | list_of_keys = ['neg_log_likelihoods', 'average_neg_log_likelihoods', 'sequence_embeddings',\ 56 | 'pointwise_mutual_information', 'average_neg_log_likelihood_of_most_likely_gen',\ 57 | 'neg_log_likelihood_of_most_likely_gen', 'semantic_set_ids'] 58 | 59 | for key in list_of_keys: 60 | list_of_ids = [] 61 | overall_results = [] 62 | for model_size, result in list_of_results: 63 | results_per_model = [] 64 | for sample in result: 65 | average_neg_log_likelihoods = sample[key] 66 | list_of_ids.append(sample['id'][0]) 67 | results_per_model.append(average_neg_log_likelihoods) 68 | 69 | results_per_model = torch.stack(results_per_model) 70 | 71 | overall_results.append(results_per_model) 72 | 73 | if key != 'sequence_embeddings': 74 | overall_results = torch.stack(overall_results) 75 | 76 | result_dict[key] = overall_results 77 | 78 | result_dict['ids'] = list_of_ids 79 | return result_dict 80 | 81 | 82 | def get_mutual_information(log_likelihoods): 83 | """Compute confidence measure for a given set of likelihoods""" 84 | 85 | mean_across_models = torch.logsumexp(log_likelihoods, dim=0) - torch.log(torch.tensor(log_likelihoods.shape[0])) 86 | tiled_mean = mean_across_models.tile(log_likelihoods.shape[0], 1, 1) 87 | diff_term = torch.exp(log_likelihoods) * log_likelihoods - torch.exp(tiled_mean) * tiled_mean 88 | f_j = torch.div(torch.sum(diff_term, dim=0), diff_term.shape[0]) 89 | mutual_information = torch.div(torch.sum(torch.div(f_j, mean_across_models), dim=1), f_j.shape[-1]) 90 | 91 | return mutual_information 92 | 93 | 94 | def get_log_likelihood_variance(neg_log_likelihoods): 95 | """Compute log likelihood variance of approximate posterior predictive""" 96 | mean_across_models = torch.mean(neg_log_likelihoods, dim=0) 97 | variance_of_neg_log_likelihoods = torch.var(mean_across_models, dim=1) 98 | 99 | return variance_of_neg_log_likelihoods 100 | 101 | 102 | def get_log_likelihood_mean(neg_log_likelihoods): 103 | """Compute softmax variance of approximate posterior predictive""" 104 | mean_across_models = torch.mean(neg_log_likelihoods, dim=0) 105 | mean_of_neg_log_likelihoods = torch.mean(mean_across_models, dim=1) 106 | 107 | return mean_of_neg_log_likelihoods 108 | 109 | 110 | def get_mean_of_poinwise_mutual_information(pointwise_mutual_information): 111 | """Compute mean of pointwise mutual information""" 112 | mean_across_models = 
torch.mean(pointwise_mutual_information, dim=0) 113 | return torch.mean(mean_across_models, dim=1) 114 | 115 | 116 | def get_predictive_entropy(log_likelihoods): 117 | """Compute predictive entropy of approximate posterior predictive""" 118 | mean_across_models = torch.logsumexp(log_likelihoods, dim=0) - torch.log(torch.tensor(log_likelihoods.shape[0])) 119 | entropy = -torch.sum(mean_across_models, dim=1) / torch.tensor(mean_across_models.shape[1]) 120 | return entropy 121 | 122 | 123 | def get_predictive_entropy_over_concepts(log_likelihoods, semantic_set_ids): 124 | """Compute the semantic entropy""" 125 | mean_across_models = torch.logsumexp(log_likelihoods, dim=0) - torch.log(torch.tensor(log_likelihoods.shape[0])) 126 | # This is ok because all the models have the same semantic set ids 127 | semantic_set_ids = semantic_set_ids[0] 128 | entropies = [] 129 | for row_index in range(mean_across_models.shape[0]): 130 | aggregated_likelihoods = [] 131 | row = mean_across_models[row_index] 132 | semantic_set_ids_row = semantic_set_ids[row_index] 133 | for semantic_set_id in torch.unique(semantic_set_ids_row): 134 | aggregated_likelihoods.append(torch.logsumexp(row[semantic_set_ids_row == semantic_set_id], dim=0)) 135 | aggregated_likelihoods = torch.tensor(aggregated_likelihoods) - llh_shift 136 | entropy = - torch.sum(aggregated_likelihoods, dim=0) / torch.tensor(aggregated_likelihoods.shape[0]) 137 | entropies.append(entropy) 138 | 139 | return torch.tensor(entropies) 140 | 141 | 142 | def get_margin_probability_uncertainty_measure(log_likelihoods): 143 | """Compute margin probability uncertainty measure""" 144 | mean_across_models = torch.logsumexp(log_likelihoods, dim=0) - torch.log(torch.tensor(log_likelihoods.shape[0])) 145 | topk_likelihoods, indices = torch.topk(mean_across_models, 2, dim=1, sorted=True) 146 | margin_probabilities = np.exp(topk_likelihoods[:, 0]) - np.exp(topk_likelihoods[:, 1]) 147 | 148 | return margin_probabilities 149 | 150 | 151 | list_of_results = [] 152 | 153 | with open(f'{config.output_dir}/{run_name}/{args.generation_model}_generations_{args.evaluation_model}_likelihoods.pkl', 154 | 'rb') as infile: 155 | sequences = pickle.load(infile) 156 | list_of_results.append((args.evaluation_model, sequences)) 157 | 158 | overall_results = get_overall_log_likelihoods(list_of_results) 159 | mutual_information = get_mutual_information(-overall_results['neg_log_likelihoods']) 160 | predictive_entropy = get_predictive_entropy(-overall_results['neg_log_likelihoods']) 161 | predictive_entropy_over_concepts = get_predictive_entropy_over_concepts(-overall_results['average_neg_log_likelihoods'], 162 | overall_results['semantic_set_ids']) 163 | unnormalised_entropy_over_concepts = get_predictive_entropy_over_concepts(-overall_results['neg_log_likelihoods'], 164 | overall_results['semantic_set_ids']) 165 | 166 | margin_measures = get_margin_probability_uncertainty_measure(-overall_results['average_neg_log_likelihoods']) 167 | unnormalised_margin_measures = get_margin_probability_uncertainty_measure(-overall_results['neg_log_likelihoods']) 168 | 169 | 170 | def get_number_of_unique_elements_per_row(tensor): 171 | assert len(tensor.shape) == 2 172 | return torch.count_nonzero(torch.sum(torch.nn.functional.one_hot(tensor), dim=1), dim=1) 173 | 174 | 175 | number_of_semantic_sets = get_number_of_unique_elements_per_row(overall_results['semantic_set_ids'][0]) 176 | average_predictive_entropy = get_predictive_entropy(-overall_results['average_neg_log_likelihoods']) 177 | 

# Recompute each measure using only the first i generations per prompt, to study how the number of
# samples affects the uncertainty estimates.
average_predictive_entropy_on_subsets = []
predictive_entropy_on_subsets = []
semantic_predictive_entropy_on_subsets = []
num_predictions = overall_results['average_neg_log_likelihoods'].shape[-1]
number_of_semantic_sets_on_subsets = []
for i in range(1, num_predictions + 1):
    offset = num_predictions * (i / 100)
    average_predictive_entropy_on_subsets.append(
        get_predictive_entropy(-overall_results['average_neg_log_likelihoods'][:, :, :int(i)]))
    predictive_entropy_on_subsets.append(get_predictive_entropy(-overall_results['neg_log_likelihoods'][:, :, :int(i)]))
    semantic_predictive_entropy_on_subsets.append(
        get_predictive_entropy_over_concepts(-overall_results['average_neg_log_likelihoods'][:, :, :int(i)],
                                             overall_results['semantic_set_ids'][:, :, :int(i)]))
    number_of_semantic_sets_on_subsets.append(
        get_number_of_unique_elements_per_row(overall_results['semantic_set_ids'][0][:, :i]))

average_pointwise_mutual_information = get_mean_of_poinwise_mutual_information(
    overall_results['pointwise_mutual_information'])

overall_results['mutual_information'] = mutual_information
overall_results['predictive_entropy'] = predictive_entropy
overall_results['predictive_entropy_over_concepts'] = predictive_entropy_over_concepts
overall_results['unnormalised_entropy_over_concepts'] = unnormalised_entropy_over_concepts
overall_results['number_of_semantic_sets'] = number_of_semantic_sets
overall_results['margin_measures'] = margin_measures
overall_results['unnormalised_margin_measures'] = unnormalised_margin_measures

overall_results['average_predictive_entropy'] = average_predictive_entropy
for i in range(len(average_predictive_entropy_on_subsets)):
    overall_results[f'average_predictive_entropy_on_subset_{i + 1}'] = average_predictive_entropy_on_subsets[i]
    overall_results[f'predictive_entropy_on_subset_{i + 1}'] = predictive_entropy_on_subsets[i]
    overall_results[f'semantic_predictive_entropy_on_subset_{i + 1}'] = semantic_predictive_entropy_on_subsets[i]
    overall_results[f'number_of_semantic_sets_on_subset_{i + 1}'] = number_of_semantic_sets_on_subsets[i]
overall_results['average_pointwise_mutual_information'] = average_pointwise_mutual_information

with open(f'{config.output_dir}/{run_name}/aggregated_likelihoods_{args.generation_model}_generations.pkl',
          'wb') as outfile:
    pickle.dump(overall_results, outfile)

if args.verbose:
    print('Margin measure', margin_measures)
    print('Number of semantic sets', number_of_semantic_sets)
    print('predictive entropy shape: ', predictive_entropy.shape)
    print('predictive entropy per concept shape: ', predictive_entropy_over_concepts.shape)
    print(overall_results['average_neg_log_likelihoods'].shape)
    print(len(number_of_semantic_sets_on_subsets))
    print(number_of_semantic_sets_on_subsets[0].shape)
    print('average predictive entropy on subsets: ', len(average_predictive_entropy_on_subsets))
    print(average_predictive_entropy_on_subsets[0].shape)
    print(overall_results['pointwise_mutual_information'])
    print(overall_results['margin_measures'])
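
# The length-normalised measures above use 'average_neg_log_likelihoods' (per-token averages), while
# the unnormalised ones use the total 'neg_log_likelihoods'. A standalone toy contrast (made-up
# numbers, not pipeline data): two generations with the same per-token NLL but different lengths get
# the same normalised score and different unnormalised scores.
_toy_per_token_nll = 0.5
_toy_lengths = torch.tensor([4.0, 8.0])
_toy_total_nll = _toy_per_token_nll * _toy_lengths    # tensor([2., 4.])  -> length-dependent
_toy_average_nll = _toy_total_nll / _toy_lengths      # tensor([0.5, 0.5]) -> length-normalised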
--------------------------------------------------------------------------------
/code/generate.py:
--------------------------------------------------------------------------------
import argparse
import os
import pathlib
import pickle

import accelerate
import config
import datasets
import evaluate
import numpy as np
import torch
import tqdm
import wandb
from transformers import AutoModelForCausalLM, AutoTokenizer

parser = argparse.ArgumentParser()
parser.add_argument('--type_of_question', type=str)
parser.add_argument('--num_generations_per_prompt', type=int, default=5)
parser.add_argument('--fraction_of_data_to_use', type=float, default=0.9)
parser.add_argument('--model', type=str, default='opt-350m')
parser.add_argument('--run_id', type=str, default='run_1')
parser.add_argument('--temperature', type=float, default=1.0)
parser.add_argument('--num_beams', type=int, default=5)
parser.add_argument('--decoding_method', type=str, default='beam_search')
parser.add_argument('--top_p', type=float, default=1.0)
parser.add_argument('--dataset', type=str, default='coqa')
args = parser.parse_args()

wandb.init(project='nlg_uncertainty', id=args.run_id, config=args, resume='allow')

run_name = wandb.run.name

device = 'cuda'

# Set a seed value
seed_value = 10
# 1. Set `PYTHONHASHSEED` environment variable at a fixed value
import os

os.environ['PYTHONHASHSEED'] = str(seed_value)
# 2. Set `python` built-in pseudo-random generator at a fixed value
import random

random.seed(seed_value)
# 3. Set `numpy` pseudo-random generator at a fixed value
np.random.seed(seed_value)

# Fix torch random seed
torch.manual_seed(seed_value)

os.environ["HF_DATASETS_CACHE"] = config.hf_datasets_cache

model = AutoModelForCausalLM.from_pretrained(f"facebook/{args.model}",
                                             torch_dtype=torch.float16,
                                             cache_dir=config.hf_cache_dir).cuda()

if args.model == 'opt-30b':
    accelerate.dispatch_model(model, device_map=config.device_map)

tokenizer = AutoTokenizer.from_pretrained(f"facebook/{args.model}", use_fast=False, cache_dir=config.hf_cache_dir)

opt_models = ['opt-125m', 'opt-350m', 'opt-1.3b', 'opt-2.7b', 'opt-6.7b', 'opt-13b', 'opt-30b']

if args.dataset == 'coqa':
    dataset = datasets.load_from_disk(f'{config.output_dir}/coqa_dataset')
    id_to_question_mapping = dict(zip(dataset['id'], dataset['question']))
elif args.dataset == 'trivia_qa':
    dataset = datasets.load_from_disk(f'{config.output_dir}/trivia_qa')

if args.fraction_of_data_to_use < 1.0:
    train_dataset = dataset.train_test_split(test_size=(1 - args.fraction_of_data_to_use), seed=seed_value)['train']
else:
    train_dataset = dataset


def encode(examples):
    return tokenizer(examples['story'] + ' Q: ' + examples['question'] + ' A:', truncation=False, padding=False)


def encode_and_format_dataset(dataset):
    dataset = dataset.map(encode, batched=False, load_from_cache_file=False)
    dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'], output_all_columns=True)

    return dataset
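
# The CoQA prompts built by `encode` above follow a fixed "<passage> Q: <question> A:" frame. A
# standalone illustration with a made-up passage and question (plain strings, no tokenisation):
_example_story = 'Tom keeps his bike in the garden shed.'
_example_question = 'Where does Tom keep his bike?'
_example_prompt = _example_story + ' Q: ' + _example_question + ' A:'
# -> 'Tom keeps his bike in the garden shed. Q: Where does Tom keep his bike? A:'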

if args.dataset == 'coqa':
    questions = encode_and_format_dataset(train_dataset)
elif args.dataset == 'trivia_qa':
    questions = train_dataset

dataloader = torch.utils.data.DataLoader(questions, batch_size=1)

# Index 1 skips the BOS token that the OPT tokenizer prepends; these ids are the stop token ('. ')
# and the question-framing strings that generation is not allowed to produce.
period_token_id = tokenizer('. ')['input_ids'][1]
eos_tokens = ['Question:', ' Question:', '\n', 'Answer:', ' Answer:', 'Q:']
question_framing_ids = [[tokenizer(eos_token)['input_ids'][1]] for eos_token in eos_tokens]
squad_metric = evaluate.load("squad")
rouge = evaluate.load('rouge')
exact_match_metric = evaluate.load("exact_match")


def get_generations(model, dataloader, number_of_generations):
    """For each prompt in the dataloader, produce the most likely generation plus a number of sampled generations."""

    with torch.no_grad():
        max_length_of_generated_sequence = 256
        sequences = []
        for batch in tqdm.tqdm(dataloader):

            input_ids = torch.cat(batch['input_ids']).to(device).reshape(
                1, -1) if args.dataset == 'trivia_qa' else batch['input_ids'].to(device)
            if args.decoding_method == 'beam_search':
                most_likely_generation = model.generate(input_ids,
                                                        num_beams=5,
                                                        num_return_sequences=2,
                                                        do_sample=False,
                                                        max_length=input_ids.shape[1] +
                                                        max_length_of_generated_sequence,
                                                        eos_token_id=period_token_id,
                                                        bad_words_ids=question_framing_ids)
            elif args.decoding_method == 'greedy':
                most_likely_generation = model.generate(input_ids,
                                                        num_beams=1,
                                                        do_sample=False,
                                                        max_length=input_ids.shape[1] +
                                                        max_length_of_generated_sequence,
                                                        eos_token_id=period_token_id,
                                                        bad_words_ids=question_framing_ids)

            input_length = input_ids.shape[1] if args.dataset == 'trivia_qa' else batch['input_ids'].shape[1]
            generations = torch.ones((number_of_generations, input_length + max_length_of_generated_sequence),
                                     dtype=torch.long,
                                     device=device)
            for i in range(number_of_generations):

                generation = model.generate(input_ids,
                                            do_sample=True,
                                            num_return_sequences=1,
                                            num_beams=args.num_beams,
                                            max_length=input_ids.shape[1] + max_length_of_generated_sequence,
                                            eos_token_id=period_token_id,
                                            temperature=args.temperature,
                                            bad_words_ids=question_framing_ids,
                                            top_p=args.top_p)
                generations[i, :generation.shape[1]] = generation

            generations = torch.reshape(generations, (-1, number_of_generations, generations.shape[-1]))
            for i in range(generations.shape[0]):

                if args.dataset == 'coqa':
                    sequence_dict = {
                        'prompt': batch['input_ids'][i].to('cpu'),
                        'generations': generations[i].to('cpu'),
                        'id': batch['id'],
                        'question': id_to_question_mapping[batch['id'][0]]
                    }
                elif args.dataset == 'trivia_qa':
                    few_shot_question = tokenizer.decode(input_ids[0])
                    question = few_shot_question.split('Question: ')[-1].split('Answer: ')[0]
                    sequence_dict = {
                        'prompt': input_ids[0],
                        'generations': generations[i],
                        'id': batch['question_id'],
                        'few_shot_question': tokenizer.decode(input_ids[0]),
                        'question': question
                    }

                generated_texts = []
                for generation in generations[i]:
                    generated_texts.append(
                        tokenizer.decode(generation[len(batch['input_ids'][i]):], skip_special_tokens=True))

                sequence_dict['generated_texts'] = generated_texts
                sequence_dict['most_likely_generation_ids'] = most_likely_generation[0].to('cpu')
                sequence_dict['most_likely_generation'] = tokenizer.decode(
                    most_likely_generation[0][len(batch['input_ids'][i]):], skip_special_tokens=True)

                sequence_dict['second_most_likely_generation_ids'] = most_likely_generation[1].to('cpu')
                sequence_dict['second_most_likely_generation'] = tokenizer.decode(
                    most_likely_generation[1][len(batch['input_ids'][i]):], skip_special_tokens=True)

                sequence_dict['semantic_variability_reference_answers'] = batch[
                    'semantic_variability'] if 'semantic_variability' in batch else None
                rouge_types = ['rouge1', 'rouge2', 'rougeL']
                for rouge_type in rouge_types:
                    if rouge_type in batch:
                        sequence_dict[rouge_type + '_reference_answers'] = batch[rouge_type]

                    else:
                        sequence_dict[rouge_type + '_reference_answers'] = None

                    sequence_dict[rouge_type + '_to_target'] = 0.0

                sequence_dict['answer'] = batch['answer']['text'] if args.dataset == 'coqa' else batch['answer']
                sequence_dict['additional_answers'] = [x[0] for x in batch['additional_answers']
                                                      ] if args.dataset == 'coqa' else None

                sequence_dict['exact_match'] = 0.0

                reference_answers = batch['answer']['text'] + [x[0] for x in batch['additional_answers']
                                                              ] if args.dataset == 'coqa' else batch['answer']

                for answer in reference_answers:
                    predictions = [sequence_dict['most_likely_generation'].lstrip()]
                    references = [answer]
                    results = exact_match_metric.compute(predictions=predictions,
                                                         references=references,
                                                         ignore_case=True,
                                                         ignore_punctuation=True)
                    sequence_dict['exact_match'] = max(results['exact_match'], sequence_dict['exact_match'])
                    rouge_results = rouge.compute(predictions=predictions, references=references)
                    for rouge_type in rouge_types:
                        sequence_dict[rouge_type + '_to_target'] = max(rouge_results[rouge_type].mid.fmeasure,
                                                                       sequence_dict[rouge_type + '_to_target'])

                sequences.append(sequence_dict)

        return sequences


sequences = get_generations(model, dataloader, args.num_generations_per_prompt)

pathlib.Path(f'{config.output_dir}/sequences/' + run_name).mkdir(parents=True, exist_ok=True)

with open(f'{config.output_dir}/sequences/{run_name}/{args.model}_generations.pkl', 'wb') as outfile:
    pickle.dump(sequences, outfile)

--------------------------------------------------------------------------------
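The pickle written by generate.py is the interface to the rest of the pipeline: a list with one dictionary per prompt, carrying the sampled generations and their reference answers. A minimal sketch of how it might be inspected; the file name below is hypothetical, in the pipeline it lives under config.output_dir/sequences/<run_name>/.

import pickle

with open('opt-350m_generations.pkl', 'rb') as infile:  # hypothetical local copy of the file written above
    sequences = pickle.load(infile)

print(len(sequences))
# Keys set by get_generations above include 'prompt', 'generations', 'generated_texts',
# 'most_likely_generation', 'question', 'answer', 'exact_match' and the rouge*_to_target scores.
print(sorted(sequences[0].keys()))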
/code/analyze_results.py:
--------------------------------------------------------------------------------
# parse arguments
import argparse
import json
import pickle

import config
import numpy as np
import pandas as pd
import sklearn
import sklearn.metrics
import torch
import wandb

parser = argparse.ArgumentParser()
parser.add_argument('-n', '--run_ids', nargs='+', default=[])
parser.add_argument('--verbose', type=bool, default=True)
args = parser.parse_args()

overall_result_dict = {}

aurocs_across_models = []

sequence_embeddings_dict = {}

run_ids_to_analyze = args.run_ids
for run_id in run_ids_to_analyze:

    wandb.init(project='nlg_uncertainty', id=run_id, resume='allow')
    run_name = wandb.run.name
    model_name = wandb.config.model
    print(run_name)

    def get_similarities_df():
        """Get the similarities df from the pickle file"""
        with open(f'{config.output_dir}/{run_name}/{model_name}_generations_similarities.pkl', 'rb') as f:
            similarities = pickle.load(f)
            similarities_df = pd.DataFrame.from_dict(similarities, orient='index')
            similarities_df['id'] = similarities_df.index
            similarities_df['has_semantically_different_answers'] = similarities_df[
                'has_semantically_different_answers'].astype('int')
            similarities_df['rougeL_among_generations'] = similarities_df['syntactic_similarities'].apply(
                lambda x: x['rougeL'])

            return similarities_df

    def get_generations_df():
        """Get the generations df from the pickle file"""
        with open(f'{config.output_dir}/{run_name}/{model_name}_generations.pkl', 'rb') as infile:
            generations = pickle.load(infile)
            generations_df = pd.DataFrame(generations)
            generations_df['id'] = generations_df['id'].apply(lambda x: x[0])
            generations_df['id'] = generations_df['id'].astype('object')
            if not generations_df['semantic_variability_reference_answers'].isnull().values.any():
                generations_df['semantic_variability_reference_answers'] = generations_df[
                    'semantic_variability_reference_answers'].apply(lambda x: x[0].item())

            if not generations_df['rougeL_reference_answers'].isnull().values.any():
                generations_df['rougeL_reference_answers'] = generations_df['rougeL_reference_answers'].apply(
                    lambda x: x[0].item())
            generations_df['length_of_most_likely_generation'] = generations_df['most_likely_generation'].apply(
                lambda x: len(str(x).split(' ')))
            generations_df['length_of_answer'] = generations_df['answer'].apply(lambda x: len(str(x).split(' ')))
            generations_df['variance_of_length_of_generations'] = generations_df['generated_texts'].apply(
                lambda x: np.var([len(str(y).split(' ')) for y in x]))
            # An answer counts as correct if its RougeL overlap with the reference answer exceeds 0.3
            generations_df['correct'] = (generations_df['rougeL_to_target'] > 0.3).astype('int')

            return generations_df

    def get_likelihoods_df():
        """Get the likelihoods df from the pickle file"""

        with open(f'{config.output_dir}/{run_name}/aggregated_likelihoods_{model_name}_generations.pkl', 'rb') as f:
            likelihoods = pickle.load(f)
            print(likelihoods.keys())

            subset_keys = ['average_predictive_entropy_on_subset_' + str(i) for i in range(1, num_generations + 1)]
            subset_keys += ['predictive_entropy_on_subset_' + str(i) for i in range(1, num_generations + 1)]
            subset_keys += ['semantic_predictive_entropy_on_subset_' + str(i) for i in range(1, num_generations + 1)]
            subset_keys += ['number_of_semantic_sets_on_subset_' + str(i) for i in range(1, num_generations + 1)]

            keys_to_use = ('ids', 'predictive_entropy', 'mutual_information', 'average_predictive_entropy',\
                           'average_pointwise_mutual_information', 'average_neg_log_likelihood_of_most_likely_gen',\
                           'average_neg_log_likelihood_of_second_most_likely_gen', 'neg_log_likelihood_of_most_likely_gen',\
                           'predictive_entropy_over_concepts', 'number_of_semantic_sets', 'unnormalised_entropy_over_concepts')

            likelihoods_small = dict((k, likelihoods[k]) for k in keys_to_use + tuple(subset_keys))
            for key in likelihoods_small:
                if key == 'average_predictive_entropy_on_subsets':
                    likelihoods_small[key].shape
                if type(likelihoods_small[key]) is torch.Tensor:
                    likelihoods_small[key] = torch.squeeze(likelihoods_small[key].cpu())

            sequence_embeddings = likelihoods['sequence_embeddings']

            likelihoods_df = pd.DataFrame.from_dict(likelihoods_small)

            likelihoods_df.rename(columns={'ids': 'id'}, inplace=True)

            return likelihoods_df, sequence_embeddings

    similarities_df = get_similarities_df()
    generations_df = get_generations_df()
    num_generations = len(generations_df['generated_texts'][0])
    likelihoods_df, sequence_embeddings = get_likelihoods_df()
    result_df = generations_df.merge(similarities_df, on='id').merge(likelihoods_df, on='id')

    n_samples_before_filtering = len(result_df)
    result_df['len_most_likely_generation_length'] = result_df['most_likely_generation'].apply(lambda x: len(x.split()))

    # Begin analysis
    result_dict = {}
    result_dict['accuracy'] = result_df['correct'].mean()

    # Compute the auroc for the length normalized predictive entropy
    ln_predictive_entropy_auroc = sklearn.metrics.roc_auc_score(1 - result_df['correct'],
                                                                result_df['average_predictive_entropy'])
    result_dict['ln_predictive_entropy_auroc'] = ln_predictive_entropy_auroc

    predictive_entropy_auroc = sklearn.metrics.roc_auc_score(1 - result_df['correct'], result_df['predictive_entropy'])
    result_dict['predictive_entropy_auroc'] = predictive_entropy_auroc

    entropy_over_concepts_auroc = sklearn.metrics.roc_auc_score(1 - result_df['correct'],
                                                                result_df['predictive_entropy_over_concepts'])
    result_dict['entropy_over_concepts_auroc'] = entropy_over_concepts_auroc

    if 'unnormalised_entropy_over_concepts' in result_df.columns:
        unnormalised_entropy_over_concepts_auroc = sklearn.metrics.roc_auc_score(
            1 - result_df['correct'], result_df['unnormalised_entropy_over_concepts'])
        result_dict['unnormalised_entropy_over_concepts_auroc'] = unnormalised_entropy_over_concepts_auroc

    aurocs_across_models.append(entropy_over_concepts_auroc)

    neg_llh_most_likely_gen_auroc = sklearn.metrics.roc_auc_score(1 - result_df['correct'],
                                                                  result_df['neg_log_likelihood_of_most_likely_gen'])
    result_dict['neg_llh_most_likely_gen_auroc'] = neg_llh_most_likely_gen_auroc

    number_of_semantic_sets_auroc = sklearn.metrics.roc_auc_score(1 - result_df['correct'],
                                                                  result_df['number_of_semantic_sets'])
    result_dict['number_of_semantic_sets_auroc'] = number_of_semantic_sets_auroc

    result_dict['number_of_semantic_sets_correct'] = result_df[result_df['correct'] ==
                                                               1]['number_of_semantic_sets'].mean()
    result_dict['number_of_semantic_sets_incorrect'] = result_df[result_df['correct'] ==
                                                                 0]['number_of_semantic_sets'].mean()

    result_dict['average_rougeL_among_generations'] = result_df['rougeL_among_generations'].mean()
    result_dict['average_rougeL_among_generations_correct'] = result_df[result_df['correct'] ==
                                                                        1]['rougeL_among_generations'].mean()
    result_dict['average_rougeL_among_generations_incorrect'] = result_df[result_df['correct'] ==
                                                                          0]['rougeL_among_generations'].mean()
    result_dict['average_rougeL_auroc'] = sklearn.metrics.roc_auc_score(result_df['correct'],
                                                                        result_df['rougeL_among_generations'])

    average_neg_llh_most_likely_gen_auroc = sklearn.metrics.roc_auc_score(
        1 - result_df['correct'], result_df['average_neg_log_likelihood_of_most_likely_gen'])
    result_dict['average_neg_llh_most_likely_gen_auroc'] = average_neg_llh_most_likely_gen_auroc
    result_dict['rougeL_based_accuracy'] = result_df['correct'].mean()

    result_dict['margin_measure_auroc'] = sklearn.metrics.roc_auc_score(
        1 - result_df['correct'], result_df['average_neg_log_likelihood_of_most_likely_gen'] +
        result_df['average_neg_log_likelihood_of_second_most_likely_gen'])
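
    # All of the AUROCs above follow the same convention: the "positive" class is an incorrect answer
    # (1 - correct), so an uncertainty measure scores well when it is high for wrong answers. A
    # standalone toy example of that convention (made-up labels and scores, not run data):
    _toy_correct = np.array([1, 1, 0, 0])
    _toy_uncertainty = np.array([0.1, 0.4, 0.8, 0.9])  # higher = more uncertain
    _toy_auroc = sklearn.metrics.roc_auc_score(1 - _toy_correct, _toy_uncertainty)  # = 1.0 here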

    if args.verbose:
        print('Number of samples:', len(result_df))
        print(result_df['predictive_entropy'].mean())
        print(result_df['average_predictive_entropy'].mean())
        print(result_df['predictive_entropy_over_concepts'].mean())
        print('ln_predictive_entropy_auroc', ln_predictive_entropy_auroc)
        print('semantic entropy auroc', entropy_over_concepts_auroc)
        print(
            'Semantic entropy +',
            sklearn.metrics.roc_auc_score(
                1 - result_df['correct'],
                result_df['predictive_entropy_over_concepts'] - 3 * result_df['rougeL_among_generations']))
        print('RougeL among generations auroc',
              sklearn.metrics.roc_auc_score(result_df['correct'], result_df['rougeL_among_generations']))
        print('margin measure auroc:', result_dict['margin_measure_auroc'])

    # Measure the AUROCs when using different numbers of generations to compute our uncertainty measures.
    ln_aurocs = []
    aurocs = []
    semantic_aurocs = []
    average_number_of_semantic_sets = []
    average_number_of_semantic_sets_correct = []
    average_number_of_semantic_sets_incorrect = []
    for i in range(1, num_generations + 1):
        ln_predictive_entropy_auroc = sklearn.metrics.roc_auc_score(
            1 - result_df['correct'], result_df['average_predictive_entropy_on_subset_{}'.format(i)])
        aurocs.append(
            sklearn.metrics.roc_auc_score(1 - result_df['correct'],
                                          result_df['predictive_entropy_on_subset_{}'.format(i)]))
        ln_aurocs.append(ln_predictive_entropy_auroc)
        semantic_aurocs.append(
            sklearn.metrics.roc_auc_score(1 - result_df['correct'],
                                          result_df['semantic_predictive_entropy_on_subset_{}'.format(i)]))
        average_number_of_semantic_sets.append(result_df['number_of_semantic_sets_on_subset_{}'.format(i)].mean())
        average_number_of_semantic_sets_correct.append(
            result_df[result_df['correct'] == 1]['number_of_semantic_sets_on_subset_{}'.format(i)].mean())
        average_number_of_semantic_sets_incorrect.append(
            result_df[result_df['correct'] == 0]['number_of_semantic_sets_on_subset_{}'.format(i)].mean())

    result_dict['ln_predictive_entropy_auroc_on_subsets'] = ln_aurocs
    result_dict['predictive_entropy_auroc_on_subsets'] = aurocs
    result_dict['semantic_predictive_entropy_auroc_on_subsets'] = semantic_aurocs
    result_dict['average_number_of_semantic_sets_on_subsets'] = average_number_of_semantic_sets
    result_dict['average_number_of_semantic_sets_on_subsets_correct'] = average_number_of_semantic_sets_correct
    result_dict['average_number_of_semantic_sets_on_subsets_incorrect'] = average_number_of_semantic_sets_incorrect
    result_dict['model_name'] = model_name
    result_dict['run_name'] = run_name

    wandb.log(result_dict)

    overall_result_dict[run_id] = result_dict
    sequence_embeddings_dict[run_id] = sequence_embeddings

    wandb.finish()
    torch.cuda.empty_cache()

with open('overall_results.json', 'w') as f:
    json.dump(overall_result_dict, f)

with open('sequence_embeddings.pkl', 'wb') as f:
    pickle.dump(sequence_embeddings_dict, f)

# Store data frame as csv
accuracy_verification_df = result_df[['most_likely_generation', 'answer', 'correct']]
accuracy_verification_df.to_csv('accuracy_verification.csv')
--------------------------------------------------------------------------------