├── figures ├── trivia_qa │ ├── 30b_results.pdf │ ├── accuracy_versus_model_size.pdf │ ├── ln_predictive_entropy_auroc.pdf │ ├── temperature_comparisons_trivia_qa.pdf │ ├── ln_predictive_entropy_auroc_triviaqa.pdf │ └── ln_predictive_entropy_auroc_triviaqa_with_margin.pdf ├── auroc_as_function_of_n_samples.pdf └── coqa │ ├── accuracy_versus_model_size.pdf │ ├── temperature_comparisons_coqa.pdf │ ├── ln_predictive_entropy_auroc_coqa.pdf │ └── ln_predictive_entropy_auroc_coqa_with_margin.pdf ├── code ├── run_pipeline.sh ├── config.py ├── clean_generated_strings.py ├── parse_coqa.py ├── parse_triviaqa.py ├── get_prompting_based_uncertainty.py ├── environment.yml ├── get_semantic_similarities.py ├── get_likelihoods.py ├── compute_confidence_measure.py ├── generate.py └── analyze_results.py ├── LICENSE ├── .gitignore └── README.md /figures/trivia_qa/30b_results.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lorenzkuhn/semantic_uncertainty/HEAD/figures/trivia_qa/30b_results.pdf -------------------------------------------------------------------------------- /figures/auroc_as_function_of_n_samples.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lorenzkuhn/semantic_uncertainty/HEAD/figures/auroc_as_function_of_n_samples.pdf -------------------------------------------------------------------------------- /figures/coqa/accuracy_versus_model_size.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lorenzkuhn/semantic_uncertainty/HEAD/figures/coqa/accuracy_versus_model_size.pdf -------------------------------------------------------------------------------- /figures/coqa/temperature_comparisons_coqa.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lorenzkuhn/semantic_uncertainty/HEAD/figures/coqa/temperature_comparisons_coqa.pdf -------------------------------------------------------------------------------- /figures/trivia_qa/accuracy_versus_model_size.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lorenzkuhn/semantic_uncertainty/HEAD/figures/trivia_qa/accuracy_versus_model_size.pdf -------------------------------------------------------------------------------- /figures/coqa/ln_predictive_entropy_auroc_coqa.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lorenzkuhn/semantic_uncertainty/HEAD/figures/coqa/ln_predictive_entropy_auroc_coqa.pdf -------------------------------------------------------------------------------- /figures/trivia_qa/ln_predictive_entropy_auroc.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lorenzkuhn/semantic_uncertainty/HEAD/figures/trivia_qa/ln_predictive_entropy_auroc.pdf -------------------------------------------------------------------------------- /figures/trivia_qa/temperature_comparisons_trivia_qa.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lorenzkuhn/semantic_uncertainty/HEAD/figures/trivia_qa/temperature_comparisons_trivia_qa.pdf -------------------------------------------------------------------------------- /figures/trivia_qa/ln_predictive_entropy_auroc_triviaqa.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/lorenzkuhn/semantic_uncertainty/HEAD/figures/trivia_qa/ln_predictive_entropy_auroc_triviaqa.pdf -------------------------------------------------------------------------------- /figures/coqa/ln_predictive_entropy_auroc_coqa_with_margin.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lorenzkuhn/semantic_uncertainty/HEAD/figures/coqa/ln_predictive_entropy_auroc_coqa_with_margin.pdf -------------------------------------------------------------------------------- /figures/trivia_qa/ln_predictive_entropy_auroc_triviaqa_with_margin.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lorenzkuhn/semantic_uncertainty/HEAD/figures/trivia_qa/ln_predictive_entropy_auroc_triviaqa_with_margin.pdf -------------------------------------------------------------------------------- /code/run_pipeline.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --cpus-per-task=24 3 | #SBATCH --gres=gpu:a100:1 4 | #SBATCH --job-name="nlg_uncertainty" 5 | `` 6 | 7 | 8 | run_id=`python -c "import wandb; run_id = wandb.util.generate_id(); wandb.init(project='nlg_uncertainty', id=run_id); print(run_id)"` 9 | 10 | model='opt-350m' 11 | srun python generate.py --num_generations_per_prompt='5' --model=$model --fraction_of_data_to_use='0.02' --run_id=$run_id --temperature='0.5' --num_beams='1' --top_p='1.0'; srun python clean_generated_strings.py --generation_model=$model --run_id=$run_id; python get_semantic_similarities.py --generation_model=$model --run_id=$run_id; python get_likelihoods.py --evaluation_model=$model --generation_model=$model --run_id=$run_id; srun python get_prompting_based_uncertainty.py --run_id_for_few_shot_prompt=$run_id --run_id_for_evaluation=$run_id; python compute_confidence_measure.py --generation_model=$model --evaluation_model=$model --run_id=$run_id 12 | 13 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Lorenz Kuhn 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /code/config.py: -------------------------------------------------------------------------------- 1 | device_map = { 2 | 'model.decoder.embed_tokens': 0, 3 | 'model.decoder.embed_positions': 0, 4 | 'model.decoder.layers.0': 0, 5 | 'model.decoder.layers.1': 0, 6 | 'model.decoder.layers.2': 0, 7 | 'model.decoder.layers.3': 0, 8 | 'model.decoder.layers.4': 0, 9 | 'model.decoder.layers.5': 0, 10 | 'model.decoder.layers.6': 0, 11 | 'model.decoder.layers.7': 0, 12 | 'model.decoder.layers.8': 0, 13 | 'model.decoder.layers.9': 0, 14 | 'model.decoder.layers.10': 0, 15 | 'model.decoder.layers.11': 0, 16 | 'model.decoder.layers.12': 0, 17 | 'model.decoder.layers.13': 0, 18 | 'model.decoder.layers.14': 0, 19 | 'model.decoder.layers.15': 0, 20 | 'model.decoder.layers.16': 0, 21 | 'model.decoder.layers.17': 0, 22 | 'model.decoder.layers.18': 0, 23 | 'model.decoder.layers.19': 0, 24 | 'model.decoder.layers.20': 0, 25 | 'model.decoder.layers.21': 0, 26 | 'model.decoder.layers.22': 0, 27 | 'model.decoder.layers.23': 0, 28 | 'model.decoder.layers.24': 0, 29 | 'model.decoder.layers.25': 1, 30 | 'model.decoder.layers.26': 1, 31 | 'model.decoder.layers.27': 1, 32 | 'model.decoder.layers.28': 1, 33 | 'model.decoder.layers.29': 1, 34 | 'model.decoder.layers.30': 1, 35 | 'model.decoder.layers.31': 1, 36 | 'model.decoder.layers.32': 1, 37 | 'model.decoder.layers.33': 1, 38 | 'model.decoder.layers.34': 1, 39 | 'model.decoder.layers.35': 1, 40 | 'model.decoder.layers.36': 1, 41 | 'model.decoder.layers.37': 1, 42 | 'model.decoder.layers.38': 1, 43 | 'model.decoder.layers.39': 1, 44 | 'model.decoder.layers.40': 1, 45 | 'model.decoder.layers.41': 1, 46 | 'model.decoder.layers.42': 1, 47 | 'model.decoder.layers.43': 1, 48 | 'model.decoder.layers.44': 1, 49 | 'model.decoder.layers.45': 1, 50 | 'model.decoder.layers.46': 1, 51 | 'model.decoder.layers.47': 1, 52 | 'model.decoder.layers.48': 1, 53 | 'model.decoder.final_layer_norm': 1, 54 | 'lm_head': 1 55 | } 56 | 57 | data_dir = '' 58 | hf_datasets_cache = '' 59 | output_dir = '' 60 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | *.out 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # pipenv 89 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 90 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 91 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 92 | # install all needed dependencies. 93 | #Pipfile.lock 94 | 95 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 96 | __pypackages__/ 97 | 98 | # Celery stuff 99 | celerybeat-schedule 100 | celerybeat.pid 101 | 102 | # SageMath parsed files 103 | *.sage.py 104 | 105 | # Environments 106 | .env 107 | .venv 108 | env/ 109 | venv/ 110 | ENV/ 111 | env.bak/ 112 | venv.bak/ 113 | 114 | # Spyder project settings 115 | .spyderproject 116 | .spyproject 117 | 118 | # Rope project settings 119 | .ropeproject 120 | 121 | # mkdocs documentation 122 | /site 123 | 124 | # mypy 125 | .mypy_cache/ 126 | .dmypy.json 127 | dmypy.json 128 | 129 | # Pyre type checker 130 | .pyre/ 131 | -------------------------------------------------------------------------------- /code/clean_generated_strings.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import pickle 4 | import random 5 | 6 | import numpy as np 7 | import torch 8 | from tqdm import tqdm 9 | from transformers import AutoTokenizer 10 | 11 | import config 12 | import wandb 13 | 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument('--generation_model', type=str, default='opt-350m') 16 | parser.add_argument('--run_id', type=str, default='run_1') 17 | args = parser.parse_args() 18 | 19 | device = 'cuda' 20 | 21 | # Set a seed value 22 | seed_value = 10 23 | # 1. Set `PYTHONHASHSEED` environment variable at a fixed value 24 | 25 | os.environ['PYTHONHASHSEED'] = str(seed_value) 26 | # 2. Set `python` built-in pseudo-random generator at a fixed value 27 | 28 | random.seed(seed_value) 29 | # 3. 
Set `numpy` pseudo-random generator at a fixed value 30 | 31 | np.random.seed(seed_value) 32 | 33 | #Fix torch random seed 34 | torch.manual_seed(seed_value) 35 | 36 | os.environ["HF_DATASETS_CACHE"] = config.hf_datasets_cache 37 | 38 | generation_tokenizer = AutoTokenizer.from_pretrained(f"facebook/opt-350m", use_fast=False, cache_dir=config.data_dir) 39 | 40 | wandb.init(project='nlg_uncertainty', id=args.run_id, config=args, resume='allow') 41 | 42 | run_name = wandb.run.name 43 | 44 | tokenizer = AutoTokenizer.from_pretrained(f"facebook/opt-350m", use_fast=False, cache_dir=config.data_dir) 45 | 46 | with open(f'{config.output_dir}/{run_name}/{args.generation_model}_generations.pkl', 'rb') as infile: 47 | sequences = pickle.load(infile) 48 | 49 | cleaned_sequences = [] 50 | 51 | for sample in tqdm(sequences): 52 | cleaned_generations = torch.ones_like(sample['generations']) 53 | question = sample['question'] 54 | generated_texts = sample['generated_texts'] 55 | cleaned_generated_texts = [] 56 | 57 | max_len_of_generations = cleaned_generations.shape[-1] 58 | 59 | strings_to_filter_on = [ 60 | '.', '\n', 'Q:', 'A:', 'question:', 'answer:', 'Question:', 'Answer:', 'Questions:', 'questions:', 'QUESTION:', 61 | 'ANSWER:' 62 | ] 63 | 64 | for i, generated_text in enumerate(generated_texts): 65 | for string in strings_to_filter_on: 66 | if string in generated_text: 67 | generated_text = generated_text.split(string)[0] 68 | cleaned_generated_texts.append(generated_text) 69 | clean_ids = torch.cat( 70 | [sample['prompt'].to(device), 71 | torch.tensor(tokenizer(generated_text)['input_ids'][1:], device=device)]) 72 | cleaned_generations[i, :min(len(clean_ids), max_len_of_generations)] = clean_ids[:max_len_of_generations] 73 | 74 | sample['cleaned_generated_texts'] = cleaned_generated_texts 75 | sample['cleaned_generations'] = cleaned_generations 76 | cleaned_sequences.append(sample) 77 | 78 | with open(f'{config.output_dir}/{run_name}/{args.generation_model}_generations.pkl', 'wb') as outfile: 79 | pickle.dump(cleaned_sequences, outfile) 80 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | *** 2 | 3 | # June 2024 — [A new and improved implementation of Semantic Uncertainty is available](https://github.com/jlko/semantic_uncertainty), this repo is deprecated 4 | We're excited to share a [new implementation of semantic uncertainty](https://github.com/jlko/semantic_uncertainty) which corresponds to our [2024 Nature paper](https://www.nature.com/articles/s41586-024-07421-0) _Detecting Hallucinations in Large Language Models Using Semantic Entropy_. Please use the new and improved version; we are deprecating this repository. Thank you for your interest! 5 | 6 | This repository contains the code for our 2023 ICLR paper _Semantic Uncertainty: Linguistic Invariances for Uncertainty Estimation in Natural Language Generation_.
7 | 8 | *** 9 | 10 | 11 | ### Semantic Uncertainty: Linguistic Invariances for Uncertainty Estimation in Natural Language Generation 12 | 13 | ![image](https://user-images.githubusercontent.com/9898136/223775961-7f9525fc-9674-4bf4-b15f-d49487daddca.png) 14 | 15 | # Overview 16 | 17 | This repository contains the code used in Semantic Uncertainty: Linguistic Invariances for Uncertainty Estimation in Natural Language Generation ([arXiv](https://arxiv.org/abs/2302.09664)). 18 | 19 | `run_pipeline.sh` is a Slurm batch script that executes all steps of our pipeline. `sbatch run_pipeline.sh` submits the batch script. 20 | 21 | ### Preprocessing & Config 22 | 23 | `parse_triviaqa.py` and `parse_coqa.py` load TriviaQA and CoQA from HuggingFace, tokenize them, and store the data sets. These scripts only have to be run once. 24 | 25 | You'll also have to set the paths where you would like to store intermediate and final results of the pipeline in `config.py`. 26 | 27 | The `environment.yml` lists the dependencies of the conda environment we used for our experiments. 28 | 29 | ### Generating answers and computing uncertainty measures 30 | 31 | The components of our pipeline are: 32 | 33 | * `generate.py` generates a number of answers for a subset of questions of a given data set. This step also evaluates the question-answering accuracy of the generated answers. 34 | * `clean_generated_strings.py` post-processes the generations from the first step, mainly by removing any unwanted trailing text, e.g. in cases where the model first gives the answer to the given question and then generates an additional question. 35 | * `get_semantic_similarities.py` identifies semantic clusters in the generated set of answers from the previous step. 36 | * `get_prompting_based_uncertainty.py` computes the p(True) baseline. 37 | * `get_likelihoods.py` computes the likelihoods of the generated answers under the generating model. 38 | * `compute_confidence_measure.py` computes a range of different confidence/uncertainty measures, such as semantic entropy, predictive entropy, lexical similarity, and p(True); see the sketch at the end of this README. 39 | 40 | ### Analyzing results 41 | 42 | After running the pipeline, use `analyze_results.py` to compute performance metrics, such as the AUROC. 43 | 44 | ### Hardware requirements 45 | 46 | Most model runs should run with at most 40GB of GPU memory. The exceptions are the experiments on OPT-30B, which we run on two 80GB A100s. 47 | 48 | ### Dependencies 49 | 50 | Our implementation uses PyTorch and HuggingFace. We use `wandb` to track our runs.
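### Semantic entropy in a nutshell

As a quick orientation, below is a minimal, self-contained sketch of the idea behind the semantic entropy computed in `compute_confidence_measure.py`: answers are first grouped into semantic clusters (see `get_semantic_similarities.py`), the probability mass of answers within a cluster is pooled, and the entropy is then estimated over clusters rather than over individual strings. The function and variable names are illustrative only, and the sketch leaves out details of the real pipeline such as the constant log-likelihood shift and the handling of multiple evaluation models.

```python
import torch

def semantic_entropy(log_likelihoods: torch.Tensor, semantic_set_ids: torch.Tensor) -> torch.Tensor:
    """Illustrative Monte-Carlo estimate of the entropy over semantic clusters."""
    cluster_log_likelihoods = []
    for set_id in torch.unique(semantic_set_ids):
        # Pool the probability mass of all sampled answers that share a meaning.
        members = log_likelihoods[semantic_set_ids == set_id]
        cluster_log_likelihoods.append(torch.logsumexp(members, dim=0))
    cluster_log_likelihoods = torch.stack(cluster_log_likelihoods)
    # Average negative cluster log-likelihood across the observed clusters.
    return -cluster_log_likelihoods.mean()

# Three sampled answers; the first two are paraphrases and share cluster id 0.
log_likelihoods = torch.tensor([-1.2, -1.5, -4.0])   # log p(answer | question), length-normalised
semantic_set_ids = torch.tensor([0, 0, 1])           # cluster ids from the NLI step
print(semantic_entropy(log_likelihoods, semantic_set_ids))
```

Because paraphrases share a cluster, rephrasings of the same answer do not inflate the estimate, which is the main difference to the standard predictive entropy baseline.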
environment 51 | -------------------------------------------------------------------------------- /code/parse_coqa.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import evaluate 4 | import pandas as pd 5 | import torch 6 | from datasets import Dataset 7 | from transformers import AutoModelForSequenceClassification, AutoTokenizer 8 | 9 | import config 10 | 11 | with open(f'{config.data_dir}/coqa-dev-v1.0.json', 'r') as infile: 12 | data = json.load(infile)['data'] 13 | 14 | rouge = evaluate.load('rouge') 15 | 16 | tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-large-mnli") 17 | 18 | model = AutoModelForSequenceClassification.from_pretrained("microsoft/deberta-large-mnli").cuda() 19 | 20 | dataset = {} 21 | 22 | dataset['story'] = [] 23 | dataset['question'] = [] 24 | dataset['answer'] = [] 25 | dataset['additional_answers'] = [] 26 | dataset['rouge1'] = [] 27 | dataset['rouge2'] = [] 28 | dataset['rougeL'] = [] 29 | dataset['semantic_variability'] = [] 30 | dataset['id'] = [] 31 | 32 | for sample_id, sample in enumerate(data): 33 | story = sample['story'] 34 | questions = sample['questions'] 35 | answers = sample['answers'] 36 | additional_answers = sample['additional_answers'] 37 | for question_index, question in enumerate(questions): 38 | dataset['story'].append(story) 39 | dataset['question'].append(question['input_text']) 40 | dataset['answer'].append({ 41 | 'text': answers[question_index]['input_text'], 42 | 'answer_start': answers[question_index]['span_start'] 43 | }) 44 | dataset['id'].append(sample['id'] + '_' + str(question_index)) 45 | additional_answers_list = [] 46 | 47 | for i in range(3): 48 | additional_answers_list.append(additional_answers[str(i)][question_index]['input_text']) 49 | 50 | dataset['additional_answers'].append(additional_answers_list) 51 | story = story + ' Q: ' + question['input_text'] + ' A: ' + answers[question_index]['input_text'] 52 | if not story[-1] == '.': 53 | story = story + '.' 
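# The block below compares every ordered pair of reference answers for this question:
# each pair is run through the DeBERTa-MNLI classifier (a predicted label of 0, i.e.
# contradiction for this checkpoint, marks the question as having semantically different
# reference answers), and the same answer pairs are then scored with ROUGE as a measure
# of surface-level similarity.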
54 | all_answers = [answers[question_index]['input_text']] + additional_answers_list 55 | 56 | answer_list_1 = [] 57 | answer_list_2 = [] 58 | has_semantically_different_answers = False 59 | inputs = [] 60 | 61 | # This computes the syntactic similarity across the reference answers 62 | for i, reference_answer in enumerate(all_answers): 63 | for j in range(4): 64 | if i != j: 65 | answer_list_1.append(all_answers[i]) 66 | answer_list_2.append(all_answers[j]) 67 | 68 | qa_1 = question['input_text'] + ' ' + all_answers[i] 69 | qa_2 = question['input_text'] + ' ' + all_answers[j] 70 | 71 | input = qa_1 + ' [SEP] ' + qa_2 72 | 73 | inputs.append(input) 74 | #print(encoded_input) 75 | 76 | encoded_input = tokenizer.batch_encode_plus(inputs, padding=True) 77 | 78 | prediction = model(torch.tensor(encoded_input['input_ids'], device='cuda'))['logits'] 79 | 80 | predicted_label = torch.argmax(prediction, dim=1) 81 | if 0 in predicted_label: 82 | has_semantically_different_answers = True 83 | 84 | dataset['semantic_variability'].append(has_semantically_different_answers) 85 | 86 | results = rouge.compute(predictions=answer_list_1, references=answer_list_2) 87 | dataset['rouge1'].append(results['rouge1'].mid.fmeasure) 88 | dataset['rouge2'].append(results['rouge2'].mid.fmeasure) 89 | dataset['rougeL'].append(results['rougeL'].mid.fmeasure) 90 | 91 | dataset_df = pd.DataFrame.from_dict(dataset) 92 | 93 | dataset = Dataset.from_pandas(dataset_df) 94 | 95 | dataset.save_to_disk(f'{config.data_dir}/coqa_dataset') 96 | -------------------------------------------------------------------------------- /code/parse_triviaqa.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pathlib 3 | import pickle 4 | 5 | import accelerate 6 | import datasets 7 | import torch 8 | from transformers import AutoModelForCausalLM, AutoTokenizer 9 | 10 | import config 11 | 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('--type_of_question', type=str) 14 | parser.add_argument('--num_generations_per_prompt', type=int, default=5) 15 | parser.add_argument('--fraction_of_data_to_use', type=float, default=0.9) 16 | parser.add_argument('--model', type=str, default='opt-350m') 17 | parser.add_argument('--run_id', type=str, default='run_1') 18 | parser.add_argument('--temperature', type=float, default='1.0') 19 | parser.add_argument('--num_beams', type=int, default='5') 20 | parser.add_argument('--decoding_method', type=str, default='beam_search') 21 | parser.add_argument('--top_p', type=float, default=1.0) 22 | args = parser.parse_args() 23 | 24 | model = AutoModelForCausalLM.from_pretrained(f"facebook/{args.model}", 25 | torch_dtype=torch.float16, 26 | cache_dir=config.data_dir).cuda() 27 | tokenizer = AutoTokenizer.from_pretrained(f"facebook/opt-350m", use_fast=False, cache_dir=config.data_dir) 28 | 29 | if args.model == 'opt-30b': 30 | accelerate.dispatch_model(model, device_map=config.device_map) 31 | 32 | seed_value = 10 33 | 34 | if not pathlib.Path(f'{config.data_dir}/trivia_qa').exists(): 35 | 36 | print('Preprocessing dataset') 37 | val_data = datasets.load_dataset("trivia_qa", "rc.nocontext", split="validation") 38 | train_data = datasets.load_dataset("trivia_qa", "rc.nocontext", split="train") 39 | data_for_few_shot_prompt = train_data.select(range(0, 10)) 40 | 41 | few_shot_prompt = 'This is a bot that correctly answers questions. 
\n' 42 | for sample in data_for_few_shot_prompt: 43 | few_shot_prompt += 'Question: ' + sample['question'] + ' Answer: ' + sample['answer']['value'] + ' ' 44 | 45 | batch_size = 4 # change to 16 for full training 46 | encoder_max_length = 1024 47 | decoder_max_length = 128 48 | 49 | def process_data_to_model_inputs(batch): 50 | # tokenize the inputs and labels 51 | answers = [answer["value"] for answer in batch["answer"]] 52 | 53 | batch_with_prompt = [few_shot_prompt + "Question: " + question + " Answer:" for question in batch["question"]] 54 | inputs = tokenizer(batch_with_prompt, padding=False, truncation=False) 55 | outputs = tokenizer(answers, padding=False, truncation=False) 56 | 57 | batch["input_ids"] = inputs.input_ids 58 | batch["attention_mask"] = inputs.attention_mask 59 | batch["decoder_input_ids"] = outputs.input_ids 60 | batch["decoder_attention_mask"] = outputs.attention_mask 61 | batch["labels"] = outputs.input_ids.copy() 62 | batch['answer'] = answers 63 | 64 | # because BERT automatically shifts the labels, the labels correspond exactly to `decoder_input_ids`. 65 | # We have to make sure that the PAD token is ignored 66 | batch["labels"] = [ 67 | [-100 if token == tokenizer.pad_token_id else token for token in labels] for labels in batch["labels"] 68 | ] 69 | 70 | return batch 71 | 72 | val_data = val_data.map(process_data_to_model_inputs, 73 | batched=True, 74 | batch_size=batch_size, 75 | remove_columns=["search_results", "question_source", "entity_pages"]) 76 | val_data.set_format( 77 | type="torch", 78 | columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"], 79 | output_all_columns=True) 80 | 81 | val_data.save_to_disk(f'{config.data_dir}/trivia_qa') 82 | else: 83 | 84 | val_data = datasets.load_from_disk(f'{config.data_dir}/trivia_qa') 85 | -------------------------------------------------------------------------------- /code/get_prompting_based_uncertainty.py: -------------------------------------------------------------------------------- 1 | # Read generation results 2 | import argparse 3 | import os 4 | import pickle 5 | import random 6 | 7 | import accelerate 8 | import matplotlib.pyplot as plt 9 | import numpy as np 10 | import seaborn as sns 11 | import sklearn 12 | import torch 13 | from tqdm import tqdm 14 | from transformers import AutoModelForCausalLM, AutoTokenizer 15 | 16 | import config 17 | #sns.color_palette("pastel") 18 | import wandb 19 | from config import device_map 20 | 21 | # Set a seed value 22 | seed_value = 10 23 | # 1. Set `PYTHONHASHSEED` environment variable at a fixed value 24 | 25 | os.environ['PYTHONHASHSEED'] = str(seed_value) 26 | # 2. Set `python` built-in pseudo-random generator at a fixed value 27 | 28 | random.seed(seed_value) 29 | # 3. 
Set `numpy` pseudo-random generator at a fixed value 30 | 31 | np.random.seed(seed_value) 32 | 33 | device = torch.device('cuda') 34 | 35 | #Fix torch random seed 36 | torch.manual_seed(seed_value) 37 | 38 | os.environ["HF_DATASETS_CACHE"] = config.hf_datasets_cache 39 | 40 | parser = argparse.ArgumentParser() 41 | parser.add_argument('--generation_model', type=str, default='opt-1.3b') 42 | parser.add_argument('--run_id_for_few_shot_prompt', type=str, default='run_1') 43 | parser.add_argument('--run_id_for_evaluation', type=str, default='run_1') 44 | args = parser.parse_args() 45 | 46 | wandb.init(project='nlg_uncertainty', id=args.run_id_for_few_shot_prompt, config=args, resume='allow') 47 | model_name = wandb.config.model 48 | 49 | generation_tokenizer = AutoTokenizer.from_pretrained(f"facebook/opt-350m", use_fast=False, cache_dir=config.data_dir) 50 | model = AutoModelForCausalLM.from_pretrained(f"facebook/{model_name}", 51 | torch_dtype=torch.float16, 52 | cache_dir=config.data_dir).cuda() 53 | 54 | if model_name == 'opt-30b': 55 | accelerate.dispatch_model(model, device_map=device_map) 56 | print(model.hf_device_map) 57 | device = torch.device('cuda:1') 58 | 59 | run_name = wandb.run.name 60 | 61 | with open(f'{config.output_dir} /{run_name}/{model_name}_generations.pkl', 'rb') as infile: 62 | sequences_for_few_shot_prompt = pickle.load(infile) 63 | 64 | wandb.finish() 65 | 66 | # Build few shot prompt 67 | 68 | subset_of_sequences_for_few_shot_prompt = sequences_for_few_shot_prompt[-10:] 69 | number_of_few_shot_samples = 5 70 | 71 | prompt_template = 'Question: {} \n Here are some ideas that were brainstormed:{}\n Possible answer:{}\n Is the possible answer:\n (A) True\n (B) False\n The possible answer is:' 72 | few_shot_promopt = '' 73 | for sequence in subset_of_sequences_for_few_shot_prompt: 74 | question = sequence['question'] 75 | question = question.split('Question: ')[-1].split('Answer: ')[0] 76 | prompt = sequence['prompt'] 77 | generated_texts = '\n'.join(sequence['cleaned_generated_texts'][:number_of_few_shot_samples]) 78 | 79 | most_likely_answer = sequence['most_likely_generation'] 80 | correct = ' True' if sequence['rougeL_to_target'] > 0.3 else ' False' 81 | few_shot_promopt += prompt_template.format(question, generated_texts, most_likely_answer) + correct + '\n' 82 | 83 | # Build prompt for question 84 | labels_across_datasets = [] 85 | p_trues_across_datasets = [] 86 | 87 | n_samples_to_use = 2000 88 | 89 | with torch.no_grad(): 90 | 91 | aurocs = [] 92 | p_trues = [] 93 | corrects = [] 94 | for sequence in tqdm(sequences_for_few_shot_prompt[:n_samples_to_use]): 95 | 96 | question = sequence['question'] 97 | if 'Question: ' in question: 98 | question = question.split('Question: ')[-1].split('Answer: ')[0] 99 | else: 100 | question = question.split('Q: ')[-1].split('A: ')[0] 101 | 102 | generated_texts = '\n'.join(sequence['cleaned_generated_texts'][:number_of_few_shot_samples]) 103 | most_likely_answer = sequence['most_likely_generation'] 104 | correct = 1.0 if sequence['rougeL_to_target'] > 0.3 else 0.0 105 | base_prompt = prompt_template.format(question, generated_texts, most_likely_answer) 106 | prompt_true = few_shot_promopt + prompt_template.format(question, generated_texts, most_likely_answer) + ' True' 107 | 108 | # This computation of the negative log likelihoods follows this tutorial: https://huggingface.co/docs/transformers/perplexity 109 | tokenized_base_prompt = generation_tokenizer(base_prompt)['input_ids'] 110 | tokenized_prompt_true = 
torch.tensor(generation_tokenizer(prompt_true)['input_ids'], device=device) 111 | 112 | target_ids_true = tokenized_prompt_true.clone() 113 | target_ids_true[:len(tokenized_base_prompt)] = -100 114 | 115 | model_output_true = model(torch.reshape(tokenized_prompt_true, (1, -1)), labels=target_ids_true) 116 | loss_true = model_output_true.loss 117 | 118 | p_trues.append(loss_true.item()) 119 | corrects.append(correct) 120 | 121 | labels_across_datasets += corrects 122 | p_trues_across_datasets += p_trues 123 | 124 | p_true_auroc = sklearn.metrics.roc_auc_score(1 - torch.tensor(corrects), torch.tensor(p_trues)) 125 | 126 | # Store p_true aurocs in a pickle file 127 | with open(f'{config.output_dir}/{run_name}/{model_name}_p_true_aurocs.pkl', 'wb') as outfile: 128 | pickle.dump(p_true_auroc, outfile) 129 | -------------------------------------------------------------------------------- /code/environment.yml: -------------------------------------------------------------------------------- 1 | name: unanswerable 2 | channels: 3 | - pytorch 4 | - defaults 5 | - conda-forge 6 | dependencies: 7 | - _libgcc_mutex=0.1=main 8 | - _openmp_mutex=5.1=1_gnu 9 | - blas=1.0=mkl 10 | - brotlipy=0.7.0=py310h7f8727e_1002 11 | - bzip2=1.0.8=h7b6447c_0 12 | - ca-certificates=2022.4.26=h06a4308_0 13 | - cffi=1.15.0=py310hd667e15_1 14 | - cryptography=37.0.1=py310h9ce1e76_0 15 | - cudatoolkit=11.3.1=h2bc3f7f_2 16 | - debugpy=1.5.1=py310h295c915_0 17 | - entrypoints=0.4=py310h06a4308_0 18 | - ffmpeg=4.3=hf484d3e_0 19 | - freetype=2.11.0=h70c0345_0 20 | - giflib=5.2.1=h7b6447c_0 21 | - gmp=6.2.1=h295c915_3 22 | - gnutls=3.6.15=he1e5248_0 23 | - intel-openmp=2021.4.0=h06a4308_3561 24 | - ipykernel=6.9.1=py310h06a4308_0 25 | - ipython=8.3.0=py310h06a4308_0 26 | - jedi=0.18.1=py310h06a4308_1 27 | - jpeg=9e=h7f8727e_0 28 | - jupyter_client=7.2.2=py310h06a4308_0 29 | - jupyter_core=4.10.0=py310h06a4308_0 30 | - lame=3.100=h7b6447c_0 31 | - lcms2=2.12=h3be6417_0 32 | - ld_impl_linux-64=2.38=h1181459_1 33 | - libffi=3.3=he6710b0_2 34 | - libgcc-ng=11.2.0=h1234567_1 35 | - libgomp=11.2.0=h1234567_1 36 | - libiconv=1.16=h7f8727e_2 37 | - libidn2=2.3.2=h7f8727e_0 38 | - libpng=1.6.37=hbc83047_0 39 | - libsodium=1.0.18=h7b6447c_0 40 | - libstdcxx-ng=11.2.0=h1234567_1 41 | - libtasn1=4.16.0=h27cfd23_0 42 | - libtiff=4.2.0=h2818925_1 43 | - libunistring=0.9.10=h27cfd23_0 44 | - libuuid=1.0.3=h7f8727e_2 45 | - libuv=1.40.0=h7b6447c_0 46 | - libwebp=1.2.2=h55f646e_0 47 | - libwebp-base=1.2.2=h7f8727e_0 48 | - lz4-c=1.9.3=h295c915_1 49 | - mkl=2021.4.0=h06a4308_640 50 | - mkl-service=2.4.0=py310h7f8727e_0 51 | - mkl_fft=1.3.1=py310hd6ae3a3_0 52 | - mkl_random=1.2.2=py310h00e6091_0 53 | - ncurses=6.3=h7f8727e_2 54 | - nest-asyncio=1.5.5=py310h06a4308_0 55 | - nettle=3.7.3=hbbd107a_1 56 | - numpy=1.22.3=py310hfa59a62_0 57 | - numpy-base=1.22.3=py310h9585f30_0 58 | - openh264=2.1.1=h4ff587b_0 59 | - openssl=1.1.1o=h7f8727e_0 60 | - pillow=9.0.1=py310h22f2fdc_0 61 | - pure_eval=0.2.2=pyhd3eb1b0_0 62 | - pysocks=1.7.1=py310h06a4308_0 63 | - python=3.10.4=h12debd9_0 64 | - pytorch=1.11.0=py3.10_cuda11.3_cudnn8.2.0_0 65 | - pytorch-mutex=1.0=cuda 66 | - pyzmq=22.3.0=py310h295c915_2 67 | - readline=8.1.2=h7f8727e_1 68 | - setuptools=61.2.0=py310h06a4308_0 69 | - sqlite=3.38.5=hc218d9a_0 70 | - stack_data=0.2.0=pyhd3eb1b0_0 71 | - tk=8.6.12=h1ccaba5_0 72 | - torchaudio=0.11.0=py310_cu113 73 | - torchvision=0.12.0=py310_cu113 74 | - tornado=6.1=py310h7f8727e_0 75 | - typing_extensions=4.1.1=pyh06a4308_0 76 | - tzdata=2022a=hda174b7_0 77 | - 
urllib3=1.26.9=py310h06a4308_0 78 | - xz=5.2.5=h7f8727e_1 79 | - zeromq=4.3.4=h2531618_0 80 | - zlib=1.2.12=h7f8727e_2 81 | - zstd=1.5.2=ha4553b6_0 82 | - pip: 83 | - absl-py==1.2.0 84 | - accelerate==0.12.0 85 | - aiohttp==3.8.1 86 | - aiosignal==1.2.0 87 | - asttokens==2.0.5 88 | - async-timeout==4.0.2 89 | - attrs==21.4.0 90 | - backcall==0.2.0 91 | - bleach==5.0.1 92 | - certifi==2022.6.15 93 | - charset-normalizer==2.0.4 94 | - click==8.1.3 95 | - cloudpickle==2.2.0 96 | - cycler==0.11.0 97 | - dask==2022.10.2 98 | - datasets==2.3.2 99 | - decorator==5.1.1 100 | - dill==0.3.5.1 101 | - docker-pycreds==0.4.0 102 | - et-xmlfile==1.1.0 103 | - evaluate==0.1.2 104 | - executing==0.8.3 105 | - filelock==3.7.1 106 | - fonttools==4.34.0 107 | - frozenlist==1.3.0 108 | - fsspec==2022.5.0 109 | - gitdb==4.0.9 110 | - gitpython==3.1.27 111 | - huggingface-hub==0.8.1 112 | - idna==3.3 113 | - ipywidgets==8.0.2 114 | - joblib==1.1.0 115 | - jupyterlab-widgets==3.0.3 116 | - kiwisolver==1.4.3 117 | - locket==1.0.0 118 | - matplotlib==3.5.2 119 | - matplotlib-inline==0.1.2 120 | - multidict==6.0.2 121 | - multiprocess==0.70.13 122 | - nltk==3.7 123 | - openai==0.24.0 124 | - openpyxl==3.0.10 125 | - packaging==21.3 126 | - pandas==1.4.3 127 | - pandas-stubs==1.5.1.221024 128 | - parso==0.8.3 129 | - partd==1.3.0 130 | - pathtools==0.1.2 131 | - pexpect==4.8.0 132 | - pickleshare==0.7.5 133 | - pip==21.2.4 134 | - promise==2.3 135 | - prompt-toolkit==3.0.20 136 | - protobuf==3.20.1 137 | - psutil==5.9.1 138 | - ptyprocess==0.7.0 139 | - pure-eval==0.2.2 140 | - pyarrow==8.0.0 141 | - pycparser==2.21 142 | - pygments==2.11.2 143 | - pyopenssl==22.0.0 144 | - pyparsing==3.0.9 145 | - python-dateutil==2.8.2 146 | - pytz==2022.1 147 | - pyyaml==6.0 148 | - regex==2022.6.2 149 | - requests==2.27.1 150 | - responses==0.18.0 151 | - rouge-score==0.0.4 152 | - scikit-learn==1.1.1 153 | - scipy==1.8.1 154 | - seaborn==0.11.2 155 | - sentencepiece==0.1.96 156 | - sentry-sdk==1.9.7 157 | - setproctitle==1.3.2 158 | - shortuuid==1.0.9 159 | - six==1.16.0 160 | - sklearn==0.0 161 | - smmap==5.0.0 162 | - stack-data==0.2.0 163 | - swifter==1.3.4 164 | - threadpoolctl==3.1.0 165 | - tokenizers==0.12.1 166 | - toolz==0.12.0 167 | - torchmetrics==0.9.2 168 | - tqdm==4.64.0 169 | - traitlets==5.1.1 170 | - transformers==4.20.1 171 | - types-pytz==2022.5.0.0 172 | - typing-extensions==4.1.1 173 | - wandb==0.13.2 174 | - wcwidth==0.2.5 175 | - webencodings==0.5.1 176 | - wheel==0.37.1 177 | - widgetsnbextension==4.0.3 178 | - xxhash==3.0.0 179 | - yarl==1.7.2 180 | prefix: /users/loruhn/.conda/envs/unanswerable 181 | -------------------------------------------------------------------------------- /code/get_semantic_similarities.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import csv 3 | import os 4 | import pickle 5 | import random 6 | 7 | import evaluate 8 | import numpy as np 9 | import torch 10 | from tqdm import tqdm 11 | from transformers import AutoModelForSequenceClassification, AutoTokenizer 12 | 13 | import config 14 | import wandb 15 | 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument('--generation_model', type=str, default='opt-350m') 18 | parser.add_argument('--run_id', type=str, default='run_1') 19 | args = parser.parse_args() 20 | 21 | device = 'cuda' 22 | 23 | # Set a seed value 24 | seed_value = 10 25 | # 1. 
Set `PYTHONHASHSEED` environment variable at a fixed value 26 | 27 | os.environ['PYTHONHASHSEED'] = str(seed_value) 28 | # 2. Set `python` built-in pseudo-random generator at a fixed value 29 | 30 | random.seed(seed_value) 31 | # 3. Set `numpy` pseudo-random generator at a fixed value 32 | 33 | np.random.seed(seed_value) 34 | 35 | #Fix torch random seed 36 | torch.manual_seed(seed_value) 37 | 38 | os.environ["HF_DATASETS_CACHE"] = config.hf_datasets_cache 39 | 40 | generation_tokenizer = AutoTokenizer.from_pretrained(f"facebook/opt-350m", use_fast=False, cache_dir=config.data_dir) 41 | 42 | tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-large-mnli") 43 | model = AutoModelForSequenceClassification.from_pretrained("microsoft/deberta-large-mnli").cuda() 44 | 45 | wandb.init(project='nlg_uncertainty', id=args.run_id, config=args, resume='allow') 46 | 47 | run_name = wandb.run.name 48 | 49 | with open(f'{config.output_dir}/{run_name}/{args.generation_model}_generations.pkl', 'rb') as infile: 50 | sequences = pickle.load(infile) 51 | 52 | result_dict = {} 53 | 54 | meteor = evaluate.load('meteor') 55 | 56 | deberta_predictions = [] 57 | 58 | for sample in tqdm(sequences): 59 | question = sample['question'] 60 | if 'cleaned_generated_texts' in sample: 61 | generated_texts = sample['cleaned_generated_texts'] 62 | else: 63 | generated_texts = sample['generated_texts'] 64 | 65 | id_ = sample['id'][0] 66 | 67 | unique_generated_texts = list(set(generated_texts)) 68 | 69 | answer_list_1 = [] 70 | answer_list_2 = [] 71 | has_semantically_different_answers = False 72 | inputs = [] 73 | syntactic_similarities = {} 74 | rouge_types = ['rouge1', 'rouge2', 'rougeL'] 75 | for rouge_type in rouge_types: 76 | syntactic_similarities[rouge_type] = 0.0 77 | 78 | semantic_set_ids = {} 79 | for index, answer in enumerate(unique_generated_texts): 80 | semantic_set_ids[answer] = index 81 | 82 | print('Number of unique answers:', len(unique_generated_texts)) 83 | 84 | if len(unique_generated_texts) > 1: 85 | 86 | # Evalauate semantic similarity 87 | for i, reference_answer in enumerate(unique_generated_texts): 88 | for j in range(i + 1, len(unique_generated_texts)): 89 | 90 | answer_list_1.append(unique_generated_texts[i]) 91 | answer_list_2.append(unique_generated_texts[j]) 92 | 93 | qa_1 = question + ' ' + unique_generated_texts[i] 94 | qa_2 = question + ' ' + unique_generated_texts[j] 95 | 96 | input = qa_1 + ' [SEP] ' + qa_2 97 | inputs.append(input) 98 | encoded_input = tokenizer.encode(input, padding=True) 99 | prediction = model(torch.tensor(torch.tensor([encoded_input]), device='cuda'))['logits'] 100 | predicted_label = torch.argmax(prediction, dim=1) 101 | 102 | reverse_input = qa_2 + ' [SEP] ' + qa_1 103 | encoded_reverse_input = tokenizer.encode(reverse_input, padding=True) 104 | reverse_prediction = model(torch.tensor(torch.tensor([encoded_reverse_input]), device='cuda'))['logits'] 105 | reverse_predicted_label = torch.argmax(reverse_prediction, dim=1) 106 | 107 | deberta_prediction = 1 108 | print(qa_1, qa_2, predicted_label, reverse_predicted_label) 109 | if 0 in predicted_label or 0 in reverse_predicted_label: 110 | has_semantically_different_answers = True 111 | deberta_prediction = 0 112 | 113 | else: 114 | semantic_set_ids[unique_generated_texts[j]] = semantic_set_ids[unique_generated_texts[i]] 115 | 116 | deberta_predictions.append([unique_generated_texts[i], unique_generated_texts[j], deberta_prediction]) 117 | 118 | rouge = evaluate.load('rouge') 119 | 120 | # Evalauate syntactic 
similarity 121 | answer_list_1 = [] 122 | answer_list_2 = [] 123 | for i in generated_texts: 124 | for j in generated_texts: 125 | if i != j: 126 | answer_list_1.append(i) 127 | answer_list_2.append(j) 128 | 129 | results = rouge.compute(predictions=answer_list_1, references=answer_list_2) 130 | 131 | for rouge_type in rouge_types: 132 | syntactic_similarities[rouge_type] = results[rouge_type].mid.fmeasure 133 | 134 | result_dict[id_] = { 135 | 'syntactic_similarities': syntactic_similarities, 136 | 'has_semantically_different_answers': has_semantically_different_answers 137 | } 138 | list_of_semantic_set_ids = [semantic_set_ids[x] for x in generated_texts] 139 | result_dict[id_]['semantic_set_ids'] = list_of_semantic_set_ids 140 | 141 | with open('deberta_predictions_{}.csv'.format(args.run_id), 'w', encoding='UTF8', newline='') as f: 142 | writer = csv.writer(f) 143 | # write the header 144 | writer.writerow(['qa_1', 'qa_2', 'prediction']) 145 | writer.writerows(deberta_predictions) 146 | 147 | print(result_dict) 148 | 149 | with open(f'{config.output_dir}/{run_name}/{args.generation_model}_generations_similarities.pkl', 'wb') as outfile: 150 | pickle.dump(result_dict, outfile) 151 | -------------------------------------------------------------------------------- /code/get_likelihoods.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import pickle 4 | import random 5 | 6 | import numpy as np 7 | import torch 8 | from transformers import AutoModelForCausalLM, AutoTokenizer 9 | 10 | import wandb 11 | 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('--evaluation_model', type=str, default='opt-350m') 14 | parser.add_argument('--generation_model', type=str, default='opt-350m') 15 | parser.add_argument('--run_id', type=str, default='run_1') 16 | args = parser.parse_args() 17 | 18 | device = 'cuda' 19 | import config 20 | 21 | # Set a seed value 22 | seed_value = 10 23 | # 1. Set `PYTHONHASHSEED` environment variable at a fixed value 24 | 25 | os.environ['PYTHONHASHSEED'] = str(seed_value) 26 | # 2. Set `python` built-in pseudo-random generator at a fixed value 27 | 28 | random.seed(seed_value) 29 | # 3. 
Set `numpy` pseudo-random generator at a fixed value 30 | 31 | np.random.seed(seed_value) 32 | 33 | #Fix torch random seed 34 | torch.manual_seed(seed_value) 35 | 36 | os.environ["HF_DATASETS_CACHE"] = config.hf_datasets_cache 37 | 38 | model = AutoModelForCausalLM.from_pretrained(f"facebook/{args.evaluation_model}", 39 | torch_dtype=torch.float16, 40 | cache_dir=config.data_dir).cuda() 41 | tokenizer = AutoTokenizer.from_pretrained(f"facebook/{args.evaluation_model}", 42 | use_fast=False, 43 | cache_dir=config.data_dir) 44 | 45 | wandb.init(project='nlg_uncertainty', id=args.run_id, config=args, resume='allow') 46 | 47 | run_name = wandb.run.name 48 | 49 | opt_models = ['opt-125m', 'opt-350m', 'opt-1.3b', 'opt-2.7b', 'opt-6.7b', 'opt-13b', 'opt-30b'] 50 | 51 | with open(f'{config.output_dir}/{run_name}/{args.generation_model}_generations.pkl', 'rb') as infile: 52 | sequences = pickle.load(infile) 53 | 54 | with open(f'{config.output_dir}/{run_name}/{args.generation_model}_generations_similarities.pkl', 'rb') as infile: 55 | similarities_dict = pickle.load(infile) 56 | 57 | 58 | def get_neg_loglikelihoods(model, sequences): 59 | 60 | with torch.no_grad(): 61 | result = [] 62 | for sample in sequences: 63 | result_dict = {} 64 | prompt = sample['prompt'] 65 | if 'cleaned_generations' in sample: 66 | generations = sample['cleaned_generations'].to(device) 67 | else: 68 | generations = sample['generations'].to(device) 69 | id_ = sample['id'] 70 | 71 | average_neg_log_likelihoods = torch.zeros((generations.shape[0],)) 72 | average_unconditioned_neg_log_likelihoods = torch.zeros((generations.shape[0],)) 73 | neg_log_likelihoods = torch.zeros((generations.shape[0],)) 74 | neg_unconditioned_log_likelihoods = torch.zeros((generations.shape[0],)) 75 | pointwise_mutual_information = torch.zeros((generations.shape[0],)) 76 | sequence_embeddings = [] 77 | 78 | for generation_index in range(generations.shape[0]): 79 | prompt = prompt[prompt != tokenizer.pad_token_id] 80 | generation = generations[generation_index][generations[generation_index] != tokenizer.pad_token_id] 81 | 82 | # This computation of the negative log likelihoods follows this tutorial: https://huggingface.co/docs/transformers/perplexity 83 | target_ids = generation.clone() 84 | target_ids[:len(prompt)] = -100 85 | model_output = model(torch.reshape(generation, (1, -1)), labels=target_ids, output_hidden_states=True) 86 | generation_only = generation.clone()[(len(prompt) - 1):] 87 | unconditioned_model_output = model(torch.reshape(generation_only, (1, -1)), 88 | labels=generation_only, 89 | output_hidden_states=True) 90 | hidden_states = model_output['hidden_states'] 91 | average_neg_log_likelihood = model_output['loss'] 92 | 93 | average_unconditioned_neg_log_likelihood = unconditioned_model_output['loss'] 94 | average_neg_log_likelihoods[generation_index] = average_neg_log_likelihood 95 | average_unconditioned_neg_log_likelihoods[generation_index] = average_unconditioned_neg_log_likelihood 96 | neg_log_likelihoods[generation_index] = average_neg_log_likelihood * (len(generation) - len(prompt)) 97 | neg_unconditioned_log_likelihoods[generation_index] = average_unconditioned_neg_log_likelihood * ( 98 | len(generation) - len(prompt)) 99 | pointwise_mutual_information[generation_index] = -neg_log_likelihoods[ 100 | generation_index] + neg_unconditioned_log_likelihoods[generation_index] 101 | 102 | average_of_last_layer_token_embeddings = torch.mean(hidden_states[-1], dim=1) 103 | 
sequence_embeddings.append(average_of_last_layer_token_embeddings) 104 | 105 | most_likely_generation = sample['most_likely_generation_ids'].to(device) 106 | target_ids = most_likely_generation.clone() 107 | target_ids[:len(prompt)] = -100 108 | model_output = model(torch.reshape(most_likely_generation, (1, -1)), 109 | labels=target_ids, 110 | output_hidden_states=True) 111 | hidden_states = model_output['hidden_states'] 112 | average_neg_log_likelihood_of_most_likely_gen = model_output['loss'] 113 | most_likely_generation_embedding = torch.mean(hidden_states[-1], dim=1) 114 | 115 | second_most_likely_generation = sample['second_most_likely_generation_ids'].to(device) 116 | target_ids = second_most_likely_generation.clone() 117 | target_ids[:len(prompt)] = -100 118 | model_output = model(torch.reshape(second_most_likely_generation, (1, -1)), 119 | labels=target_ids, 120 | output_hidden_states=True) 121 | hidden_states = model_output['hidden_states'] 122 | average_neg_log_likelihood_of_second_most_likely_gen = model_output['loss'] 123 | second_most_likely_generation_embedding = torch.mean(hidden_states[-1], dim=1) 124 | 125 | neg_log_likelihood_of_most_likely_gen = average_neg_log_likelihood_of_most_likely_gen * ( 126 | len(most_likely_generation) - len(prompt)) 127 | 128 | sequence_embeddings = torch.stack(sequence_embeddings) 129 | result_dict['prompt'] = prompt 130 | result_dict['generations'] = generations 131 | result_dict['average_neg_log_likelihoods'] = average_neg_log_likelihoods 132 | result_dict['neg_log_likelihoods'] = neg_log_likelihoods 133 | result_dict['sequence_embeddings'] = most_likely_generation_embedding 134 | result_dict['most_likely_sequence_embedding'] = most_likely_generation 135 | result_dict['average_unconditioned_neg_log_likelihoods'] = average_unconditioned_neg_log_likelihoods 136 | result_dict['neg_unconditioned_log_likelihoods'] = neg_unconditioned_log_likelihoods 137 | result_dict['pointwise_mutual_information'] = pointwise_mutual_information 138 | result_dict['average_neg_log_likelihood_of_most_likely_gen'] = average_neg_log_likelihood_of_most_likely_gen 139 | result_dict[ 140 | 'average_neg_log_likelihood_of_second_most_likely_gen'] = average_neg_log_likelihood_of_second_most_likely_gen 141 | result_dict['neg_log_likelihood_of_most_likely_gen'] = neg_log_likelihood_of_most_likely_gen 142 | result_dict['semantic_set_ids'] = torch.tensor(similarities_dict[id_[0]]['semantic_set_ids'], device=device) 143 | result_dict['id'] = id_ 144 | result.append(result_dict) 145 | 146 | return result 147 | 148 | 149 | likelihoods = get_neg_loglikelihoods(model, sequences) 150 | 151 | with open(f'{config.data_dir}/{run_name}/{args.generation_model}_generations_{args.evaluation_model}_likelihoods.pkl', 152 | 'wb') as outfile: 153 | pickle.dump(likelihoods, outfile) 154 | -------------------------------------------------------------------------------- /code/compute_confidence_measure.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import pickle 4 | import random 5 | 6 | import config 7 | import numpy as np 8 | import torch 9 | import wandb 10 | 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('--generation_model', type=str, default='opt-350m') 13 | parser.add_argument('--evaluation_model', type=str, default='opt-350m') 14 | parser.add_argument('--run_id', type=str, default='run_1') 15 | parser.add_argument('--verbose', type=bool, default=True) 16 | args = parser.parse_args() 17 | 18 | 
device = 'cuda' 19 | 20 | # Set a seed value 21 | seed_value = 10 22 | # 1. Set `PYTHONHASHSEED` environment variable at a fixed value 23 | 24 | os.environ['PYTHONHASHSEED'] = str(seed_value) 25 | # 2. Set `python` built-in pseudo-random generator at a fixed value 26 | 27 | random.seed(seed_value) 28 | # 3. Set `numpy` pseudo-random generator at a fixed value 29 | 30 | np.random.seed(seed_value) 31 | 32 | #Fix torch random seed 33 | torch.manual_seed(seed_value) 34 | 35 | os.environ["HF_DATASETS_CACHE"] = config.hf_datasets_cache 36 | 37 | wandb.init(project='nlg_uncertainty', id=args.run_id, config=args, resume='allow') 38 | 39 | run_name = wandb.run.name 40 | 41 | llh_shift = torch.tensor(5.0) 42 | 43 | 44 | def get_overall_log_likelihoods(list_of_results): 45 | """Compute log likelihood of all generations under their given context. 46 | 47 | list_of_results: list of dictionaries with keys: 48 | 49 | returns: dictionary with keys: 'neg_log_likelihoods', 'average_neg_log_likelihoods' 50 | that contains tensors of shape (num_models, num_generations, num_samples_per_generation) 51 | """ 52 | 53 | result_dict = {} 54 | 55 | list_of_keys = ['neg_log_likelihoods', 'average_neg_log_likelihoods', 'sequence_embeddings',\ 56 | 'pointwise_mutual_information', 'average_neg_log_likelihood_of_most_likely_gen',\ 57 | 'neg_log_likelihood_of_most_likely_gen', 'semantic_set_ids'] 58 | 59 | for key in list_of_keys: 60 | list_of_ids = [] 61 | overall_results = [] 62 | for model_size, result in list_of_results: 63 | results_per_model = [] 64 | for sample in result: 65 | average_neg_log_likelihoods = sample[key] 66 | list_of_ids.append(sample['id'][0]) 67 | results_per_model.append(average_neg_log_likelihoods) 68 | 69 | results_per_model = torch.stack(results_per_model) 70 | 71 | overall_results.append(results_per_model) 72 | 73 | if key != 'sequence_embeddings': 74 | overall_results = torch.stack(overall_results) 75 | 76 | result_dict[key] = overall_results 77 | 78 | result_dict['ids'] = list_of_ids 79 | return result_dict 80 | 81 | 82 | def get_mutual_information(log_likelihoods): 83 | """Compute confidence measure for a given set of likelihoods""" 84 | 85 | mean_across_models = torch.logsumexp(log_likelihoods, dim=0) - torch.log(torch.tensor(log_likelihoods.shape[0])) 86 | tiled_mean = mean_across_models.tile(log_likelihoods.shape[0], 1, 1) 87 | diff_term = torch.exp(log_likelihoods) * log_likelihoods - torch.exp(tiled_mean) * tiled_mean 88 | f_j = torch.div(torch.sum(diff_term, dim=0), diff_term.shape[0]) 89 | mutual_information = torch.div(torch.sum(torch.div(f_j, mean_across_models), dim=1), f_j.shape[-1]) 90 | 91 | return mutual_information 92 | 93 | 94 | def get_log_likelihood_variance(neg_log_likelihoods): 95 | """Compute log likelihood variance of approximate posterior predictive""" 96 | mean_across_models = torch.mean(neg_log_likelihoods, dim=0) 97 | variance_of_neg_log_likelihoods = torch.var(mean_across_models, dim=1) 98 | 99 | return variance_of_neg_log_likelihoods 100 | 101 | 102 | def get_log_likelihood_mean(neg_log_likelihoods): 103 | """Compute softmax variance of approximate posterior predictive""" 104 | mean_across_models = torch.mean(neg_log_likelihoods, dim=0) 105 | mean_of_neg_log_likelihoods = torch.mean(mean_across_models, dim=1) 106 | 107 | return mean_of_neg_log_likelihoods 108 | 109 | 110 | def get_mean_of_poinwise_mutual_information(pointwise_mutual_information): 111 | """Compute mean of pointwise mutual information""" 112 | mean_across_models = 
torch.mean(pointwise_mutual_information, dim=0) 113 | return torch.mean(mean_across_models, dim=1) 114 | 115 | 116 | def get_predictive_entropy(log_likelihoods): 117 | """Compute predictive entropy of approximate posterior predictive""" 118 | mean_across_models = torch.logsumexp(log_likelihoods, dim=0) - torch.log(torch.tensor(log_likelihoods.shape[0])) 119 | entropy = -torch.sum(mean_across_models, dim=1) / torch.tensor(mean_across_models.shape[1]) 120 | return entropy 121 | 122 | 123 | def get_predictive_entropy_over_concepts(log_likelihoods, semantic_set_ids): 124 | """Compute the semantic entropy""" 125 | mean_across_models = torch.logsumexp(log_likelihoods, dim=0) - torch.log(torch.tensor(log_likelihoods.shape[0])) 126 | # This is ok because all the models have the same semantic set ids 127 | semantic_set_ids = semantic_set_ids[0] 128 | entropies = [] 129 | for row_index in range(mean_across_models.shape[0]): 130 | aggregated_likelihoods = [] 131 | row = mean_across_models[row_index] 132 | semantic_set_ids_row = semantic_set_ids[row_index] 133 | for semantic_set_id in torch.unique(semantic_set_ids_row): 134 | aggregated_likelihoods.append(torch.logsumexp(row[semantic_set_ids_row == semantic_set_id], dim=0)) 135 | aggregated_likelihoods = torch.tensor(aggregated_likelihoods) - llh_shift 136 | entropy = - torch.sum(aggregated_likelihoods, dim=0) / torch.tensor(aggregated_likelihoods.shape[0]) 137 | entropies.append(entropy) 138 | 139 | return torch.tensor(entropies) 140 | 141 | 142 | def get_margin_probability_uncertainty_measure(log_likelihoods): 143 | """Compute margin probability uncertainty measure""" 144 | mean_across_models = torch.logsumexp(log_likelihoods, dim=0) - torch.log(torch.tensor(log_likelihoods.shape[0])) 145 | topk_likelihoods, indices = torch.topk(mean_across_models, 2, dim=1, sorted=True) 146 | margin_probabilities = np.exp(topk_likelihoods[:, 0]) - np.exp(topk_likelihoods[:, 1]) 147 | 148 | return margin_probabilities 149 | 150 | 151 | list_of_results = [] 152 | 153 | with open(f'{config.output_dir}/{run_name}/{args.generation_model}_generations_{args.evaluation_model}_likelihoods.pkl', 154 | 'rb') as infile: 155 | sequences = pickle.load(infile) 156 | list_of_results.append((args.evaluation_model, sequences)) 157 | 158 | overall_results = get_overall_log_likelihoods(list_of_results) 159 | mutual_information = get_mutual_information(-overall_results['neg_log_likelihoods']) 160 | predictive_entropy = get_predictive_entropy(-overall_results['neg_log_likelihoods']) 161 | predictive_entropy_over_concepts = get_predictive_entropy_over_concepts(-overall_results['average_neg_log_likelihoods'], 162 | overall_results['semantic_set_ids']) 163 | unnormalised_entropy_over_concepts = get_predictive_entropy_over_concepts(-overall_results['neg_log_likelihoods'], 164 | overall_results['semantic_set_ids']) 165 | 166 | margin_measures = get_margin_probability_uncertainty_measure(-overall_results['average_neg_log_likelihoods']) 167 | unnormalised_margin_measures = get_margin_probability_uncertainty_measure(-overall_results['neg_log_likelihoods']) 168 | 169 | 170 | def get_number_of_unique_elements_per_row(tensor): 171 | assert len(tensor.shape) == 2 172 | return torch.count_nonzero(torch.sum(torch.nn.functional.one_hot(tensor), dim=1), dim=1) 173 | 174 | 175 | number_of_semantic_sets = get_number_of_unique_elements_per_row(overall_results['semantic_set_ids'][0]) 176 | average_predictive_entropy = get_predictive_entropy(-overall_results['average_neg_log_likelihoods']) 177 | 

# Recompute each measure using only the first i generations per prompt, to study how the number of
# samples affects the uncertainty estimates.
average_predictive_entropy_on_subsets = []
predictive_entropy_on_subsets = []
semantic_predictive_entropy_on_subsets = []
num_predictions = overall_results['average_neg_log_likelihoods'].shape[-1]
number_of_semantic_sets_on_subsets = []
for i in range(1, num_predictions + 1):
    offset = num_predictions * (i / 100)
    average_predictive_entropy_on_subsets.append(
        get_predictive_entropy(-overall_results['average_neg_log_likelihoods'][:, :, :int(i)]))
    predictive_entropy_on_subsets.append(get_predictive_entropy(-overall_results['neg_log_likelihoods'][:, :, :int(i)]))
    semantic_predictive_entropy_on_subsets.append(
        get_predictive_entropy_over_concepts(-overall_results['average_neg_log_likelihoods'][:, :, :int(i)],
                                             overall_results['semantic_set_ids'][:, :, :int(i)]))
    number_of_semantic_sets_on_subsets.append(
        get_number_of_unique_elements_per_row(overall_results['semantic_set_ids'][0][:, :i]))

average_pointwise_mutual_information = get_mean_of_poinwise_mutual_information(
    overall_results['pointwise_mutual_information'])

overall_results['mutual_information'] = mutual_information
overall_results['predictive_entropy'] = predictive_entropy
overall_results['predictive_entropy_over_concepts'] = predictive_entropy_over_concepts
overall_results['unnormalised_entropy_over_concepts'] = unnormalised_entropy_over_concepts
overall_results['number_of_semantic_sets'] = number_of_semantic_sets
overall_results['margin_measures'] = margin_measures
overall_results['unnormalised_margin_measures'] = unnormalised_margin_measures

overall_results['average_predictive_entropy'] = average_predictive_entropy
for i in range(len(average_predictive_entropy_on_subsets)):
    overall_results[f'average_predictive_entropy_on_subset_{i + 1}'] = average_predictive_entropy_on_subsets[i]
    overall_results[f'predictive_entropy_on_subset_{i + 1}'] = predictive_entropy_on_subsets[i]
    overall_results[f'semantic_predictive_entropy_on_subset_{i + 1}'] = semantic_predictive_entropy_on_subsets[i]
    overall_results[f'number_of_semantic_sets_on_subset_{i + 1}'] = number_of_semantic_sets_on_subsets[i]
overall_results['average_pointwise_mutual_information'] = average_pointwise_mutual_information

with open(f'{config.output_dir}/{run_name}/aggregated_likelihoods_{args.generation_model}_generations.pkl',
          'wb') as outfile:
    pickle.dump(overall_results, outfile)

if args.verbose:
    print('Margin measure', margin_measures)
    print('Number of semantic sets', number_of_semantic_sets)
    print('predictive entropy shape: ', predictive_entropy.shape)
    print('predictive entropy per concept shape: ', predictive_entropy_over_concepts.shape)
    print(overall_results['average_neg_log_likelihoods'].shape)
    print(len(number_of_semantic_sets_on_subsets))
    print(number_of_semantic_sets_on_subsets[0].shape)
    print('average predictive entropy on subsets: ', len(average_predictive_entropy_on_subsets))
    print(average_predictive_entropy_on_subsets[0].shape)
    print(overall_results['pointwise_mutual_information'])
    print(overall_results['margin_measures'])
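
# The length-normalised measures above use 'average_neg_log_likelihoods' (per-token averages), while
# the unnormalised ones use the total 'neg_log_likelihoods'. A standalone toy contrast (made-up
# numbers, not pipeline data): two generations with the same per-token NLL but different lengths get
# the same normalised score and different unnormalised scores.
_toy_per_token_nll = 0.5
_toy_lengths = torch.tensor([4.0, 8.0])
_toy_total_nll = _toy_per_token_nll * _toy_lengths    # tensor([2., 4.])  -> length-dependent
_toy_average_nll = _toy_total_nll / _toy_lengths      # tensor([0.5, 0.5]) -> length-normalised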
--------------------------------------------------------------------------------
/code/generate.py:
--------------------------------------------------------------------------------
import argparse
import os
import pathlib
import pickle

import accelerate
import config
import datasets
import evaluate
import numpy as np
import torch
import tqdm
import wandb
from transformers import AutoModelForCausalLM, AutoTokenizer

parser = argparse.ArgumentParser()
parser.add_argument('--type_of_question', type=str)
parser.add_argument('--num_generations_per_prompt', type=int, default=5)
parser.add_argument('--fraction_of_data_to_use', type=float, default=0.9)
parser.add_argument('--model', type=str, default='opt-350m')
parser.add_argument('--run_id', type=str, default='run_1')
parser.add_argument('--temperature', type=float, default=1.0)
parser.add_argument('--num_beams', type=int, default=5)
parser.add_argument('--decoding_method', type=str, default='beam_search')
parser.add_argument('--top_p', type=float, default=1.0)
parser.add_argument('--dataset', type=str, default='coqa')
args = parser.parse_args()

wandb.init(project='nlg_uncertainty', id=args.run_id, config=args, resume='allow')

run_name = wandb.run.name

device = 'cuda'

# Set a seed value
seed_value = 10
# 1. Set `PYTHONHASHSEED` environment variable at a fixed value
import os

os.environ['PYTHONHASHSEED'] = str(seed_value)
# 2. Set `python` built-in pseudo-random generator at a fixed value
import random

random.seed(seed_value)
# 3. Set `numpy` pseudo-random generator at a fixed value
np.random.seed(seed_value)

# Fix torch random seed
torch.manual_seed(seed_value)

os.environ["HF_DATASETS_CACHE"] = config.hf_datasets_cache

model = AutoModelForCausalLM.from_pretrained(f"facebook/{args.model}",
                                             torch_dtype=torch.float16,
                                             cache_dir=config.hf_cache_dir).cuda()

if args.model == 'opt-30b':
    accelerate.dispatch_model(model, device_map=config.device_map)

tokenizer = AutoTokenizer.from_pretrained(f"facebook/{args.model}", use_fast=False, cache_dir=config.hf_cache_dir)

opt_models = ['opt-125m', 'opt-350m', 'opt-1.3b', 'opt-2.7b', 'opt-6.7b', 'opt-13b', 'opt-30b']

if args.dataset == 'coqa':
    dataset = datasets.load_from_disk(f'{config.output_dir}/coqa_dataset')
    id_to_question_mapping = dict(zip(dataset['id'], dataset['question']))
elif args.dataset == 'trivia_qa':
    dataset = datasets.load_from_disk(f'{config.output_dir}/trivia_qa')

if args.fraction_of_data_to_use < 1.0:
    train_dataset = dataset.train_test_split(test_size=(1 - args.fraction_of_data_to_use), seed=seed_value)['train']
else:
    train_dataset = dataset


def encode(examples):
    return tokenizer(examples['story'] + ' Q: ' + examples['question'] + ' A:', truncation=False, padding=False)


def encode_and_format_dataset(dataset):
    dataset = dataset.map(encode, batched=False, load_from_cache_file=False)
    dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'], output_all_columns=True)

    return dataset
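
# The CoQA prompts built by `encode` above follow a fixed "<passage> Q: <question> A:" frame. A
# standalone illustration with a made-up passage and question (plain strings, no tokenisation):
_example_story = 'Tom keeps his bike in the garden shed.'
_example_question = 'Where does Tom keep his bike?'
_example_prompt = _example_story + ' Q: ' + _example_question + ' A:'
# -> 'Tom keeps his bike in the garden shed. Q: Where does Tom keep his bike? A:'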

if args.dataset == 'coqa':
    questions = encode_and_format_dataset(train_dataset)
elif args.dataset == 'trivia_qa':
    questions = train_dataset

dataloader = torch.utils.data.DataLoader(questions, batch_size=1)

# Index 1 skips the BOS token that the OPT tokenizer prepends; these ids are the stop token ('. ')
# and the question-framing strings that generation is not allowed to produce.
period_token_id = tokenizer('. ')['input_ids'][1]
eos_tokens = ['Question:', ' Question:', '\n', 'Answer:', ' Answer:', 'Q:']
question_framing_ids = [[tokenizer(eos_token)['input_ids'][1]] for eos_token in eos_tokens]
squad_metric = evaluate.load("squad")
rouge = evaluate.load('rouge')
exact_match_metric = evaluate.load("exact_match")


def get_generations(model, dataloader, number_of_generations):
    """For each prompt in the dataloader, produce the most likely generation plus a number of sampled generations."""

    with torch.no_grad():
        max_length_of_generated_sequence = 256
        sequences = []
        for batch in tqdm.tqdm(dataloader):

            input_ids = torch.cat(batch['input_ids']).to(device).reshape(
                1, -1) if args.dataset == 'trivia_qa' else batch['input_ids'].to(device)
            if args.decoding_method == 'beam_search':
                most_likely_generation = model.generate(input_ids,
                                                        num_beams=5,
                                                        num_return_sequences=2,
                                                        do_sample=False,
                                                        max_length=input_ids.shape[1] +
                                                        max_length_of_generated_sequence,
                                                        eos_token_id=period_token_id,
                                                        bad_words_ids=question_framing_ids)
            elif args.decoding_method == 'greedy':
                most_likely_generation = model.generate(input_ids,
                                                        num_beams=1,
                                                        do_sample=False,
                                                        max_length=input_ids.shape[1] +
                                                        max_length_of_generated_sequence,
                                                        eos_token_id=period_token_id,
                                                        bad_words_ids=question_framing_ids)

            input_length = input_ids.shape[1] if args.dataset == 'trivia_qa' else batch['input_ids'].shape[1]
            generations = torch.ones((number_of_generations, input_length + max_length_of_generated_sequence),
                                     dtype=torch.long,
                                     device=device)
            for i in range(number_of_generations):

                generation = model.generate(input_ids,
                                            do_sample=True,
                                            num_return_sequences=1,
                                            num_beams=args.num_beams,
                                            max_length=input_ids.shape[1] + max_length_of_generated_sequence,
                                            eos_token_id=period_token_id,
                                            temperature=args.temperature,
                                            bad_words_ids=question_framing_ids,
                                            top_p=args.top_p)
                generations[i, :generation.shape[1]] = generation

            generations = torch.reshape(generations, (-1, number_of_generations, generations.shape[-1]))
            for i in range(generations.shape[0]):

                if args.dataset == 'coqa':
                    sequence_dict = {
                        'prompt': batch['input_ids'][i].to('cpu'),
                        'generations': generations[i].to('cpu'),
                        'id': batch['id'],
                        'question': id_to_question_mapping[batch['id'][0]]
                    }
                elif args.dataset == 'trivia_qa':
                    few_shot_question = tokenizer.decode(input_ids[0])
                    question = few_shot_question.split('Question: ')[-1].split('Answer: ')[0]
                    sequence_dict = {
                        'prompt': input_ids[0],
                        'generations': generations[i],
                        'id': batch['question_id'],
                        'few_shot_question': tokenizer.decode(input_ids[0]),
                        'question': question
                    }

                generated_texts = []
                for generation in generations[i]:
                    generated_texts.append(
                        tokenizer.decode(generation[len(batch['input_ids'][i]):], skip_special_tokens=True))

                sequence_dict['generated_texts'] = generated_texts
                sequence_dict['most_likely_generation_ids'] = most_likely_generation[0].to('cpu')
                sequence_dict['most_likely_generation'] = tokenizer.decode(
                    most_likely_generation[0][len(batch['input_ids'][i]):], skip_special_tokens=True)

                sequence_dict['second_most_likely_generation_ids'] = most_likely_generation[1].to('cpu')
                sequence_dict['second_most_likely_generation'] = tokenizer.decode(
                    most_likely_generation[1][len(batch['input_ids'][i]):], skip_special_tokens=True)

                sequence_dict['semantic_variability_reference_answers'] = batch[
                    'semantic_variability'] if 'semantic_variability' in batch else None
                rouge_types = ['rouge1', 'rouge2', 'rougeL']
                for rouge_type in rouge_types:
                    if rouge_type in batch:
                        sequence_dict[rouge_type + '_reference_answers'] = batch[rouge_type]

                    else:
                        sequence_dict[rouge_type + '_reference_answers'] = None

                    sequence_dict[rouge_type + '_to_target'] = 0.0

                sequence_dict['answer'] = batch['answer']['text'] if args.dataset == 'coqa' else batch['answer']
                sequence_dict['additional_answers'] = [x[0] for x in batch['additional_answers']
                                                      ] if args.dataset == 'coqa' else None

                sequence_dict['exact_match'] = 0.0

                reference_answers = batch['answer']['text'] + [x[0] for x in batch['additional_answers']
                                                              ] if args.dataset == 'coqa' else batch['answer']

                for answer in reference_answers:
                    predictions = [sequence_dict['most_likely_generation'].lstrip()]
                    references = [answer]
                    results = exact_match_metric.compute(predictions=predictions,
                                                         references=references,
                                                         ignore_case=True,
                                                         ignore_punctuation=True)
                    sequence_dict['exact_match'] = max(results['exact_match'], sequence_dict['exact_match'])
                    rouge_results = rouge.compute(predictions=predictions, references=references)
                    for rouge_type in rouge_types:
                        sequence_dict[rouge_type + '_to_target'] = max(rouge_results[rouge_type].mid.fmeasure,
                                                                       sequence_dict[rouge_type + '_to_target'])

                sequences.append(sequence_dict)

        return sequences


sequences = get_generations(model, dataloader, args.num_generations_per_prompt)

pathlib.Path(f'{config.output_dir}/sequences/' + run_name).mkdir(parents=True, exist_ok=True)

with open(f'{config.output_dir}/sequences/{run_name}/{args.model}_generations.pkl', 'wb') as outfile:
    pickle.dump(sequences, outfile)

--------------------------------------------------------------------------------
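The pickle written by generate.py is the interface to the rest of the pipeline: a list with one dictionary per prompt, carrying the sampled generations and their reference answers. A minimal sketch of how it might be inspected; the file name below is hypothetical, in the pipeline it lives under config.output_dir/sequences/<run_name>/.

import pickle

with open('opt-350m_generations.pkl', 'rb') as infile:  # hypothetical local copy of the file written above
    sequences = pickle.load(infile)

print(len(sequences))
# Keys set by get_generations above include 'prompt', 'generations', 'generated_texts',
# 'most_likely_generation', 'question', 'answer', 'exact_match' and the rouge*_to_target scores.
print(sorted(sequences[0].keys()))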
/code/analyze_results.py:
--------------------------------------------------------------------------------
# parse arguments
import argparse
import json
import pickle

import config
import numpy as np
import pandas as pd
import sklearn
import sklearn.metrics
import torch
import wandb

parser = argparse.ArgumentParser()
parser.add_argument('-n', '--run_ids', nargs='+', default=[])
parser.add_argument('--verbose', type=bool, default=True)
args = parser.parse_args()

overall_result_dict = {}

aurocs_across_models = []

sequence_embeddings_dict = {}

run_ids_to_analyze = args.run_ids
for run_id in run_ids_to_analyze:

    wandb.init(project='nlg_uncertainty', id=run_id, resume='allow')
    run_name = wandb.run.name
    model_name = wandb.config.model
    print(run_name)

    def get_similarities_df():
        """Get the similarities df from the pickle file"""
        with open(f'{config.output_dir}/{run_name}/{model_name}_generations_similarities.pkl', 'rb') as f:
            similarities = pickle.load(f)
            similarities_df = pd.DataFrame.from_dict(similarities, orient='index')
            similarities_df['id'] = similarities_df.index
            similarities_df['has_semantically_different_answers'] = similarities_df[
                'has_semantically_different_answers'].astype('int')
            similarities_df['rougeL_among_generations'] = similarities_df['syntactic_similarities'].apply(
                lambda x: x['rougeL'])

            return similarities_df

    def get_generations_df():
        """Get the generations df from the pickle file"""
        with open(f'{config.output_dir}/{run_name}/{model_name}_generations.pkl', 'rb') as infile:
            generations = pickle.load(infile)
            generations_df = pd.DataFrame(generations)
            generations_df['id'] = generations_df['id'].apply(lambda x: x[0])
            generations_df['id'] = generations_df['id'].astype('object')
            if not generations_df['semantic_variability_reference_answers'].isnull().values.any():
                generations_df['semantic_variability_reference_answers'] = generations_df[
                    'semantic_variability_reference_answers'].apply(lambda x: x[0].item())

            if not generations_df['rougeL_reference_answers'].isnull().values.any():
                generations_df['rougeL_reference_answers'] = generations_df['rougeL_reference_answers'].apply(
                    lambda x: x[0].item())
            generations_df['length_of_most_likely_generation'] = generations_df['most_likely_generation'].apply(
                lambda x: len(str(x).split(' ')))
            generations_df['length_of_answer'] = generations_df['answer'].apply(lambda x: len(str(x).split(' ')))
            generations_df['variance_of_length_of_generations'] = generations_df['generated_texts'].apply(
                lambda x: np.var([len(str(y).split(' ')) for y in x]))
            # An answer counts as correct if its RougeL overlap with the reference answer exceeds 0.3
            generations_df['correct'] = (generations_df['rougeL_to_target'] > 0.3).astype('int')

            return generations_df

    def get_likelihoods_df():
        """Get the likelihoods df from the pickle file"""

        with open(f'{config.output_dir}/{run_name}/aggregated_likelihoods_{model_name}_generations.pkl', 'rb') as f:
            likelihoods = pickle.load(f)
            print(likelihoods.keys())

            subset_keys = ['average_predictive_entropy_on_subset_' + str(i) for i in range(1, num_generations + 1)]
            subset_keys += ['predictive_entropy_on_subset_' + str(i) for i in range(1, num_generations + 1)]
            subset_keys += ['semantic_predictive_entropy_on_subset_' + str(i) for i in range(1, num_generations + 1)]
            subset_keys += ['number_of_semantic_sets_on_subset_' + str(i) for i in range(1, num_generations + 1)]

            keys_to_use = ('ids', 'predictive_entropy', 'mutual_information', 'average_predictive_entropy',\
                           'average_pointwise_mutual_information', 'average_neg_log_likelihood_of_most_likely_gen',\
                           'average_neg_log_likelihood_of_second_most_likely_gen', 'neg_log_likelihood_of_most_likely_gen',\
                           'predictive_entropy_over_concepts', 'number_of_semantic_sets', 'unnormalised_entropy_over_concepts')

            likelihoods_small = dict((k, likelihoods[k]) for k in keys_to_use + tuple(subset_keys))
            for key in likelihoods_small:
                if key == 'average_predictive_entropy_on_subsets':
                    likelihoods_small[key].shape
                if type(likelihoods_small[key]) is torch.Tensor:
                    likelihoods_small[key] = torch.squeeze(likelihoods_small[key].cpu())

            sequence_embeddings = likelihoods['sequence_embeddings']

            likelihoods_df = pd.DataFrame.from_dict(likelihoods_small)

            likelihoods_df.rename(columns={'ids': 'id'}, inplace=True)

            return likelihoods_df, sequence_embeddings

    similarities_df = get_similarities_df()
    generations_df = get_generations_df()
    num_generations = len(generations_df['generated_texts'][0])
    likelihoods_df, sequence_embeddings = get_likelihoods_df()
    result_df = generations_df.merge(similarities_df, on='id').merge(likelihoods_df, on='id')

    n_samples_before_filtering = len(result_df)
    result_df['len_most_likely_generation_length'] = result_df['most_likely_generation'].apply(lambda x: len(x.split()))

    # Begin analysis
    result_dict = {}
    result_dict['accuracy'] = result_df['correct'].mean()

    # Compute the auroc for the length normalized predictive entropy
    ln_predictive_entropy_auroc = sklearn.metrics.roc_auc_score(1 - result_df['correct'],
                                                                result_df['average_predictive_entropy'])
    result_dict['ln_predictive_entropy_auroc'] = ln_predictive_entropy_auroc

    predictive_entropy_auroc = sklearn.metrics.roc_auc_score(1 - result_df['correct'], result_df['predictive_entropy'])
    result_dict['predictive_entropy_auroc'] = predictive_entropy_auroc

    entropy_over_concepts_auroc = sklearn.metrics.roc_auc_score(1 - result_df['correct'],
                                                                result_df['predictive_entropy_over_concepts'])
    result_dict['entropy_over_concepts_auroc'] = entropy_over_concepts_auroc

    if 'unnormalised_entropy_over_concepts' in result_df.columns:
        unnormalised_entropy_over_concepts_auroc = sklearn.metrics.roc_auc_score(
            1 - result_df['correct'], result_df['unnormalised_entropy_over_concepts'])
        result_dict['unnormalised_entropy_over_concepts_auroc'] = unnormalised_entropy_over_concepts_auroc

    aurocs_across_models.append(entropy_over_concepts_auroc)

    neg_llh_most_likely_gen_auroc = sklearn.metrics.roc_auc_score(1 - result_df['correct'],
                                                                  result_df['neg_log_likelihood_of_most_likely_gen'])
    result_dict['neg_llh_most_likely_gen_auroc'] = neg_llh_most_likely_gen_auroc

    number_of_semantic_sets_auroc = sklearn.metrics.roc_auc_score(1 - result_df['correct'],
                                                                  result_df['number_of_semantic_sets'])
    result_dict['number_of_semantic_sets_auroc'] = number_of_semantic_sets_auroc

    result_dict['number_of_semantic_sets_correct'] = result_df[result_df['correct'] ==
                                                               1]['number_of_semantic_sets'].mean()
    result_dict['number_of_semantic_sets_incorrect'] = result_df[result_df['correct'] ==
                                                                 0]['number_of_semantic_sets'].mean()

    result_dict['average_rougeL_among_generations'] = result_df['rougeL_among_generations'].mean()
    result_dict['average_rougeL_among_generations_correct'] = result_df[result_df['correct'] ==
                                                                        1]['rougeL_among_generations'].mean()
    result_dict['average_rougeL_among_generations_incorrect'] = result_df[result_df['correct'] ==
                                                                          0]['rougeL_among_generations'].mean()
    result_dict['average_rougeL_auroc'] = sklearn.metrics.roc_auc_score(result_df['correct'],
                                                                        result_df['rougeL_among_generations'])

    average_neg_llh_most_likely_gen_auroc = sklearn.metrics.roc_auc_score(
        1 - result_df['correct'], result_df['average_neg_log_likelihood_of_most_likely_gen'])
    result_dict['average_neg_llh_most_likely_gen_auroc'] = average_neg_llh_most_likely_gen_auroc
    result_dict['rougeL_based_accuracy'] = result_df['correct'].mean()

    result_dict['margin_measure_auroc'] = sklearn.metrics.roc_auc_score(
        1 - result_df['correct'], result_df['average_neg_log_likelihood_of_most_likely_gen'] +
        result_df['average_neg_log_likelihood_of_second_most_likely_gen'])
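
    # All of the AUROCs above follow the same convention: the "positive" class is an incorrect answer
    # (1 - correct), so an uncertainty measure scores well when it is high for wrong answers. A
    # standalone toy example of that convention (made-up labels and scores, not run data):
    _toy_correct = np.array([1, 1, 0, 0])
    _toy_uncertainty = np.array([0.1, 0.4, 0.8, 0.9])  # higher = more uncertain
    _toy_auroc = sklearn.metrics.roc_auc_score(1 - _toy_correct, _toy_uncertainty)  # = 1.0 here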

    if args.verbose:
        print('Number of samples:', len(result_df))
        print(result_df['predictive_entropy'].mean())
        print(result_df['average_predictive_entropy'].mean())
        print(result_df['predictive_entropy_over_concepts'].mean())
        print('ln_predictive_entropy_auroc', ln_predictive_entropy_auroc)
        print('semantic entropy auroc', entropy_over_concepts_auroc)
        print(
            'Semantic entropy +',
            sklearn.metrics.roc_auc_score(
                1 - result_df['correct'],
                result_df['predictive_entropy_over_concepts'] - 3 * result_df['rougeL_among_generations']))
        print('RougeL among generations auroc',
              sklearn.metrics.roc_auc_score(result_df['correct'], result_df['rougeL_among_generations']))
        print('margin measure auroc:', result_dict['margin_measure_auroc'])

    # Measure the AUROCs when using different numbers of generations to compute our uncertainty measures.
    ln_aurocs = []
    aurocs = []
    semantic_aurocs = []
    average_number_of_semantic_sets = []
    average_number_of_semantic_sets_correct = []
    average_number_of_semantic_sets_incorrect = []
    for i in range(1, num_generations + 1):
        ln_predictive_entropy_auroc = sklearn.metrics.roc_auc_score(
            1 - result_df['correct'], result_df['average_predictive_entropy_on_subset_{}'.format(i)])
        aurocs.append(
            sklearn.metrics.roc_auc_score(1 - result_df['correct'],
                                          result_df['predictive_entropy_on_subset_{}'.format(i)]))
        ln_aurocs.append(ln_predictive_entropy_auroc)
        semantic_aurocs.append(
            sklearn.metrics.roc_auc_score(1 - result_df['correct'],
                                          result_df['semantic_predictive_entropy_on_subset_{}'.format(i)]))
        average_number_of_semantic_sets.append(result_df['number_of_semantic_sets_on_subset_{}'.format(i)].mean())
        average_number_of_semantic_sets_correct.append(
            result_df[result_df['correct'] == 1]['number_of_semantic_sets_on_subset_{}'.format(i)].mean())
        average_number_of_semantic_sets_incorrect.append(
            result_df[result_df['correct'] == 0]['number_of_semantic_sets_on_subset_{}'.format(i)].mean())

    result_dict['ln_predictive_entropy_auroc_on_subsets'] = ln_aurocs
    result_dict['predictive_entropy_auroc_on_subsets'] = aurocs
    result_dict['semantic_predictive_entropy_auroc_on_subsets'] = semantic_aurocs
    result_dict['average_number_of_semantic_sets_on_subsets'] = average_number_of_semantic_sets
    result_dict['average_number_of_semantic_sets_on_subsets_correct'] = average_number_of_semantic_sets_correct
    result_dict['average_number_of_semantic_sets_on_subsets_incorrect'] = average_number_of_semantic_sets_incorrect
    result_dict['model_name'] = model_name
    result_dict['run_name'] = run_name

    wandb.log(result_dict)

    overall_result_dict[run_id] = result_dict
    sequence_embeddings_dict[run_id] = sequence_embeddings

    wandb.finish()
    torch.cuda.empty_cache()

with open('overall_results.json', 'w') as f:
    json.dump(overall_result_dict, f)

with open('sequence_embeddings.pkl', 'wb') as f:
    pickle.dump(sequence_embeddings_dict, f)

# Store data frame as csv
accuracy_verification_df = result_df[['most_likely_generation', 'answer', 'correct']]
accuracy_verification_df.to_csv('accuracy_verification.csv')
--------------------------------------------------------------------------------