├── README.md
├── assets
│   └── figure.png
├── configs
│   ├── default_prompts
│   │   ├── ag_news.j2
│   │   ├── anli.j2
│   │   ├── boolq.j2
│   │   ├── cosmos_qa.j2
│   │   ├── hellaswag.j2
│   │   ├── imdb.j2
│   │   ├── nq_open.j2
│   │   ├── trivia_qa.j2
│   │   └── tweet_emotion.j2
│   └── metric
│       ├── few_shot_accuracy_defaults.yaml
│       ├── permutational_sensitivity_defaults.yaml
│       ├── perturbational_accuracy_defaults.yaml
│       ├── selectional_sensitivity_defaults.yaml
│       └── zero_shot_accuracy_defaults.yaml
├── data
│   ├── __init__.py
│   ├── ag_news.py
│   ├── anli.py
│   ├── boolq.py
│   ├── cosmos_qa.py
│   ├── dataset.py
│   ├── hellaswag.py
│   ├── imdb.py
│   ├── nq_open.py
│   ├── trivia_qa.py
│   └── tweet_emotion.py
├── decoders
│   ├── __init__.py
│   ├── constrained_label_generation.py
│   ├── constrained_per_example_label_generation.py
│   ├── decoder.py
│   ├── greedy_generation.py
│   └── nucleus_generation.py
├── evaluate_instruction.py
├── instructions
│   ├── ape
│   │   ├── ag_news.yaml
│   │   ├── anli.yaml
│   │   ├── boolq.yaml
│   │   ├── cosmos_qa.yaml
│   │   ├── hellaswag.yaml
│   │   ├── imdb.yaml
│   │   ├── nq_open.yaml
│   │   ├── trivia_qa.yaml
│   │   └── tweet_emotion.yaml
│   ├── chat_gpt_prompts
│   │   ├── ag_news.yaml
│   │   ├── anli.yaml
│   │   ├── boolq.yaml
│   │   ├── cosmos_qa.yaml
│   │   ├── hellaswag.yaml
│   │   ├── imdb.yaml
│   │   ├── nq_open.yaml
│   │   ├── trivia_qa.yaml
│   │   └── tweet_emotion.yaml
│   ├── generic_instruction
│   │   ├── ag_news.yaml
│   │   ├── anli.yaml
│   │   ├── boolq.yaml
│   │   ├── cosmos_qa.yaml
│   │   ├── hellaswag.yaml
│   │   ├── imdb.yaml
│   │   ├── nq_open.yaml
│   │   ├── trivia_qa.yaml
│   │   └── tweet_emotion.yaml
│   ├── low_perplexity_prompts
│   │   ├── bloom1b1
│   │   │   ├── ag_news.yaml
│   │   │   ├── anli.yaml
│   │   │   ├── boolq.yaml
│   │   │   ├── cosmos_qa.yaml
│   │   │   ├── hellaswag.yaml
│   │   │   ├── imdb.yaml
│   │   │   ├── nq_open.yaml
│   │   │   ├── trivia_qa.yaml
│   │   │   └── tweet_emotion.yaml
│   │   ├── bloom1b7
│   │   │   ├── ag_news.yaml
│   │   │   ├── anli.yaml
│   │   │   ├── boolq.yaml
│   │   │   ├── cosmos_qa.yaml
│   │   │   ├── hellaswag.yaml
│   │   │   ├── imdb.yaml
│   │   │   ├── nq_open.yaml
│   │   │   ├── trivia_qa.yaml
│   │   │   └── tweet_emotion.yaml
│   │   ├── bloom3b
│   │   │   ├── ag_news.yaml
│   │   │   ├── anli.yaml
│   │   │   ├── boolq.yaml
│   │   │   ├── cosmos_qa.yaml
│   │   │   ├── hellaswag.yaml
│   │   │   ├── imdb.yaml
│   │   │   ├── nq_open.yaml
│   │   │   ├── trivia_qa.yaml
│   │   │   └── tweet_emotion.yaml
│   │   ├── bloom7b1
│   │   │   ├── ag_news.yaml
│   │   │   ├── anli.yaml
│   │   │   ├── boolq.yaml
│   │   │   ├── cosmos_qa.yaml
│   │   │   ├── hellaswag.yaml
│   │   │   ├── imdb.yaml
│   │   │   ├── nq_open.yaml
│   │   │   ├── trivia_qa.yaml
│   │   │   └── tweet_emotion.yaml
│   │   ├── gptneo1b3
│   │   │   ├── ag_news.yaml
│   │   │   ├── anli.yaml
│   │   │   ├── boolq.yaml
│   │   │   ├── cosmos_qa.yaml
│   │   │   ├── hellaswag.yaml
│   │   │   ├── imdb.yaml
│   │   │   ├── nq_open.yaml
│   │   │   ├── trivia_qa.yaml
│   │   │   └── tweet_emotion.yaml
│   │   ├── gptneo2b7
│   │   │   ├── ag_news.yaml
│   │   │   ├── anli.yaml
│   │   │   ├── boolq.yaml
│   │   │   ├── cosmos_qa.yaml
│   │   │   ├── hellaswag.yaml
│   │   │   ├── imdb.yaml
│   │   │   ├── nq_open.yaml
│   │   │   ├── trivia_qa.yaml
│   │   │   └── tweet_emotion.yaml
│   │   ├── gptneox20b
│   │   │   ├── ag_news.yaml
│   │   │   ├── anli.yaml
│   │   │   ├── boolq.yaml
│   │   │   ├── cosmos_qa.yaml
│   │   │   ├── hellaswag.yaml
│   │   │   ├── imdb.yaml
│   │   │   ├── nq_open.yaml
│   │   │   ├── trivia_qa.yaml
│   │   │   └── tweet_emotion.yaml
│   │   ├── llama13b
│   │   │   ├── ag_news.yaml
│   │   │   ├── anli.yaml
│   │   │   ├── boolq.yaml
│   │   │   ├── cosmos_qa.yaml
│   │   │   ├── hellaswag.yaml
│   │   │   ├── imdb.yaml
│   │   │   ├── nq_open.yaml
│   │   │   ├── trivia_qa.yaml
│   │   │   └── tweet_emotion.yaml
│   │   ├── llama7b
│   │   │   ├── ag_news.yaml
│   │   │   ├── anli.yaml
│   │   │   ├── boolq.yaml
│   │   │   ├── cosmos_qa.yaml
│   │   │   ├── hellaswag.yaml
│   │   │   ├── imdb.yaml
│   │   │   ├── nq_open.yaml
│   │   │   ├── trivia_qa.yaml
│   │   │   └── tweet_emotion.yaml
│   │   ├── opt13b
│   │   │   ├── ag_news.yaml
│   │   │   ├── anli.yaml
│   │   │   ├── boolq.yaml
│   │   │   ├── cosmos_qa.yaml
│   │   │   ├── hellaswag.yaml
│   │   │   ├── imdb.yaml
│   │   │   ├── nq_open.yaml
│   │   │   ├── trivia_qa.yaml
│   │   │   └── tweet_emotion.yaml
│   │   ├── opt1b3
│   │   │   ├── ag_news.yaml
│   │   │   ├── anli.yaml
│   │   │   ├── boolq.yaml
│   │   │   ├── cosmos_qa.yaml
│   │   │   ├── hellaswag.yaml
│   │   │   ├── imdb.yaml
│   │   │   ├── nq_open.yaml
│   │   │   ├── trivia_qa.yaml
│   │   │   └── tweet_emotion.yaml
│   │   ├── opt2b7
│   │   │   ├── ag_news.yaml
│   │   │   ├── anli.yaml
│   │   │   ├── boolq.yaml
│   │   │   ├── cosmos_qa.yaml
│   │   │   ├── hellaswag.yaml
│   │   │   ├── imdb.yaml
│   │   │   ├── nq_open.yaml
│   │   │   ├── trivia_qa.yaml
│   │   │   └── tweet_emotion.yaml
│   │   └── opt6b7
│   │       ├── ag_news.yaml
│   │       ├── anli.yaml
│   │       ├── boolq.yaml
│   │       ├── cosmos_qa.yaml
│   │       ├── hellaswag.yaml
│   │       ├── imdb.yaml
│   │       ├── nq_open.yaml
│   │       ├── trivia_qa.yaml
│   │       └── tweet_emotion.yaml
│   ├── manual
│   │   ├── ag_news.yaml
│   │   ├── anli.yaml
│   │   ├── boolq.yaml
│   │   ├── cosmos_qa.yaml
│   │   ├── hellaswag.yaml
│   │   ├── imdb.yaml
│   │   ├── nq_open.yaml
│   │   ├── trivia_qa.yaml
│   │   └── tweet_emotion.yaml
│   ├── no_instruction
│   │   ├── ag_news.yaml
│   │   ├── anli.yaml
│   │   ├── boolq.yaml
│   │   ├── cosmos_qa.yaml
│   │   ├── hellaswag.yaml
│   │   ├── imdb.yaml
│   │   ├── nq_open.yaml
│   │   ├── trivia_qa.yaml
│   │   └── tweet_emotion.yaml
│   └── rlprompt
│       ├── ag_news.yaml
│       ├── anli.yaml
│       ├── boolq.yaml
│       ├── cosmos_qa.yaml
│       ├── hellaswag.yaml
│       ├── imdb.yaml
│       └── tweet_emotion.yaml
├── metrics
│   ├── __init__.py
│   ├── few_shot_accuracy.py
│   ├── metric.py
│   ├── permutational_sensitivity.py
│   ├── perturbational_accuracy.py
│   ├── selectional_sensitivity.py
│   ├── utils.py
│   └── zero_shot_accuracy.py
├── models
│   ├── __init__.py
│   ├── base.py
│   ├── bloom1b1.py
│   ├── bloom1b7.py
│   ├── bloom3b.py
│   ├── bloom7b1.py
│   ├── causal_lm.py
│   ├── gptneo1b3.py
│   ├── gptneo2b7.py
│   ├── gptneox20b.py
│   ├── llama13b.py
│   ├── llama7b.py
│   ├── masked_lm.py
│   ├── opt13b.py
│   ├── opt1b3.py
│   ├── opt2b7.py
│   ├── opt6b7.py
│   ├── stablelmbase3b.py
│   ├── stablelmbase7b.py
│   ├── stablelmtuned3b.py
│   └── stablelmtuned7b.py
├── notebooks
│   └── aggregate_results.ipynb
├── requirements.txt
├── templates
│   ├── __init__.py
│   ├── few_shot_template.py
│   └── instruction_based_fs_template.py
└── utils.py
/README.md:
--------------------------------------------------------------------------------
1 | # InstructEval: Systematic Evaluation of Instruction Selection Methods
2 |
3 | We release the evaluation suite used in our paper [`InstructEval: Systematic Evaluation of Instruction Selection Methods`](https://arxiv.org/abs/2307.00259).
4 |
5 |
6 |
7 |
8 |
9 | The suite allows the evaluation of arbitrary instructions and prompt templates across 13 open-source LLMs of varying scales drawn from 4 model families, and covers 9 tasks spanning 3 task types. The suite supports evaluation along 3 accuracy metrics
10 | * zero-shot accuracy
11 | * few-shot accuracy
12 | * perturbation accuracy
13 |
14 | and 2 sensitivity metrics
15 | * selectional sensitivity
16 | * permutational sensitivity.
17 | ## Install
18 | This evaluation suite requires `torch==1.12.1` to ensure compatibility with `crfm_helm`. You will also need `transformers>=4.28.1` for compatibility with the LLaMA models.
19 |
20 | Set up a new Python 3.9 environment and install [PyTorch 1.12.1](https://pytorch.org/get-started/previous-versions/#v1121).
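For example, with a CUDA 11.3 build (adjust the wheel for your CUDA version, or use the CPU build):
```bash
pip install torch==1.12.1+cu113 --extra-index-url https://download.pytorch.org/whl/cu113
```
Then install the remaining requirements: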
21 | ```bash
22 | pip install -r requirements.txt --no-deps
23 | ```
24 |
25 | ## Usage
26 | We provide a script, `evaluate_instruction.py`, that evaluates a single instruction on a single model and task, along a single metric.
27 | Invoke it as follows:
28 | ```bash
29 | python3 -m evaluate_instruction --instructions_dir INSTRUCTIONS_DIR --index INDEX --model MODEL --dataset DATASET --metric_config METRIC_CONFIG [--decoder DECODER] [--prompt_template_dir PROMPT_TEMPLATE_DIR] [--results_dir RESULTS_DIR]
30 | ```
31 |
32 | ### Arguments
33 |
34 | * `--instructions_dir` (required): Path to the directory containing instruction files. These files should be named after the dataset they correspond to (e.g., `ag_news.yaml`).
35 | * `--index` (required): Index of the instruction to evaluate in the dataset's instruction file. This should be an integer.
36 | * `--model` (required): Identifier of the model to use during evaluation.
37 | * `--dataset` (required): Identifier of the dataset to be used for evaluation.
38 | * `--metric_config` (required): Path to the metric configuration file, which specifies both the name of the metric to evaluate along and the relevant hyperparameters.
39 | * `--decoder` (optional): Name of the decoder. If specified, the script will use this decoder for the evaluation. If not, the script will use the default decoder for the dataset's task type.
40 | * `--prompt_template_dir` (optional, default: "configs/default_prompts"): Path to the directory containing Jinja2 prompt templates for each dataset.
41 | * `--results_dir` (optional, default: "results/"): Path to the directory where the script should write the results.
42 |
43 | ### Example usage:
44 | ```bash
45 | python3 -m evaluate_instruction --instructions_dir instructions/ape --index 2 --model opt13b --dataset cosmos_qa --metric_config configs/metric/perturbational_accuracy_defaults.yaml
46 | ```
47 | ### Aggregating results
48 | Results are written to the directory specified by `--results_dir` in the form of JSON files. Users can aggregate results across any desired axes, with any aggregation strategy, by writing custom scripts. We provide sample code for the aggregations we perform in our paper in `notebooks/aggregate_results.ipynb`.
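As a minimal sketch, results can also be collected into a `pandas` DataFrame for aggregation. The field names below assume the metadata keys written by `evaluate_instruction.py` (`model`, `dataset`, `metric`, ...) and numeric result fields; adjust them to match your result files:
```python
import json
from pathlib import Path

import pandas as pd

# Collect every result JSON written by evaluate_instruction.py into one table.
records = [json.loads(p.read_text()) for p in Path("results/").glob("*.json")]
df = pd.json_normalize(records)

# Example aggregation: average numeric results per (model, dataset, metric) triple.
print(df.groupby(["model", "dataset", "metric"]).mean(numeric_only=True))
```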
49 |
50 | -------
51 | ### Prompts
52 | Instructions to be evaluated can be arbitrary strings obtained by any external means. They are specified as entries in YAML files named after the task they correspond to (such as those in `instructions/ape/`). A specific instruction from this list can be evaluated by appropriately setting the `--index` parameter of `evaluate_instruction.py`.
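Each instruction file is simply a YAML list of strings. For example, a hypothetical `instructions/my_method/imdb.yaml` could contain:
```yaml
# Each entry is one candidate instruction; select one with --index.
- "Classify the sentiment of the movie review as Positive or Negative."
- "Decide whether the following review expresses a positive or a negative opinion."
```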
53 |
54 | To evaluate a new instruction selection method, create a new directory under `instructions/` following the file-tree structure of the provided sample instructions. We support the evaluation of both model-agnostic instructions as in `instructions/ape` and model-specific instructions as in `instructions/low_perplexity_prompts`. You can then directly use `evaluate_instruction.py` as described above.
55 |
56 | Evaluations can also be conducted using arbitrary prompt templates expressed with the Jinja2 templating engine (as in `configs/default_prompts/`). Non-default prompt templates can be specified using `--prompt_template_dir`.
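Templates are rendered with an `instruction` string, a list of `demonstrations`, and a `test_example`, and set `dataset_name` (plus a `label_map` for CLS and MCQ tasks). The default IMDB template, for instance, looks like this:
```jinja
{%- set dataset_name = "imdb" -%}
{%- set label_map = {0: 'Negative', 1: 'Positive'} -%}
{{instruction}}
{% for demonstration in demonstrations %}
Review: {{demonstration['text']}}
Sentiment: {{label_map[demonstration['label']]}}
{% endfor %}
Review: {{test_example['text']}}
Sentiment: {{label_map[test_example['label']]}}
```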
57 |
58 |
59 | ### Metrics configs
60 |
61 | Metric configuration files are expected in YAML format and must specify both the name of the required metric and the relevant hyperparameters. We provide example configuration files for each of the 5 metrics under `configs/metric/`.
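For example, `configs/metric/few_shot_accuracy_defaults.yaml` contains:
```yaml
few_shot_accuracy:
  num_combinations: 30
  num_test_instances: 100
  num_demonstrations: 6
```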
62 |
63 | ### Decoders
64 | The codebase includes 4 decoders that can be used in conjunction with any supported model.
65 | * `ConstrainedLabelGeneration`: Intended to be used with CLS tasks with fixed, static label-spaces.
66 | * `ConstrainedPerExampleLabelGeneration`: Intended to be used with MCQ tasks whose label-space varies across test examples.
67 | * `GreedyGeneration`: For use with GQA tasks with unconstrained label-spaces. Implements Greedy Sampling.
68 | * `NucleusGeneration`: For use with GQA tasks with unconstrained label-spaces. Implements Nucleus Sampling.
69 |
70 | We do not implement any form of calibration in these decoders. As a user, you can straightforwardly implement new custom decoders by extending the `Decoder` class.
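A minimal sketch of a custom decoder is shown below; the class and its deliberately trivial behaviour are illustrative, not part of the codebase. A decoder receives the prompt template at construction time and returns one dictionary per test example from `decode`, containing at least a `"prediction"` key:
```python
from typing import Any, Dict, List

from decoders.decoder import Decoder
from models.causal_lm import CausalLM
from templates.few_shot_template import FewShotTemplate


class FirstLabelBaseline(Decoder):
    """Toy decoder that always predicts the first label in the template's label_map."""

    def __init__(self, template: FewShotTemplate):
        super().__init__(template)

    def decode(
        self,
        model: CausalLM,
        demonstrations: List[Dict[str, Any]],
        test_examples: List[Dict[str, Any]],
    ) -> List[Dict[str, Any]]:
        # Ignore the model and demonstrations entirely; return a constant prediction.
        first_label = list(self.template.label_map.keys())[0]
        return [{"prediction": first_label} for _ in test_examples]
```
To make such a decoder selectable via `--decoder`, it would also need to be registered wherever `get_decoder` resolves decoder names.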
71 |
72 |
73 | ### Supported Models
74 | We support 13 models with sizes ranging from 1 billion to 20 billion parameters, across 4 model families.
75 | | model family | identifiers |
76 | |:------------:|:------------|
77 | | BLOOM | `bloom1b1`, `bloom1b7`, `bloom3b`, `bloom7b1` |
78 | | GPT Neo* | `gptneo1b3`, `gptneo2b7`, `gptneox20b` |
79 | | LLaMA | `llama7b`, `llama13b` |
80 | | OPT | `opt1b3`, `opt2b7`, `opt6b7`, `opt13b` |
81 |
82 | ### Supported Tasks
83 | We include support for 9 tasks across classification (CLS), multiple-choice question-answering (MCQ) and generative question-answering (GQA).
84 | | Task | Task type | identifier |
85 | |:----:|:---------:|:-----------|
86 | | AG News | CLS | `ag_news` |
87 | | ANLI | CLS | `anli` |
88 | | BoolQ | CLS | `boolq` |
89 | | IMDB | CLS | `imdb` |
90 | | TweetEval Emotion | CLS | `tweet_emotion` |
91 | | HellaSwag | MCQ | `hellaswag` |
92 | | CosmosQA | MCQ | `cosmos_qa` |
93 | | NaturalQuestions Open | GQA | `nq_open` |
94 | | TriviaQA | GQA | `trivia_qa` |
95 |
96 | -------
97 |
98 | ## Results
99 | The results of a collection of experiments we performed for our paper are available [here](https://drive.google.com/file/d/1xKuAPfmMrMb7HJeJyT_kEW3mXkj1xzzV/view?usp=sharing). The results are organized by task, model, and metric. Each file contains the results of evaluating a single instruction on a single model and task, along a single metric and some associated metadata.
100 |
101 | ## Questions?
102 | Feel free to contact `anirudh.ajith@princeton.edu` or `chris.pan@princeton.edu` if you have any questions about the evaluation suite or our paper!
103 |
104 | ## Citation
105 | ```bibtex
106 | @misc{ajith2023instructeval,
107 | title={InstructEval: Systematic Evaluation of Instruction Selection Methods},
108 | author={Anirudh Ajith and Chris Pan and Mengzhou Xia and Ameet Deshpande and Karthik Narasimhan},
109 | year={2023},
110 | eprint={2307.00259},
111 | archivePrefix={arXiv},
112 | primaryClass={cs.CL}
113 | }
114 | ```
115 |
--------------------------------------------------------------------------------
/assets/figure.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/princeton-nlp/InstructEval/8abacb4ca609eb351837bec4c2b698471cf20aa6/assets/figure.png
--------------------------------------------------------------------------------
/configs/default_prompts/ag_news.j2:
--------------------------------------------------------------------------------
1 | {%- set dataset_name = "ag_news" -%}
2 | {%- set label_map = {0: 'World', 1:'Sports', 2:'Business', 3:'Sci/Tech'} -%}
3 | {{instruction}}
4 | {% for demonstration in demonstrations %}
5 | News: {{demonstration['text']}}
6 | Category: {{label_map[demonstration['label']]}}
7 | {% endfor %}
8 | News: {{test_example['text']}}
9 | Category: {{label_map[test_example['label']]}}
--------------------------------------------------------------------------------
/configs/default_prompts/anli.j2:
--------------------------------------------------------------------------------
1 | {%- set dataset_name = "anli" -%}
2 | {%- set label_map = {0: 'Entail', 1: 'Neutral', 2: 'Contradict'} -%}
3 | {{instruction}}
4 | {% for demonstration in demonstrations %}
5 | Premise: {{demonstration['premise']}}
6 | Hypothesis: {{demonstration['hypothesis']}}
7 | Relation: {{label_map[demonstration['label']]}}
8 | {% endfor %}
9 | Premise: {{test_example['premise']}}
10 | Hypothesis: {{test_example['hypothesis']}}
11 | Relation: {{label_map[test_example['label']]}}
--------------------------------------------------------------------------------
/configs/default_prompts/boolq.j2:
--------------------------------------------------------------------------------
1 | {%- set dataset_name = "boolq" -%}
2 | {%- set label_map = {True: 'True', False: 'False'} -%}
3 | {{instruction}}
4 | {% for demonstration in demonstrations %}
5 | Passage: {{demonstration['passage']}}
6 | Question: {{demonstration['question']}}
7 | Answer: {{label_map[demonstration['answer']]}}
8 | {% endfor %}
9 | Passage: {{test_example['passage']}}
10 | Question: {{test_example['question']}}
11 | Answer: {{label_map[test_example['answer']]}}
--------------------------------------------------------------------------------
/configs/default_prompts/cosmos_qa.j2:
--------------------------------------------------------------------------------
1 | {%- set dataset_name = "cosmos_qa" -%}
2 | {%- set label_map = {0: None, 1: None, 2: None, 3: None} -%}
3 | {{instruction}}
4 | {% for demonstration in demonstrations %}
5 | Passage: {{demonstration['context']}}
6 | Question: {{demonstration['question']}}
7 | Answer: {{demonstration['answer' ~ demonstration['label']]}}
8 | {% endfor %}
9 | Passage: {{test_example['context']}}
10 | Question: {{test_example['question']}}
11 | Answer: {{test_example['answer' ~ test_example['label']]}}
--------------------------------------------------------------------------------
/configs/default_prompts/hellaswag.j2:
--------------------------------------------------------------------------------
1 | {%- set dataset_name = "hellaswag" -%}
2 | {%- set label_map = {"0": None, "1": None, "2": None, "3": None} -%}
3 | {{instruction}}
4 | {% for demonstration in demonstrations %}
5 | Sentence: {{demonstration['ctx']}}
6 | Answer: {{demonstration['endings'][demonstration['label'] | int]}}
7 | {% endfor %}
8 | Sentence: {{test_example['ctx']}}
9 | Answer: {% if 'label' in test_example %}{{test_example['endings'][test_example['label'] | int]}}{% endif %}
--------------------------------------------------------------------------------
/configs/default_prompts/imdb.j2:
--------------------------------------------------------------------------------
1 | {%- set dataset_name = "imdb" -%}
2 | {%- set label_map = {0: 'Negative', 1: 'Positive'} -%}
3 | {{instruction}}
4 | {% for demonstration in demonstrations %}
5 | Review: {{demonstration['text']}}
6 | Sentiment: {{label_map[demonstration['label']]}}
7 | {% endfor %}
8 | Review: {{test_example['text']}}
9 | Sentiment: {{label_map[test_example['label']]}}
--------------------------------------------------------------------------------
/configs/default_prompts/nq_open.j2:
--------------------------------------------------------------------------------
1 | {%- set dataset_name = "nq_open" -%}
2 | {{instruction}}
3 | {% for demonstration in demonstrations %}
4 | Question: {{demonstration['question']}}
5 | Answer: {{demonstration['answer'][0]}}
6 | {% endfor %}
7 | Question: {{test_example['question']}}
8 | Answer: {% if ('answer' in test_example) and (test_example['answer']) %}{{test_example['answer'][0]}}{% endif %}
--------------------------------------------------------------------------------
/configs/default_prompts/trivia_qa.j2:
--------------------------------------------------------------------------------
1 | {%- set dataset_name = "trivia_qa" -%}
2 | {{instruction}}
3 | {% for demonstration in demonstrations %}
4 | Question: {{demonstration['question']}}
5 | Answer: {{demonstration['answer']['aliases'][0]}}
6 | {% endfor %}
7 | Question: {{test_example['question']}}
8 | Answer: {% if ('answer' in test_example) and (test_example['answer']) %}{{test_example['answer']['aliases'][0]}}{% endif %}
--------------------------------------------------------------------------------
/configs/default_prompts/tweet_emotion.j2:
--------------------------------------------------------------------------------
1 | {%- set dataset_name = "tweet_emotion" -%}
2 | {%- set label_map = {0: 'Anger', 1: 'Joy', 2: 'Optimism', 3: 'Sadness'} -%}
3 | {{instruction}}
4 | {% for demonstration in demonstrations %}
5 | Tweet: {{demonstration['text']}}
6 | Emotion: {{label_map[demonstration['label']]}}
7 | {% endfor %}
8 | Tweet: {{test_example['text']}}
9 | Emotion: {{label_map[test_example['label']]}}
--------------------------------------------------------------------------------
/configs/metric/few_shot_accuracy_defaults.yaml:
--------------------------------------------------------------------------------
1 | few_shot_accuracy:
2 | num_combinations: 30
3 | num_test_instances: 100
4 | num_demonstrations: 6
--------------------------------------------------------------------------------
/configs/metric/permutational_sensitivity_defaults.yaml:
--------------------------------------------------------------------------------
1 | permutational_sensitivity:
2 | num_permutations: 5
3 | num_combinations: 5
4 | num_test_instances: 100
5 | num_demonstrations: 6
--------------------------------------------------------------------------------
/configs/metric/perturbational_accuracy_defaults.yaml:
--------------------------------------------------------------------------------
1 | perturbational_accuracy:
2 | num_combinations: 30
3 | num_test_instances: 100
4 | num_demonstrations: 6
--------------------------------------------------------------------------------
/configs/metric/selectional_sensitivity_defaults.yaml:
--------------------------------------------------------------------------------
1 | selectional_sensitivity:
2 | num_combinations: 30
3 | num_test_instances: 100
4 | num_demonstrations: 6
--------------------------------------------------------------------------------
/configs/metric/zero_shot_accuracy_defaults.yaml:
--------------------------------------------------------------------------------
1 | zero_shot_accuracy:
2 | num_test_instances: 100
--------------------------------------------------------------------------------
/data/__init__.py:
--------------------------------------------------------------------------------
1 | from data.ag_news import AGNews
2 | from data.anli import ANLI
3 | from data.boolq import BoolQ
4 | from data.cosmos_qa import CosmosQA
5 | from data.dataset import Dataset
6 | from data.hellaswag import HellaSwag
7 | from data.nq_open import NQOpen
8 | from data.imdb import IMDB
9 | from data.trivia_qa import TriviaQA
10 | from data.tweet_emotion import TweetEmotion
11 |
12 |
13 | def get_dataset(name: str) -> Dataset:
14 | name2dataset = {
15 | "ag_news": AGNews,
16 | "imdb": IMDB,
17 | "anli": ANLI,
18 | "boolq": BoolQ,
19 | "tweet_emotion": TweetEmotion,
20 | "hellaswag": HellaSwag,
21 | "cosmos_qa": CosmosQA,
22 | "nq_open": NQOpen,
23 | "trivia_qa": TriviaQA
24 | }
25 | if name not in name2dataset:
26 | raise KeyError(f"Unrecognized dataset {name}")
27 | return name2dataset[name]()
28 |
--------------------------------------------------------------------------------
/data/ag_news.py:
--------------------------------------------------------------------------------
1 | import datasets
2 |
3 | from data.dataset import Dataset
4 |
5 |
6 | class AGNews(Dataset):
7 |
8 | def __init__(self):
9 | dataset = datasets.load_dataset("ag_news")
10 | super().__init__("ag_news", "CLS", dataset["train"], dataset["test"])
11 |
--------------------------------------------------------------------------------
/data/anli.py:
--------------------------------------------------------------------------------
1 | import datasets
2 |
3 | from data.dataset import Dataset
4 |
5 |
6 | class ANLI(Dataset):
7 |
8 | def __init__(self):
9 | dataset = datasets.load_dataset("anli")
10 | super().__init__("anli",
11 | "CLS",
12 | dataset["train_r1"],
13 | dataset["test_r1"],
14 | text_keys=["premise", "hypothesis"],
15 | label_key="label")
16 |
--------------------------------------------------------------------------------
/data/boolq.py:
--------------------------------------------------------------------------------
1 | import datasets
2 |
3 | from data.dataset import Dataset
4 |
5 |
6 | class BoolQ(Dataset):
7 |
8 | def __init__(self):
9 | dataset = datasets.load_dataset("boolq")
10 | super().__init__("boolq",
11 | "CLS",
12 | dataset["train"],
13 | dataset["validation"],
14 | text_keys=["question", "passage"],
15 | label_key="answer")
16 |
--------------------------------------------------------------------------------
/data/cosmos_qa.py:
--------------------------------------------------------------------------------
1 | import datasets
2 |
3 | from data.dataset import Dataset
4 |
5 |
6 | class CosmosQA(Dataset):
7 |
8 | def __init__(self):
9 | dataset = datasets.load_dataset("cosmos_qa")
10 | super().__init__("cosmos_qa",
11 | "MCQ",
12 | dataset["train"],
13 | dataset["validation"],
14 | text_keys=["context", "question"],
15 | label_key="label")
16 |
17 | def get_choices_per_instance(self, instance):
18 | return [
19 | instance['answer0'],
20 | instance['answer1'],
21 | instance['answer2'],
22 | instance['answer3']
23 | ]
24 |
--------------------------------------------------------------------------------
/data/dataset.py:
--------------------------------------------------------------------------------
1 | import datasets
2 | import random
3 | from typing import List, Dict, Any
4 |
5 | class Dataset:
6 |
7 | def __init__(self,
8 | name: str,
9 | task_type: str,
10 | train_split: datasets.Dataset,
11 | test_split: datasets.Dataset,
12 | text_keys: List[str] = ["text"],
13 | label_key: str = "label"):
14 |
15 | """
16 | Parent class for all datasets.
17 |
18 | name: str corresponding to the name of the dataset. (usually matches with filename)
19 | task_type: str corresponding to the type of task. (CLS, MCQ, GQA)
20 | train_split: huggingface dataset corresponding to the training set.
21 | test_split: huggingface dataset corresponding to the testing set.
22 | text_keys: keys representing long text fields.
23 | label_key: key that represents the prediction label.
24 | """
25 |
26 | self.name = name
27 | self.task_type = task_type
28 | self.label_key = label_key
29 | self.text_keys = text_keys
30 | self.splits = {"train": train_split, "test": test_split}
31 |
32 | # get unique classes and indices by class for each split for classification datasets
33 | if self.task_type == "CLS":
34 | self.classes = set(self.splits["train"][self.label_key])
35 | self.num_classes = len(self.classes)
36 | self.idxs_by_class = {split_name: self._get_idxs_by_class(split) for split_name, split in self.splits.items()}
37 |
38 | def _get_idxs_by_class(self, dataset: datasets.Dataset) -> Dict[str, List[int]]:
39 | idxs_by_class = {label: [] for label in self.classes}
40 | for i, label in enumerate(dataset[self.label_key]):
41 | idxs_by_class[label].append(i)
42 | return idxs_by_class
43 |
44 | def sample_instances(self,
45 | split: str,
46 | sample_size: int,
47 | seed: int = 0,
48 | balanced_sampling: bool = True,
49 | max_words: int = 100) -> List[Dict[str, Any]]:
50 |
51 | dataset = self.splits[split]
52 | if sample_size > len(dataset):
53 | raise ValueError(f"Sample size {sample_size} is larger than dataset size {len(dataset)}")
54 |
55 | random.seed(seed)
56 | sampled_indices = []
57 | # balanced sampling is only used for classification datasets
58 | if (self.task_type == "CLS") and balanced_sampling:
59 | dataset_by_label = self.idxs_by_class[split]
60 |
61 | # compute number of instances to sample per class
62 | naive_count_per_class = sample_size // self.num_classes
63 | num_remaining = sample_size % self.num_classes
64 | counts_per_class = \
65 | [naive_count_per_class] * (self.num_classes - num_remaining) + \
66 | [naive_count_per_class + 1] * num_remaining
67 |
68 | # sample instances from each class
69 | for count, label in zip(counts_per_class, self.classes):
70 | if len(dataset_by_label[label]) < count:
71 | raise ValueError(f"Insufficient number of instances for label {label} in split {split}. {len(dataset_by_label[label])} < {count}")
72 | sampled_indices.extend(random.sample(dataset_by_label[label], count))
73 |
74 | # shuffle the sampled indices
75 | random.shuffle(sampled_indices)
76 | else:
77 | sampled_indices.extend(random.sample(range(len(dataset)), sample_size))
78 |
79 | # get instances corresponding to sampled_indices from dataset
80 | sampled_instances = dataset[sampled_indices]
81 | sampled_instances = [dict(zip(sampled_instances, t)) for t in zip(*sampled_instances.values())] # convert to List[Dict]
82 |
83 | # truncate the text field at each instance
84 | if max_words:
85 | for instance in sampled_instances:
86 | for text_key in self.text_keys:
87 | words = instance[text_key].split(" ")
88 | if len(words) > max_words:
89 | instance[text_key] = " ".join(words[:max_words]) + "..."
90 |
91 | return sampled_instances
92 |
--------------------------------------------------------------------------------
/data/hellaswag.py:
--------------------------------------------------------------------------------
1 | import datasets
2 |
3 | from data.dataset import Dataset
4 |
5 |
6 | class HellaSwag(Dataset):
7 |
8 | def __init__(self):
9 | dataset = datasets.load_dataset("hellaswag")
10 | super().__init__("hellaswag",
11 | "MCQ",
12 | dataset["train"],
13 | dataset["validation"],
14 | text_keys=["ctx"],
15 | label_key="label")
16 |
17 | def get_choices_per_instance(self, instance):
18 | return instance['endings']
--------------------------------------------------------------------------------
/data/imdb.py:
--------------------------------------------------------------------------------
1 | import datasets
2 | from data.dataset import Dataset
3 |
4 | class IMDB(Dataset):
5 | def __init__(self):
6 | dataset = datasets.load_dataset("imdb", ignore_verifications=True)
7 | super().__init__("imdb", "CLS", dataset["train"], dataset["test"])
8 |
--------------------------------------------------------------------------------
/data/nq_open.py:
--------------------------------------------------------------------------------
1 | import datasets
2 |
3 | from data.dataset import Dataset
4 |
5 |
6 | class NQOpen(Dataset):
7 |
8 | def __init__(self):
9 | dataset = datasets.load_dataset("nq_open")
10 | super().__init__("nq_open",
11 | "GQA",
12 | dataset["train"],
13 | dataset["validation"],
14 | text_keys=["question"],
15 | label_key="answer")
16 |
--------------------------------------------------------------------------------
/data/trivia_qa.py:
--------------------------------------------------------------------------------
1 | import datasets
2 |
3 | from data.dataset import Dataset
4 |
5 |
6 | class TriviaQA(Dataset):
7 |
8 | def __init__(self):
9 | dataset = datasets.load_dataset("trivia_qa", "rc.web.nocontext")
10 | super().__init__("trivia_qa",
11 | "GQA",
12 | dataset["train"],
13 | dataset["validation"],
14 | text_keys=["question"],
15 | label_key="answer")
16 |
--------------------------------------------------------------------------------
/data/tweet_emotion.py:
--------------------------------------------------------------------------------
1 | import datasets
2 |
3 | from data.dataset import Dataset
4 |
5 |
6 | class TweetEmotion(Dataset):
7 |
8 | def __init__(self):
9 | dataset = datasets.load_dataset("tweet_eval", "emotion")
10 | super().__init__("tweet_emotion",
11 | "CLS",
12 | dataset["train"],
13 | dataset["test"],
14 | text_keys=["text"],
15 | label_key="label")
16 |
--------------------------------------------------------------------------------
/decoders/__init__.py:
--------------------------------------------------------------------------------
1 | from decoders.constrained_label_generation import ConstrainedLabelGeneration
2 | from decoders.constrained_per_example_label_generation import \
3 | ConstrainedPerExampleLabelGeneration
4 | from decoders.decoder import Decoder
5 | from decoders.greedy_generation import GreedyGeneration
6 | from decoders.nucleus_generation import NucleusGeneration
7 |
--------------------------------------------------------------------------------
/decoders/constrained_label_generation.py:
--------------------------------------------------------------------------------
1 | from typing import Tuple, List, Dict, Any, Union, Optional
2 |
3 | import numpy as np
4 | import torch
5 |
6 | from decoders.decoder import Decoder
7 | from models.causal_lm import CausalLM
8 | from templates.few_shot_template import FewShotTemplate
9 |
10 |
11 | class ConstrainedLabelGeneration(Decoder):
12 | """
13 | Decoder that uses the language model to find the lowest perplexity label
14 | from a static set of labels. Ideal for classification tasks with a fixed,
15 | known set of labels.
16 |
17 | Assumes the presence of a label_map in the template which maps huggingface
18 | labels to verbalizer strings. Uses the language model to find the lowest
19 | perplexity verbalizer string among this set.
20 | """
21 |
22 | def __init__(self, template: FewShotTemplate):
23 | super().__init__(template)
24 |
25 |
26 | def decode(
27 | self,
28 | model: CausalLM,
29 | demonstrations: List[Dict[str, Any]],
30 | test_examples: List[Dict[str, Any]],
31 | ) -> List[Dict[str, Any]]:
32 |
33 | """
34 | model: model to use for decoding.
35 | demonstrations: list of in-context demonstrations to use for decoding.
36 | test_examples: list of test examples to decode.
37 | """
38 |
39 | def tokenize(text: Union[str, List[str]]) -> Union[List[int], List[List[int]]]:
40 | return model.tokenizer(text).input_ids
41 |
42 | # get the huggingface labels and the corresponding verbalizers
43 | hf_labels = list(self.template.label_map.keys())
44 | verbalizers = list(self.template.label_map.values())
45 |
46 | # generate prompts for each test example and tokenize them
47 | prompts = [self.template.render(demonstrations, test_example) for test_example in test_examples]
48 | prompt_ids = tokenize(prompts)
49 |
50 | # get the longest common prefix of the prompts. Contains the few-shot demonstrations.
51 | lc_prefix_ids = self._longest_common_prefix(prompt_ids)
52 | past_key_values, past_last_logit = self._get_forward_cache(model, lc_prefix_ids)
53 |
54 | results = []
55 | for prompt in prompts:
56 | # candidate_completions correspond to answered test examples
57 | candidate_answered_prompts = [tokenize(prompt + verbalizer) for verbalizer in verbalizers]
58 | candidate_completions = [candidate_completion[len(lc_prefix_ids):] for candidate_completion in candidate_answered_prompts]
59 |
60 | # find index where the verbalizer begins in the candidate_completions
61 | label_idx = len(tokenize(prompt.rstrip())) - len(lc_prefix_ids)
62 |
63 | # get the perplexities of the verbalizers and compute prediction
64 | verbalizer_perplexities = self._get_verbalizer_perplexities(
65 | model,
66 | candidate_completions,
67 | label_idx,
68 | past_key_values,
69 | past_last_logit,
70 | )
71 | prediction = hf_labels[np.argmin(verbalizer_perplexities)]
72 |
73 | results.append({
74 | "prediction": prediction,
75 | "perplexities": verbalizer_perplexities,
76 | })
77 |
78 | return results
79 |
80 | def _get_forward_cache(
81 | self,
82 | model: CausalLM,
83 | input_ids: List[int],
84 | ) -> Tuple[Optional[Tuple[Tuple[torch.FloatTensor]]], Optional[torch.Tensor]]:
85 | # computes a forward pass on the input_ids, and returns the
86 | # corresponding past_key_values and past_last_logit
87 |
88 | if len(input_ids) == 0:
89 | return None, None
90 |
91 | with torch.no_grad():
92 | input_ids = torch.tensor([input_ids], dtype=int).to(model.device)
93 | model_output = model.hf_model.forward(
94 | input_ids=input_ids,
95 | use_cache=True
96 | )
97 |
98 | past_key_values = model_output["past_key_values"]
99 | past_last_logit = model_output["logits"][:, -1, :]
100 |
101 | return past_key_values, past_last_logit
102 |
103 | def _get_verbalizer_perplexities(
104 | self,
105 | model: CausalLM,
106 | completions: List[List[int]],
107 | label_idx: int,
108 | past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
109 | past_last_logit: Optional[torch.Tensor] = None,
110 | ):
111 |
112 | if (past_key_values is None) ^ (past_last_logit is None):
113 | raise ValueError("Only one of past_key_values and past_last_logit were passed. Expected both or neither.")
114 | if past_last_logit is None:
115 | # dummy past_last_logit if it is not passed. (just to get indexing to line up properly)
116 | past_last_logit = torch.zeros((1, model.hf_model.config.vocab_size), dtype=float).to(model.device)
117 |
118 | perplexities = []
119 | for completion in completions:
120 | with torch.no_grad():
121 | input_ids = torch.tensor([completion], dtype=int).to(model.device)
122 | output = model.hf_model.forward(input_ids=input_ids, past_key_values=past_key_values)
123 |
124 | logits = torch.concat([past_last_logit.unsqueeze(1), output["logits"]], axis=1)[0, :-1, :]
125 | label_ids = input_ids[0, label_idx:]
126 | label_logits = logits[label_idx:, :]
127 |
128 | probs = torch.softmax(label_logits.to(dtype=torch.float32), dim=-1)
129 | token_probs = probs[range(len(label_ids)), label_ids]
130 | perplexities.append(-torch.mean(torch.log(token_probs)).item())
131 |
132 | return perplexities
133 |
134 | def _longest_common_prefix(self, id_lists: List[List[int]]):
135 | if len(id_lists) == 1:
136 | return id_lists[0]
137 | ids_sorted = sorted(id_lists)
138 | first = ids_sorted[0]
139 | last = ids_sorted[-1]
140 | for i in range(min(len(first), len(last))):
141 | if first[i] != last[i]:
142 | return first[:i]
143 | return first if len(first) < len(last) else last
144 |
145 |
--------------------------------------------------------------------------------
/decoders/constrained_per_example_label_generation.py:
--------------------------------------------------------------------------------
1 | from typing import Tuple, List, Dict, Any, Union, Optional
2 |
3 | import numpy as np
4 | import torch
5 |
6 | from data.dataset import Dataset
7 | from decoders.decoder import Decoder
8 | from models.causal_lm import CausalLM
9 | from templates.few_shot_template import FewShotTemplate
10 |
11 |
12 | class ConstrainedPerExampleLabelGeneration(Decoder):
13 | """
14 | Decoder that uses the language model to find the lowest perplexity string
15 | from a dynamic set of labels. Meant for MCQ tasks where the label-space
16 | is a function of the test input.
17 |
18 | Ignores the values in the label_map and uses the labels in the test_examples.
19 | Keys in the label_maps should still correspond to the huggingface labels.
20 | Uses the language model to find the lowest perplexity verbalizer string among
21 | the options in the test examples.
22 | """
23 |
24 | def __init__(
25 | self,
26 | template: FewShotTemplate,
27 | dataset: Dataset,
28 | ):
29 | self.dataset = dataset
30 | super().__init__(template)
31 |
32 |
33 | def decode(
34 | self,
35 | model: CausalLM,
36 | demonstrations: List[Dict[str, Any]],
37 | test_examples: List[Dict[str, Any]],
38 | ) -> List[Dict[str, Any]]:
39 |
40 | """
41 | model: model to use for decoding.
42 | demonstrations: list of in-context demonstrations to use for decoding.
43 | test_examples: list of test examples to decode.
44 | """
45 |
46 | def tokenize(text: Union[str, List[str]]) -> Union[List[int], List[List[int]]]:
47 | return model.tokenizer(text).input_ids
48 |
49 | # get the huggingface labels
50 | hf_labels = list(self.template.label_map.keys())
51 |
52 | # generate prompts for each test example and tokenize them
53 | prompts = [self.template.render(demonstrations, test_example) for test_example in test_examples]
54 | prompt_ids = tokenize(prompts)
55 |
56 | # get the longest common prefix of the prompts. Contains the few-shot demonstrations.
57 | lc_prefix_ids = self._longest_common_prefix(prompt_ids)
58 | past_key_values, past_last_logit = self._get_forward_cache(model, lc_prefix_ids)
59 |
60 | results = []
61 | for prompt, test_example in zip(prompts, test_examples):
62 | # candidate_completions correspond to answered test examples
63 | verbalizers = self.dataset.get_choices_per_instance(test_example)
64 | candidate_answered_prompts = [tokenize(prompt + verbalizer) for verbalizer in verbalizers]
65 | candidate_completions = [candidate_completion[len(lc_prefix_ids):] for candidate_completion in candidate_answered_prompts]
66 |
67 | # find index where the verbalizer begins in the candidate_completions
68 | label_idx = len(tokenize(prompt.rstrip())) - len(lc_prefix_ids)
69 |
70 | # get the perplexities of the verbalizers and compute prediction
71 | verbalizer_perplexities = self._get_verbalizer_perplexities(
72 | model,
73 | candidate_completions,
74 | label_idx,
75 | past_key_values,
76 | past_last_logit,
77 | )
78 | prediction = hf_labels[np.argmin(verbalizer_perplexities)]
79 |
80 | results.append({
81 | "prediction": prediction,
82 | "perplexities": verbalizer_perplexities,
83 | })
84 |
85 | return results
86 |
87 | def _get_forward_cache(
88 | self,
89 | model: CausalLM,
90 | input_ids: List[int],
91 | ) -> Tuple[Optional[Tuple[Tuple[torch.FloatTensor]]], Optional[torch.Tensor]]:
92 | # computes a forward pass on the input_ids, and returns the
93 | # corresponding past_key_values and past_last_logit
94 |
95 | if len(input_ids) == 0:
96 | return None, None
97 |
98 | with torch.no_grad():
99 | input_ids = torch.tensor([input_ids], dtype=int).to(model.device)
100 | model_output = model.hf_model.forward(
101 | input_ids=input_ids,
102 | use_cache=True
103 | )
104 |
105 | past_key_values = model_output["past_key_values"]
106 | past_last_logit = model_output["logits"][:, -1, :]
107 |
108 | return past_key_values, past_last_logit
109 |
110 | def _get_verbalizer_perplexities(
111 | self,
112 | model: CausalLM,
113 | completions: List[List[int]],
114 | label_idx: int,
115 | past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
116 | past_last_logit: Optional[torch.Tensor] = None,
117 | ):
118 |
119 | if (past_key_values is None) ^ (past_last_logit is None):
120 | raise ValueError("Only one of past_key_values and past_last_logit were passed. Expected both or neither.")
121 | if past_last_logit is None:
122 | # dummy past_last_logit if it is not passed. (just to get indexing to line up properly)
123 | past_last_logit = torch.zeros((1, model.hf_model.config.vocab_size), dtype=float).to(model.device)
124 |
125 | perplexities = []
126 | for completion in completions:
127 | with torch.no_grad():
128 | input_ids = torch.tensor([completion], dtype=int).to(model.device)
129 | output = model.hf_model.forward(input_ids=input_ids, past_key_values=past_key_values)
130 |
131 | logits = torch.concat([past_last_logit.unsqueeze(1), output["logits"]], axis=1)[0, :-1, :]
132 | label_ids = input_ids[0, label_idx:]
133 | label_logits = logits[label_idx:, :]
134 |
135 | probs = torch.softmax(label_logits.to(dtype=torch.float32), dim=-1)
136 | token_probs = probs[range(len(label_ids)), label_ids]
137 | perplexities.append(-torch.mean(torch.log(token_probs)).item())
138 |
139 | return perplexities
140 |
141 | def _longest_common_prefix(self, id_lists: List[List[int]]):
142 | if len(id_lists) == 1:
143 | return id_lists[0]
144 | ids_sorted = sorted(id_lists)
145 | first = ids_sorted[0]
146 | last = ids_sorted[-1]
147 | for i in range(min(len(first), len(last))):
148 | if first[i] != last[i]:
149 | return first[:i]
150 | return first if len(first) < len(last) else last
151 |
152 |
--------------------------------------------------------------------------------
/decoders/decoder.py:
--------------------------------------------------------------------------------
1 | from typing import List, Dict, Any
2 |
3 | from models.causal_lm import CausalLM
4 | from templates.few_shot_template import FewShotTemplate
5 |
6 |
7 | class Decoder:
8 |
9 | def __init__(self, template: FewShotTemplate):
10 | self.template = template
11 |
12 | def decode(
13 | self,
14 | model: CausalLM,
15 | demonstrations: List[Dict[str, Any]],
16 | test_examples: List[Dict[str, Any]],
17 | ) -> List[dict]:
18 | raise NotImplementedError
19 |
--------------------------------------------------------------------------------
/decoders/greedy_generation.py:
--------------------------------------------------------------------------------
1 | from typing import Tuple, List, Optional, Dict, Any, Union
2 |
3 | import torch
4 |
5 | from decoders.decoder import Decoder
6 | from models.causal_lm import CausalLM
7 | from templates.few_shot_template import FewShotTemplate
8 |
9 |
10 | class GreedyGeneration(Decoder):
11 | """
12 | Decoder that uses the language model to greedily generate tokens when
13 | conditioned on the prompt. Meant for generation tasks where the label-space
14 | is unconstrained.
15 |
16 | Does not assume any label_map in the template. Uses only the language model
17 | to generate the output.
18 | """
19 |
20 | def __init__(self, template: FewShotTemplate, max_length: int = 10):
21 | self.max_length = max_length
22 | super().__init__(template)
23 |
24 | def decode(
25 | self,
26 | model: CausalLM,
27 | demonstrations: List[Dict[str, Any]],
28 | test_examples: List[Dict[str, Any]],
29 | ) -> List[Dict[str, Any]]:
30 |
31 | """
32 | model: model to use for decoding.
33 | demonstrations: list of in-context demonstrations to use for decoding.
34 | test_examples: list of test examples to decode.
35 | """
36 |
37 | def tokenize(text: Union[str, List[str]]) -> Union[List[int], List[List[int]]]:
38 | return model.tokenizer(text).input_ids
39 |
40 | # generate prompts for each test example and tokenize them
41 | prompts = [self.template.render(demonstrations, test_example) for test_example in test_examples]
42 | prompt_ids = tokenize(prompts)
43 |
44 | # get the longest common prefix of the prompts. Contains the few-shot demonstrations.
45 | lc_prefix_ids = self._longest_common_prefix(prompt_ids)
46 | past_key_values, _ = self._get_forward_cache(model, lc_prefix_ids)
47 |
48 | results = []
49 | for prompt in prompts:
50 | # find tokens remaining after removing the prefix
51 | input_ids = tokenize(prompt.rstrip())[len(lc_prefix_ids):]
52 |
53 | # generate continuation using greedy sampling
54 | generated_ids = self._greedy_sampling(
55 | model,
56 | input_ids,
57 | max_length=self.max_length,
58 | past_key_values=past_key_values,
59 | )
60 |
61 | # convert generated tokens to text
62 | generated_text = model.tokenizer.decode(generated_ids)
63 | prediction = generated_text.splitlines()[0]
64 | results.append({"prediction": prediction})
65 |
66 | return results
67 |
68 | def _greedy_sampling(
69 | self,
70 | model: CausalLM,
71 | input_ids: List[int],
72 | max_length: int,
73 | past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
74 | ) -> List[int]:
75 | """Generate tokens using greedy sampling."""
76 |
77 | input_ids = torch.tensor([input_ids], dtype=int).to(model.device)
78 | generated_ids = []
79 |
80 | # Generate the next token using greedy sampling
81 | for _ in range(max_length):
82 | with torch.no_grad():
83 | outputs = model.forward(input_ids, past_key_values=past_key_values)
84 | logits = outputs.logits[:, -1, :]
85 | next_token = torch.argmax(logits, dim=-1).unsqueeze(0)
86 |
87 | # Set input_ids to generated token and update past_key_values
88 | input_ids = next_token
89 | past_key_values = outputs.past_key_values
90 |
91 | # Append generated token to the list
92 | generated_ids.append(next_token.squeeze().item())
93 |
94 | # Return generated tokens
95 | return generated_ids
96 |
97 | def _get_forward_cache(
98 | self,
99 | model: CausalLM,
100 | input_ids: List[int],
101 | ) -> Tuple[Optional[Tuple[Tuple[torch.FloatTensor]]], Optional[torch.Tensor]]:
102 | # computes a forward pass on the input_ids, and returns the
103 | # corresponding past_key_values and past_last_logit
104 |
105 | if len(input_ids) == 0:
106 | return None, None
107 |
108 | with torch.no_grad():
109 | input_ids = torch.tensor([input_ids], dtype=int).to(model.device)
110 | model_output = model.hf_model.forward(
111 | input_ids=input_ids,
112 | use_cache=True
113 | )
114 |
115 | past_key_values = model_output["past_key_values"]
116 | past_last_logit = model_output["logits"][:, -1, :]
117 |
118 | return past_key_values, past_last_logit
119 |
120 | def _longest_common_prefix(self, id_lists: List[List[int]]):
121 | if len(id_lists) == 1:
122 | return id_lists[0]
123 | ids_sorted = sorted(id_lists)
124 | first = ids_sorted[0]
125 | last = ids_sorted[-1]
126 | for i in range(min(len(first), len(last))):
127 | if first[i] != last[i]:
128 | return first[:i]
129 | return first if len(first) < len(last) else last
--------------------------------------------------------------------------------
/decoders/nucleus_generation.py:
--------------------------------------------------------------------------------
1 | import copy
2 | import random
3 | import warnings
4 | from typing import Tuple, List, Optional, Any, Union, Dict
5 |
6 | import numpy as np
7 | import torch
8 |
9 | from decoders.decoder import Decoder
10 | from models.causal_lm import CausalLM
11 | from templates.few_shot_template import FewShotTemplate
12 |
13 |
14 | class NucleusGeneration(Decoder):
15 | """
16 | Decoder that uses the language model to generate tokens via nucleus (top-p)
17 | sampling when conditioned on the prompt. Meant for generation tasks where
18 | the label-space is unconstrained.
19 |
20 | Does not assume any label_map in the template. Uses only the language model
21 | to generate the output.
22 | """
23 |
24 | def __init__(self, template: FewShotTemplate, max_length: int = 10, temperature: float = 0.7, top_p: float = 0.9):
25 | self.max_length = max_length
26 | self.temperature = temperature
27 | self.top_p = top_p
28 | super().__init__(template)
29 |
30 | def decode(
31 | self,
32 | model: CausalLM,
33 | demonstrations: List[Dict[str, Any]],
34 | test_examples: List[Dict[str, Any]],
35 | ) -> List[Dict[str, Any]]:
36 |
37 | """
38 | model: model to use for decoding.
39 | demonstrations: list of in-context demonstrations to use for decoding.
40 | test_examples: list of test examples to decode.
41 | """
42 |
43 | def tokenize(text: Union[str, List[str]]) -> Union[List[int], List[List[int]]]:
44 | return model.tokenizer(text).input_ids
45 |
46 | # generate prompts for each test example and tokenize them
47 | prompts = [self.template.render(demonstrations, test_example) for test_example in test_examples]
48 | prompt_ids = tokenize(prompts)
49 |
50 | # get the longest common prefix of the prompts. Contains the few-shot demonstrations.
51 | lc_prefix_ids = self._longest_common_prefix(prompt_ids)
52 | past_key_values, _ = self._get_forward_cache(model, lc_prefix_ids)
53 |
54 | results = []
55 | for prompt in prompts:
56 | # find tokens remaining after removing the prefix
57 | input_ids = tokenize(prompt.rstrip())[len(lc_prefix_ids):]
58 |
59 | # generate continuation using nucleus sampling
60 | generated_ids = self._nucleus_sampling(
61 | model,
62 | input_ids,
63 | max_length=self.max_length,
64 | past_key_values=past_key_values,
65 | top_p=self.top_p,
66 | temperature=self.temperature,
67 | )
68 |
69 | # convert generated tokens to text
70 | generated_text = model.tokenizer.decode(generated_ids)
71 | prediction = generated_text.splitlines()[0]
72 | results.append({"prediction": prediction})
73 |
74 | return results
75 |
76 | def _nucleus_sampling(
77 | self,
78 | model: CausalLM,
79 | input_ids: List[int],
80 | max_length: int,
81 | temperature: float,
82 | top_p: float,
83 | past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
84 | ):
85 | """Generate text using nucleus sampling."""
86 |
87 | input_ids = torch.tensor([input_ids], dtype=int).to(model.device)
88 | generated_ids = []
89 |
90 | # Generate the next token using nucleus sampling
91 | for _ in range(max_length):
92 | with torch.no_grad():
93 | outputs = model.forward(input_ids, past_key_values=past_key_values)
94 | logits = outputs.logits[:, -1, :] / temperature
95 | probs = torch.softmax(logits, dim=-1)
96 | sorted_probs, sorted_indices = torch.sort(probs, descending=True)
97 | cumulative_probs = torch.cumsum(sorted_probs, dim=-1)
98 | sorted_indices_to_remove = cumulative_probs > top_p
99 | # to shift mask one step to the right to include the token that first exceeds top_p
100 | sorted_indices_to_remove[:, 1:] = sorted_indices_to_remove[..., :-1].clone()
101 | # do not remove the first token even if it alone exceeds top_p
102 | sorted_indices_to_remove[:, 0] = False
103 | indices_to_remove = sorted_indices_to_remove.scatter(
104 | dim=-1, index=sorted_indices, src=sorted_indices_to_remove)
105 | probs[indices_to_remove] = 0
106 | probs /= probs.sum()
107 | next_token = torch.multinomial(probs, num_samples=1)
108 |
109 | # Set input_ids to generated token and update past_key_values
110 | input_ids = next_token
111 | past_key_values = outputs.past_key_values
112 |
113 | # Append generated token to the list
114 | generated_ids.append(next_token.squeeze().item())
115 |
116 | # Return generated tokens
117 | return generated_ids
118 |
119 |
120 | def _get_forward_cache(
121 | self,
122 | model: CausalLM,
123 | input_ids: List[int],
124 | ) -> Tuple[Optional[Tuple[Tuple[torch.FloatTensor]]], Optional[torch.Tensor]]:
125 | # computes a forward pass on the input_ids, and returns the
126 | # corresponding past_key_values and past_last_logit
127 |
128 | if len(input_ids) == 0:
129 | return None, None
130 |
131 | with torch.no_grad():
132 | input_ids = torch.tensor([input_ids], dtype=int).to(model.device)
133 | model_output = model.hf_model.forward(
134 | input_ids=input_ids,
135 | use_cache=True
136 | )
137 |
138 | past_key_values = model_output["past_key_values"]
139 | past_last_logit = model_output["logits"][:, -1, :]
140 |
141 | return past_key_values, past_last_logit
142 |
143 | def _longest_common_prefix(self, id_lists: List[List[int]]):
144 | if len(id_lists) == 1:
145 | return id_lists[0]
146 | ids_sorted = sorted(id_lists)
147 | first = ids_sorted[0]
148 | last = ids_sorted[-1]
149 | for i in range(min(len(first), len(last))):
150 | if first[i] != last[i]:
151 | return first[:i]
152 | return first if len(first) < len(last) else last
--------------------------------------------------------------------------------
/evaluate_instruction.py:
--------------------------------------------------------------------------------
1 | from typing import Tuple, Dict, Any
2 | import argparse
3 | import os
4 | import sys
5 | import yaml
6 |
7 | from data import *
8 | from decoders import *
9 | from metrics import *
10 | from models import *
11 | from templates import *
12 | from utils import *
13 |
14 | def get_metric_name_config(args) -> Tuple[str, Dict[str, Any]]:
15 | with open(args.metric_config, "r") as f:
16 | metric_config = yaml.safe_load(f)
17 | metric_name = list(metric_config.keys())[0]
18 | return metric_name, metric_config
19 |
20 | def get_instruction(args) -> str:
21 | if os.path.isfile(os.path.join(args.instructions_dir, args.dataset + ".yaml")):
22 | # model-agnostic instructions found
23 | instructions_file = os.path.join(args.instructions_dir, args.dataset + ".yaml")
24 | elif os.path.isfile(os.path.join(args.instructions_dir, args.model, args.dataset + ".yaml")):
25 | # model-specific instructions found
26 | instructions_file = os.path.join(args.instructions_dir, args.model, args.dataset + ".yaml")
27 | else:
28 | # no instructions found
29 | raise ValueError(f"No matching instructions file in {args.instructions_dir}")
30 |
31 | with open(instructions_file, "r") as f:
32 | instructions_list = yaml.safe_load(f)
33 | if args.index < 0 or args.index >= len(instructions_list):
34 | raise ValueError(f"Index {args.index} out of bounds for {len(instructions_list)} instructions in {instructions_file}.")
35 | instruction = instructions_list[args.index]
36 | return instruction
37 |
38 | if __name__ == "__main__":
39 |
40 | # parse arguments
41 | parser = argparse.ArgumentParser()
42 | parser.add_argument("--model", type=str, required=True, help="Model name")
43 | parser.add_argument("--dataset", type=str, required=True, help="Dataset name")
44 | parser.add_argument("--decoder", type=str, required=False, help="Decoder name")
45 | parser.add_argument("--metric_config", type=str, required=True, help="Metric config file")
46 | parser.add_argument("--instructions_dir", type=str, required=True, help="Directory containing instruction files")
47 | parser.add_argument( "--index", type=int, required=True, help="Index of instruction to evaluate in dataset's instruction file")
48 | parser.add_argument( "--prompt_template_dir", type=str, required=False, default="configs/default_prompts", help="Directory containing prompt templates for each dataset")
49 | parser.add_argument("--results_dir", type=str, required=False, default="results/", help="Directory to write results to")
50 | args = parser.parse_args()
51 |
52 | # initialize objects
53 | metric_name, metric_config = get_metric_name_config(args)
54 | instruction = get_instruction(args)
55 | prompt_template = InstructionBasedFewShotTemplate(
56 | instruction=instruction,
57 | jinja2_file_path=os.path.join(args.prompt_template_dir, args.dataset + ".j2")
58 | )
59 | dataset_name = prompt_template.dataset_name
60 | model = get_model(args.model)
61 | dataset = get_dataset(dataset_name)
62 | decoder_name = args.decoder or default_decoder_name(dataset.task_type)
63 | decoder = get_decoder(decoder_name, prompt_template, dataset)
64 | metric = get_metric(metric_name, model, dataset, prompt_template, decoder, metric_config)
65 |
66 | metadata_dict = {
67 | "model": args.model,
68 | "dataset": dataset_name,
69 | "metric": metric_name,
70 | "decoder": decoder_name,
71 | "metric_config": metric_config,
72 | "instruction": instruction,
73 | "instructions_dir": args.instructions_dir,
74 | }
75 |
76 | result_filename = get_filename_from_metadata(metadata_dict)
77 | result_path = os.path.join(args.results_dir, result_filename)
78 | if os.path.exists(result_path):
79 | print(f"Results already exist for this configuration at {result_path}.")
80 | print("Exiting...")
81 | else:
82 | # Evaluate metric
83 | inputs = metric.create_inputs()
84 | results = metric.evaluate(inputs)
85 | # Write results to disk
86 | write_results(args.results_dir, result_filename, metadata_dict, results)
87 |
--------------------------------------------------------------------------------
/instructions/ape/ag_news.yaml:
--------------------------------------------------------------------------------
1 | - "classify each input into one of the following categories: World, U.S., Business, Sci/Tech, or Sports."
2 | - "classify the input into one of five categories: World, U.S., Business, Sci/Tech, or Sports."
3 | - "produce an output of 'Sci/Tech' for the first input, 'Business' for the second input, 'World' for the third input, 'Sci/Tech' for the fourth input, and 'Sports' for the fifth input"
4 | - "categorize the inputs into one of five categories: World, Business, Sci/Tech, Sports, or Other."
5 | - "write a program that outputs 'Sci/Tech' if the input is about science or technology, 'Business' if the input is about business, 'World' if the input is about world news, and 'Sports' if the input is"
--------------------------------------------------------------------------------
/instructions/ape/anli.yaml:
--------------------------------------------------------------------------------
1 | - "produce a list of input-output pairs that are Entail, Contradict, or Neutral."
2 | - "produce an entailment if the hypothesis is entailed by the premise, and to produce a neutral if the hypothesis is not entailed by the premise."
3 | - "produce a list of input-output pairs that are either entailments, contradictions, or neutral."
4 | - "produce an entailment if the hypothesis followed logically from the premise, a contradiction if the hypothesis contradicted the premise, and neutral if the hypothesis was not entailed or contradicted by the premise."
5 | - "produce an entailment if the hypothesis is entailed by the premise, a contradiction if the hypothesis contradicts the premise, and neutral if the hypothesis is neither entailed nor contradicted by the premise."
--------------------------------------------------------------------------------
/instructions/ape/boolq.yaml:
--------------------------------------------------------------------------------
1 | - "produce a true or false output based on the question asked about the passage given."
2 | - "produce a true or false statement based on the passage given."
3 | - "produce input-output pairs that were either true or false."
4 | - "produce a true or false output based on whether the passage contains the information required to answer the question."
5 | - "determine whether the statement in the question is true or false based on the information in the passage.s"
--------------------------------------------------------------------------------
/instructions/ape/cosmos_qa.yaml:
--------------------------------------------------------------------------------
1 | - "create a list of input-output pairs."
2 | - "buy a gift for a friend."
3 | - "create an input-output pair based on the given context and question."
4 | - "create input-output pairs based on a given context."
5 | - "provide a list of input-output pairs."
--------------------------------------------------------------------------------
/instructions/ape/hellaswag.yaml:
--------------------------------------------------------------------------------
1 | - ":\n\nPick a ripe avocado.\n\nThe person who produced the input-output pairs followed the instruction correctly."
2 | - " fill a 3/4 full tank of water to prevent the hose from freezing. The input-output pairs show that the friend followed the instruction correctly."
3 | - "fill a tank 3/4 of the way with water to prevent the water hose from freezing."
4 | - "create a input-output pair based on the given instruction."
5 | - "file a claim with the small claims clerk."
--------------------------------------------------------------------------------
/instructions/ape/imdb.yaml:
--------------------------------------------------------------------------------
1 | - "produce a list of input-output pairs for a sentiment analysis program."
2 | - "give a positive review if the movie was good and a negative review if the movie was bad."
3 | - "write a positive review, and the output was positive."
4 | - "give a positive review if the input was positive and a negative review if the input was negative."
5 | - "give a review of a movie, and the input-output pairs show reviews that are positive."
--------------------------------------------------------------------------------
/instructions/ape/nq_open.yaml:
--------------------------------------------------------------------------------
1 | - "use Google search."
2 | - "type a question into a search engine and to record the answer that the search engine provides."
3 | - "use Google to find the answer to a question."
4 | - "use Google search to find the answer to the input."
5 | - "'look up the answer to a question on the internet.'"
--------------------------------------------------------------------------------
/instructions/ape/trivia_qa.yaml:
--------------------------------------------------------------------------------
1 | - "produce input-output pairs for the game show Jeopardy!."
2 | - "Find the Wikipedia page for the thing mentioned in the input, and then look for the first link on that page (not in parentheses)"
3 | - " use Google to find the answer to the question."
4 | - "'name the royal house that Charles VI belonged to.' The input-output pair 'Valois (disambiguation)' is incorrect because it does not name a royal house."
5 | - "produce input-output pairs for the game 20 Questions."
--------------------------------------------------------------------------------
/instructions/ape/tweet_emotion.yaml:
--------------------------------------------------------------------------------
1 | - "list all of the possible emotions that could be conveyed in the inputs. The possible emotions are sadness, anger, and joy."
2 | - "produce an output of 'Joy' for inputs that express positive emotion, 'Sadness' for inputs that express negative emotion, and 'Optimism' for inputs that express a hope for the future."
3 | - "predict the emotion of the user based on the text."
4 | - "identify the emotion being expressed in the input."
5 | - "identify the emotions in the following tweets"
--------------------------------------------------------------------------------
/instructions/chat_gpt_prompts/ag_news.yaml:
--------------------------------------------------------------------------------
1 | - How would you categorize this particular news story?
2 | - Which part of a newspaper would you expect this news article to be in - Global News,
3 | Sports, Commerce, or Science and Technology?
4 | - Which newspaper section is most likely to feature this news article?
5 | - How would you categorize this news article?
6 | - Is this news article about World Affairs, Sports, Commerce, or Science and Technology?
7 |
--------------------------------------------------------------------------------
/instructions/chat_gpt_prompts/anli.yaml:
--------------------------------------------------------------------------------
1 | - Is there enough evidence to confirm or reject the hypothesis based on the description?
2 | - Is the hypothesis consistent with what we know about the subject matter and the
3 | information in the passage?
4 | - Does the hypothesis logically follow from the information presented in the passage?
5 | - Given the available information, can we say that the hypothesis is factual or not?
6 | - Does the hypothesis match the information provided in the passage and our general
7 | understanding of the topic?
8 |
--------------------------------------------------------------------------------
/instructions/chat_gpt_prompts/boolq.yaml:
--------------------------------------------------------------------------------
1 | - Assess whether a question is true or false based on the information provided in
2 | the passage.
3 | - Evaluate the accuracy of a question based on the information in the passage.
4 | - Ascertain the validity of a question based on the passage.
5 | - Determine if the answer to a question is true or false based solely on the given
6 | passage.
7 | - Respond to a true/false question by referring to the given passage.
8 |
--------------------------------------------------------------------------------
/instructions/chat_gpt_prompts/cosmos_qa.yaml:
--------------------------------------------------------------------------------
1 | - Given the context, determine the most suitable answer to the question.
2 | - Carefully consider the information below and select the most accurate answer to the question.
3 | - Carefully read the following information and provide the answer to the question.
4 | - According to the context provided, pick the correct answer to the question.
5 | - Make use of the given context to answer the question.
6 |
--------------------------------------------------------------------------------
/instructions/chat_gpt_prompts/hellaswag.yaml:
--------------------------------------------------------------------------------
1 | - Complete the sentence with an appropriate conclusion.
2 | - Fill in the blank to finish the sentence.
3 | - Craft the ending for the sentence in question.
4 | - Write the final part of the sentence.
5 | - Close the description appropriately.
6 |
--------------------------------------------------------------------------------
/instructions/chat_gpt_prompts/imdb.yaml:
--------------------------------------------------------------------------------
1 | - What is the writer's impression of the film?
2 | - What is the general mood of the movie review?
3 | - Does the review give a positive or negative assessment of the film?
4 | - How does the writer feel about the movie?
5 | - Does the reviewer recommend this movie?
6 |
--------------------------------------------------------------------------------
/instructions/chat_gpt_prompts/nq_open.yaml:
--------------------------------------------------------------------------------
1 | - Anticipate a concise answer for the following queries.
2 | - 'I have long wondered:'
3 | - Give an answer to the following question.
4 | - Respond to the subsequent question.
5 | - 'I have often pondered:'
6 |
--------------------------------------------------------------------------------
/instructions/chat_gpt_prompts/trivia_qa.yaml:
--------------------------------------------------------------------------------
1 | - Predict a brief response for the following inquiries.
2 | - Reply to each question with a brief statement.
3 | - 'I have always pondered:'
4 | - The purpose is to anticipate an English answer for the given English inquiry.
5 | - 'I have been pondering this for a while:'
6 |
--------------------------------------------------------------------------------
/instructions/chat_gpt_prompts/tweet_emotion.yaml:
--------------------------------------------------------------------------------
1 | - 'To achieve a high score, select the correct emotion from the options provided:
2 | anger, joy, optimism, sadness.'
3 | - Which sentiment, anger, joy, optimism, or sadness, accurately characterizes the
4 | emotion of the individual who authored the tweet below?
5 | - Determine the sentiment that is most accurately portrayed in the following tweet.
6 | - Which of the four emotions (anger, joy, optimism, sadness) is best reflected in
7 | the tweet below?
8 | - Which of the following choices (anger, joy, optimism, or sadness) best matches the
9 | emotion depicted in the tweet?
10 |
--------------------------------------------------------------------------------
/instructions/generic_instruction/ag_news.yaml:
--------------------------------------------------------------------------------
1 | - "Solve the following task:"
2 | - "Find the answer below:"
3 | - "Complete the problem."
4 | - "Find the best solution to the question below:"
5 | - "Complete the question below:"
--------------------------------------------------------------------------------
/instructions/generic_instruction/anli.yaml:
--------------------------------------------------------------------------------
1 | - "Solve the following task:"
2 | - "Find the answer below:"
3 | - "Complete the problem."
4 | - "Find the best solution to the question below:"
5 | - "Complete the question below:"
--------------------------------------------------------------------------------
/instructions/generic_instruction/boolq.yaml:
--------------------------------------------------------------------------------
1 | - "Solve the following task:"
2 | - "Find the answer below:"
3 | - "Complete the problem."
4 | - "Find the best solution to the question below:"
5 | - "Complete the question below:"
--------------------------------------------------------------------------------
/instructions/generic_instruction/cosmos_qa.yaml:
--------------------------------------------------------------------------------
1 | - "Solve the following task:"
2 | - "Find the answer below:"
3 | - "Complete the problem."
4 | - "Find the best solution to the question below:"
5 | - "Complete the question below:"
--------------------------------------------------------------------------------
/instructions/generic_instruction/hellaswag.yaml:
--------------------------------------------------------------------------------
1 | - "Solve the following task:"
2 | - "Find the answer below:"
3 | - "Complete the problem."
4 | - "Find the best solution to the question below:"
5 | - "Complete the question below:"
--------------------------------------------------------------------------------
/instructions/generic_instruction/imdb.yaml:
--------------------------------------------------------------------------------
1 | - "Solve the following task:"
2 | - "Find the answer below:"
3 | - "Complete the problem."
4 | - "Find the best solution to the question below:"
5 | - "Complete the question below:"
--------------------------------------------------------------------------------
/instructions/generic_instruction/nq_open.yaml:
--------------------------------------------------------------------------------
1 | - "Solve the following task:"
2 | - "Find the answer below:"
3 | - "Complete the problem."
4 | - "Find the best solution to the question below:"
5 | - "Complete the question below:"
--------------------------------------------------------------------------------
/instructions/generic_instruction/trivia_qa.yaml:
--------------------------------------------------------------------------------
1 | - "Solve the following task:"
2 | - "Find the answer below:"
3 | - "Complete the problem."
4 | - "Find the best solution to the question below:"
5 | - "Complete the question below:"
--------------------------------------------------------------------------------
/instructions/generic_instruction/tweet_emotion.yaml:
--------------------------------------------------------------------------------
1 | - "Solve the following task:"
2 | - "Find the answer below:"
3 | - "Complete the problem."
4 | - "Find the best solution to the question below:"
5 | - "Complete the question below:"
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/bloom1b1/ag_news.yaml:
--------------------------------------------------------------------------------
1 | - 'Who do you think should read this article: a Politician, an Athlete, a Business
2 | Executive, or a Scientist?'
3 | - 'Who would you advise to read this article: a Politician, an Athlete, a Business
4 | leader, or a Scientist?'
5 | - Which section of a newspaper would be the best fit for this news article - World
6 | News, Sports, Business or Science and Technology?
7 | - Which part of a newspaper do you think this article belongs to? World News, Sports,
8 | Business or Science and Technology?
9 | - 'Which group would benefit from reading this article: a Politician, Athlete, Business
10 | Executive, or Scientist?'
11 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/bloom1b1/anli.yaml:
--------------------------------------------------------------------------------
1 | - Is it reasonable to assume that the hypothesis is true based on the given evidence?
2 | - Can we conclude that the hypothesis is true, false, or uncertain based on the available
3 | information?
4 | - Is it safe to say that the hypothesis is true based on the given data?
5 | - Is it reasonable to make the claim that the hypothesis is true based on the available
6 | evidence?
7 | - Is it reasonable to believe that the hypothesis is true based on the given evidence?
8 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/bloom1b1/boolq.yaml:
--------------------------------------------------------------------------------
1 | - Determine whether the answer to a question can be found in the passage, and if so,
2 | decide if the answer is true or false.
3 | - Check whether a question can be answered using the information in the passage, and
4 | if so, determine if the answer is true or false.
5 | - Assess whether the question can be answered using only the information in the passage,
6 | and if so, determine if the answer is true or false.
7 | - Verify if the answer to a question can be deduced from the passage, and determine
8 | if it is true or false.
9 | - Ascertain if a given question can be answered using the information presented in
10 | the passage, and if so, evaluate if the answer is true or false.
11 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/bloom1b1/cosmos_qa.yaml:
--------------------------------------------------------------------------------
1 | - Based on the information provided below, decide on the correct answer to the question.
2 | - Based on the information given below, determine the correct answer to the question.
3 | - Read the text below and select the appropriate answer to the question.
4 | - Carefully consider the given information and select the most appropriate answer to the question.
5 | - Your objective is to read the context and select the appropriate answer to the question.
6 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/bloom1b1/hellaswag.yaml:
--------------------------------------------------------------------------------
1 | - Complete the sentence with a suitable ending phrase.
2 | - What is the likely conclusion of this sentence?
3 | - Fill in the blank to finish the sentence.
4 | - Provide an appropriate ending to the sentence.
5 | - Fill in the missing words to conclude the sentence.
6 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/bloom1b1/imdb.yaml:
--------------------------------------------------------------------------------
1 | - What was the movie critic's reaction to the film?
2 | - Does the reviewer have a positive or negative view of the movie?
3 | - Does the movie review express a favorable or unfavorable sentiment toward the movie?
4 | - Was the movie reviewer's overall opinion of the movie favorable or unfavorable?
5 | - What is the movie critic's opinion of the movie?
6 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/bloom1b1/nq_open.yaml:
--------------------------------------------------------------------------------
1 | - Your task is to generate an English response for an English question, referring
2 | to English Wikipedia content.
3 | - The objective is to generate an English response for an English question by consulting
4 | English Wikipedia.
5 | - Your objective is to predict an English answer for an English query, using information
6 | sourced from English Wikipedia.
7 | - You are required to generate an English response for an English question, with reference
8 | to English Wikipedia.
9 | - The aim is to predict an English response based on an English query, using information
10 | from English Wikipedia.
11 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/bloom1b1/trivia_qa.yaml:
--------------------------------------------------------------------------------
1 | - There is a query that has been on my mind for some time now.
2 | - Please provide an answer to the question below.
3 | - I've had a long-standing question that needs an answer.
4 | - Provide a brief answer for the following questions.
5 | - Provide a brief response to the following questions.
6 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/bloom1b1/tweet_emotion.yaml:
--------------------------------------------------------------------------------
1 | - Among the emotions of anger, joy, optimism, and sadness, select the one that best
2 | describes the sentiment expressed in the tweet.
3 | - Among the emotions provided, anger, joy, optimism, or sadness, select the one that
4 | most closely matches the sentiment expressed in the tweet.
5 | - Which one of the emotions provided, anger, joy, optimism, or sadness, is the best
6 | fit for the sentiment conveyed in the tweet?
7 | - 'To receive full credit, choose the emotion that best fits the sentiment expressed
8 | in the tweet from the following options: anger, joy, optimism, or sadness.'
9 | - From the list of emotions provided, anger, joy, optimism, or sadness, select the
10 | one that accurately characterizes the author's emotions in the tweet.
11 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/bloom1b7/ag_news.yaml:
--------------------------------------------------------------------------------
1 | - 'Who do you think should read this article: a Politician, an Athlete, a Business
2 | Executive, or a Scientist?'
3 | - 'Who would you advise to read this article: a Politician, an Athlete, a Business
4 | leader, or a Scientist?'
5 | - Which part of a newspaper do you think this article belongs to? World News, Sports,
6 | Business or Science and Technology?
7 | - In which category of a newspaper is this article likely to be placed? World News,
8 | Sports, Business or Science and Technology?
9 | - In which part of a newspaper is this article likely to be published? World News,
10 | Sports, Business or Science and Technology?
11 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/bloom1b7/anli.yaml:
--------------------------------------------------------------------------------
1 | - Does the hypothesis logically follow from the premise?
2 | - Is the hypothesis a logical deduction from the premise?
3 | - Does the premise support the hypothesis?
4 | - Is it reasonable to assume that the hypothesis is true based on the given evidence?
5 | - Is it safe to say that the hypothesis is true based on the given data?
6 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/bloom1b7/boolq.yaml:
--------------------------------------------------------------------------------
1 | - Determine whether the answer to a question can be found in the passage, and if so,
2 | decide if the answer is true or false.
3 | - Assess whether the question can be answered using only the information in the passage,
4 | and if so, determine if the answer is true or false.
5 | - Check whether a question can be answered using the information in the passage, and
6 | if so, determine if the answer is true or false.
7 | - Determine whether a given question can be answered using information from the passage,
8 | and if so, determine if the answer is true or false.
9 | - Ascertain if a given question can be answered using the information presented in
10 | the passage, and if so, evaluate if the answer is true or false.
11 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/bloom1b7/cosmos_qa.yaml:
--------------------------------------------------------------------------------
1 | - Based on the information given below, determine the correct answer to the question.
2 | - Based on the information provided below, decide on the correct answer to the question.
3 | - Read the text below and select the appropriate answer to the question.
4 | - Carefully consider the information below and select the most accurate answer to the question.
5 | - Carefully consider the given information and select the most appropriate answer to the question.
6 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/bloom1b7/hellaswag.yaml:
--------------------------------------------------------------------------------
1 | - Fill in the missing words to conclude the sentence.
2 | - Provide an appropriate ending to the sentence.
3 | - Fill in the blank to finish the sentence.
4 | - Write the last few words to complete the sentence.
5 | - Find an appropriate way to end the sentence.
6 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/bloom1b7/imdb.yaml:
--------------------------------------------------------------------------------
1 | - Does the reviewer have a positive or negative view of the movie?
2 | - What was the movie critic's reaction to the film?
3 | - Does the movie review express a favorable or unfavorable sentiment toward the movie?
4 | - Did the reviewer have a positive or negative experience watching the movie?
5 | - Was the reviewer satisfied or dissatisfied with the movie?
6 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/bloom1b7/nq_open.yaml:
--------------------------------------------------------------------------------
1 | - Respond to the following prompt with an answer.
2 | - The objective is to generate an English response for an English question by consulting
3 | English Wikipedia.
4 | - Your task is to generate an English response for an English question, referring
5 | to English Wikipedia content.
6 | - Each question should be answered succinctly and precisely.
7 | - Provide a response to the following prompt.
8 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/bloom1b7/trivia_qa.yaml:
--------------------------------------------------------------------------------
1 | - There is a query that has been on my mind for some time now.
2 | - Please provide an answer to the question below.
3 | - Provide a brief answer for the following questions.
4 | - Provide a brief response to the following questions.
5 | - Answer the following question with an appropriate response.
6 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/bloom1b7/tweet_emotion.yaml:
--------------------------------------------------------------------------------
1 | - Among the emotions of anger, joy, optimism, and sadness, select the one that best
2 | describes the sentiment expressed in the tweet.
3 | - Among the emotions provided, anger, joy, optimism, or sadness, select the one that
4 | most closely matches the sentiment expressed in the tweet.
5 | - Which one of the emotions provided, anger, joy, optimism, or sadness, is the best
6 | fit for the sentiment conveyed in the tweet?
7 | - Indicate which of the emotions listed below, anger, joy, optimism, or sadness, is
8 | most appropriate for the tweet.
9 | - 'To receive full credit, choose the emotion that best fits the sentiment expressed
10 | in the tweet from the following options: anger, joy, optimism, or sadness.'
11 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/bloom3b/ag_news.yaml:
--------------------------------------------------------------------------------
1 | - Which section of a newspaper would be the best fit for this news article - World
2 | News, Sports, Business or Science and Technology?
3 | - 'Who do you think should read this article: a Politician, an Athlete, a Business
4 | Executive, or a Scientist?'
5 | - Which part of a newspaper do you think this article belongs to? World News, Sports,
6 | Business or Science and Technology?
7 | - In which category of a newspaper is this article likely to be placed? World News,
8 | Sports, Business or Science and Technology?
9 | - In which part of a newspaper is this article likely to be published? World News,
10 | Sports, Business or Science and Technology?
11 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/bloom3b/anli.yaml:
--------------------------------------------------------------------------------
1 | - Does the hypothesis logically follow from the premise?
2 | - Does the premise support the hypothesis?
3 | - Can we conclude that the hypothesis is true, false, or uncertain based on the available
4 | information?
5 | - Is it safe to say that the hypothesis is true based on the given data?
6 | - Is it reasonable to assume that the hypothesis is true based on the given evidence?
7 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/bloom3b/boolq.yaml:
--------------------------------------------------------------------------------
1 | - Assess whether the question can be answered using only the information in the passage,
2 | and if so, determine if the answer is true or false.
3 | - Determine whether the answer to a question can be found in the passage, and if so,
4 | decide if the answer is true or false.
5 | - Check whether a question can be answered using the information in the passage, and
6 | if so, determine if the answer is true or false.
7 | - Determine whether a given question can be answered using information from the passage,
8 | and if so, determine if the answer is true or false.
9 | - Ascertain if a given question can be answered using the information presented in
10 | the passage, and if so, evaluate if the answer is true or false.
11 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/bloom3b/cosmos_qa.yaml:
--------------------------------------------------------------------------------
1 | - Based on the information provided below, decide on the correct answer to the question.
2 | - Based on the information given below, determine the correct answer to the question.
3 | - Utilizing the information provided in the context, provide an answer to the question.
4 | - Carefully consider the given information and select the most appropriate answer to the question.
5 | - Read the text below and select the appropriate answer to the question.
6 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/bloom3b/hellaswag.yaml:
--------------------------------------------------------------------------------
1 | - Predict the final words of this sentence.
2 | - Fill in the blank to finish the sentence.
3 | - Fill in the blank with the correct ending.
4 | - Write the final part of the sentence.
5 | - Fill in the missing words to conclude the sentence.
6 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/bloom3b/imdb.yaml:
--------------------------------------------------------------------------------
1 | - Does the reviewer have a positive or negative view of the movie?
2 | - Was the movie reviewer's overall opinion of the movie favorable or unfavorable?
3 | - Did the reviewer have a positive or negative experience watching the movie?
4 | - Does the movie review express a favorable or unfavorable sentiment toward the movie?
5 | - Is the reviewer's evaluation of the movie favorable or unfavorable?
6 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/bloom3b/nq_open.yaml:
--------------------------------------------------------------------------------
1 | - Respond to the following prompt with an answer.
2 | - Each question should be answered succinctly and precisely.
3 | - The objective is to generate an English response for an English question by consulting
4 | English Wikipedia.
5 | - Provide a response to the following prompt.
6 | - You are required to generate an English response for an English question, with reference
7 | to English Wikipedia.
8 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/bloom3b/trivia_qa.yaml:
--------------------------------------------------------------------------------
1 | - There is a query that has been on my mind for some time now.
2 | - Please provide an answer to the question below.
3 | - Answer the following question with an appropriate response.
4 | - Provide a brief answer for the following questions.
5 | - Respond to the subsequent question with an appropriate answer.
6 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/bloom3b/tweet_emotion.yaml:
--------------------------------------------------------------------------------
1 | - Among the emotions of anger, joy, optimism, and sadness, select the one that best
2 | describes the sentiment expressed in the tweet.
3 | - Which one of the emotions provided, anger, joy, optimism, or sadness, is the best
4 | fit for the sentiment conveyed in the tweet?
5 | - Among the emotions provided, anger, joy, optimism, or sadness, select the one that
6 | most closely matches the sentiment expressed in the tweet.
7 | - 'To receive full credit, choose the emotion that best fits the sentiment expressed
8 | in the tweet from the following options: anger, joy, optimism, or sadness.'
9 | - From the list of emotions provided, anger, joy, optimism, or sadness, select the
10 | one that accurately characterizes the author's emotions in the tweet.
11 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/bloom7b1/ag_news.yaml:
--------------------------------------------------------------------------------
1 | - Which section of a newspaper would be the best fit for this news article - World
2 | News, Sports, Business or Science and Technology?
3 | - 'Who do you think should read this article: a Politician, an Athlete, a Business
4 | Executive, or a Scientist?'
5 | - Which part of a newspaper do you think this article belongs to? World News, Sports,
6 | Business or Science and Technology?
7 | - In which part of a newspaper is this article likely to be published? World News,
8 | Sports, Business or Science and Technology?
9 | - In which part of a newspaper would this article most likely be included? World News,
10 | Sports, Business, or Science and Technology?
11 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/bloom7b1/anli.yaml:
--------------------------------------------------------------------------------
1 | - Does the hypothesis logically follow from the premise?
2 | - Is it reasonable to assume that the hypothesis is true based on the given evidence?
3 | - Is it safe to say that the hypothesis is true based on the given data?
4 | - Is it reasonable to believe that the hypothesis is true based on the given evidence?
5 | - Does the premise support the hypothesis?
6 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/bloom7b1/boolq.yaml:
--------------------------------------------------------------------------------
1 | - Determine whether the answer to a question can be found in the passage, and if so,
2 | decide if the answer is true or false.
3 | - Assess whether the question can be answered using only the information in the passage,
4 | and if so, determine if the answer is true or false.
5 | - Determine whether a given question can be answered using information from the passage,
6 | and if so, determine if the answer is true or false.
7 | - Check whether a question can be answered using the information in the passage, and
8 | if so, determine if the answer is true or false.
9 | - Evaluate if a question can be answered using the information provided in the passage,
10 | and if so, determine if the answer is true or false.
11 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/bloom7b1/cosmos_qa.yaml:
--------------------------------------------------------------------------------
1 | - Based on the information given below, determine the correct answer to the question.
2 | - Based on the information provided below, decide on the correct answer to the question.
3 | - Read the text below and select the appropriate answer to the question.
4 | - Carefully read the following information and provide the answer to the question.
5 | - Carefully consider the given information and select the most appropriate answer to the question.
6 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/bloom7b1/hellaswag.yaml:
--------------------------------------------------------------------------------
1 | - Fill in the blank to finish the sentence.
2 | - Fill in the missing words to conclude the sentence.
3 | - Predict the final words of this sentence.
4 | - Fill in the blank with the correct ending.
5 | - Provide an appropriate ending to the sentence.
6 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/bloom7b1/imdb.yaml:
--------------------------------------------------------------------------------
1 | - Does the movie review express a favorable or unfavorable sentiment toward the movie?
2 | - Was the movie reviewer's overall opinion of the movie favorable or unfavorable?
3 | - Is the sentiment conveyed by the author of this movie review positive or negative?
4 | - Was the reviewer satisfied or dissatisfied with the movie?
5 | - Does the reviewer have a positive or negative view of the movie?
6 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/bloom7b1/nq_open.yaml:
--------------------------------------------------------------------------------
1 | - Your objective is to predict an English answer for an English query, using information
2 | sourced from English Wikipedia.
3 | - Each question should be answered succinctly and precisely.
4 | - Respond to the following prompt with an answer.
5 | - The aim is to predict an English response based on an English query, using information
6 | from English Wikipedia.
7 | - The objective is to generate an English response for an English question by consulting
8 | English Wikipedia.
9 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/bloom7b1/trivia_qa.yaml:
--------------------------------------------------------------------------------
1 | - There is a query that has been on my mind for some time now.
2 | - Provide a brief answer for the following questions.
3 | - Respond to the subsequent question with an appropriate answer.
4 | - Please provide an answer to the question below.
5 | - Answer the following question with an appropriate response.
6 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/bloom7b1/tweet_emotion.yaml:
--------------------------------------------------------------------------------
1 | - Among the emotions of anger, joy, optimism, and sadness, select the one that best
2 | describes the sentiment expressed in the tweet.
3 | - Among the emotions provided, anger, joy, optimism, or sadness, select the one that
4 | most closely matches the sentiment expressed in the tweet.
5 | - Which one of the emotions provided, anger, joy, optimism, or sadness, is the best
6 | fit for the sentiment conveyed in the tweet?
7 | - 'To receive full credit, choose the emotion that best fits the sentiment expressed
8 | in the tweet from the following options: anger, joy, optimism, or sadness.'
9 | - Identify the emotion that the tweet conveys from the given options of anger, joy,
10 | optimism, or sadness.
11 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/gptneo1b3/ag_news.yaml:
--------------------------------------------------------------------------------
1 | - 'Who do you think should read this article: a Politician, an Athlete, a Business
2 | Executive, or a Scientist?'
3 | - Which section of a newspaper would be the best fit for this news article - World
4 | News, Sports, Business or Science and Technology?
5 | - 'Who would you advise to read this article: a Politician, an Athlete, a Business
6 | leader, or a Scientist?'
7 | - Which part of a newspaper do you think this article belongs to? World News, Sports,
8 | Business or Science and Technology?
9 | - Does this news story pertain to World Politics, Sports, Business, or Science and
10 | Technology?
11 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/gptneo1b3/anli.yaml:
--------------------------------------------------------------------------------
1 | - Is it reasonable to assume that the hypothesis is true based on the given evidence?
2 | - Can we conclude that the hypothesis is true, false, or uncertain based on the available
3 | information?
4 | - Is the hypothesis supported by the available evidence?
5 | - Is it safe to say that the hypothesis is true based on the given data?
6 | - Is it reasonable to believe that the hypothesis is true based on the given evidence?
7 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/gptneo1b3/boolq.yaml:
--------------------------------------------------------------------------------
1 | - Determine whether the answer to a question can be found in the passage, and if so,
2 | decide if the answer is true or false.
3 | - Evaluate if a question can be answered using the information provided in the passage,
4 | and if so, determine if the answer is true or false.
5 | - Check whether a question can be answered using the information in the passage, and
6 | if so, determine if the answer is true or false.
7 | - Determine whether a given question can be answered using information from the passage,
8 | and if so, determine if the answer is true or false.
9 | - Assess whether the question can be answered using only the information in the passage,
10 | and if so, determine if the answer is true or false.
11 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/gptneo1b3/cosmos_qa.yaml:
--------------------------------------------------------------------------------
1 | - Carefully consider the given information and select the most appropriate answer to the question.
2 | - Carefully consider the information below and select the most accurate answer to the question.
3 | - Based on the information given below, determine the correct answer to the question.
4 | - Select the most appropriate response to the question based on the information provided below.
5 | - Based on the information provided below, decide on the correct answer to the question.
6 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/gptneo1b3/hellaswag.yaml:
--------------------------------------------------------------------------------
1 | - Fill in the blank to finish the sentence.
2 | - Write the last few words to complete the sentence.
3 | - Fill in the blank with the correct ending.
4 | - Fill in the missing words to conclude the sentence.
5 | - Write the final part of the sentence.
6 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/gptneo1b3/imdb.yaml:
--------------------------------------------------------------------------------
1 | - Does the reviewer have a positive or negative view of the movie?
2 | - What is the movie critic's opinion of the movie?
3 | - What is the reviewer's take on the movie?
4 | - How does the reviewer feel about the movie?
5 | - Did the reviewer have a positive or negative experience watching the movie?
6 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/gptneo1b3/nq_open.yaml:
--------------------------------------------------------------------------------
1 | - Provide a response to the following prompt.
2 | - Each question should be answered succinctly and precisely.
3 | - Please answer the following question.
4 | - Each question must be answered with a concise reply.
5 | - Please provide a response to the following inquiry.
6 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/gptneo1b3/trivia_qa.yaml:
--------------------------------------------------------------------------------
1 | - There is a query that has been on my mind for some time now.
2 | - Please provide an answer to the question below.
3 | - Provide a brief response to the following questions.
4 | - Provide a brief answer for the following questions.
5 | - I've had a long-standing question that needs an answer.
6 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/gptneo1b3/tweet_emotion.yaml:
--------------------------------------------------------------------------------
1 | - Among the emotions of anger, joy, optimism, and sadness, select the one that best
2 | describes the sentiment expressed in the tweet.
3 | - Among the emotions provided, anger, joy, optimism, or sadness, select the one that
4 | most closely matches the sentiment expressed in the tweet.
5 | - 'To score the maximum marks, categorize the tweet into one of the following emotions:
6 | anger, joy, optimism, or sadness.'
7 | - 'To get a perfect score, select the correct emotion from the following options:
8 | anger, joy, optimism, or sadness.'
9 | - 'To receive full credit, choose the emotion that best fits the sentiment expressed
10 | in the tweet from the following options: anger, joy, optimism, or sadness.'
11 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/gptneo2b7/ag_news.yaml:
--------------------------------------------------------------------------------
1 | - Which section of a newspaper would be the best fit for this news article - World
2 | News, Sports, Business or Science and Technology?
3 | - 'Who do you think should read this article: a Politician, an Athlete, a Business
4 | Executive, or a Scientist?'
5 | - Which part of a newspaper do you think this article belongs to? World News, Sports,
6 | Business or Science and Technology?
7 | - 'Who would you advise to read this article: a Politician, an Athlete, a Business
8 | leader, or a Scientist?'
9 | - In which category of a newspaper is this article likely to be placed? World News,
10 | Sports, Business or Science and Technology?
11 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/gptneo2b7/anli.yaml:
--------------------------------------------------------------------------------
1 | - Is it reasonable to assume that the hypothesis is true based on the given evidence?
2 | - Is it reasonable to believe that the hypothesis is true based on the given evidence?
3 | - Does the hypothesis logically follow from the premise?
4 | - Can we conclude that the hypothesis is true, false, or uncertain based on the available
5 | information?
6 | - Is it reasonable to make the claim that the hypothesis is true based on the available
7 | evidence?
8 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/gptneo2b7/boolq.yaml:
--------------------------------------------------------------------------------
1 | - Determine whether the answer to a question can be found in the passage, and if so,
2 | decide if the answer is true or false.
3 | - Evaluate if a question can be answered using the information provided in the passage,
4 | and if so, determine if the answer is true or false.
5 | - Assess whether the question can be answered using only the information in the passage,
6 | and if so, determine if the answer is true or false.
7 | - Check whether a question can be answered using the information in the passage, and
8 | if so, determine if the answer is true or false.
9 | - Determine whether a given question can be answered using information from the passage,
10 | and if so, determine if the answer is true or false.
11 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/gptneo2b7/cosmos_qa.yaml:
--------------------------------------------------------------------------------
1 | - Carefully consider the given information and select the most appropriate answer to the question.
2 | - Based on the information given below, determine the correct answer to the question.
3 | - Carefully consider the information below and select the most accurate answer to the question.
4 | - Based on the information provided below, decide on the correct answer to the question.
5 | - Utilizing the information provided in the context, provide an answer to the question.
6 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/gptneo2b7/hellaswag.yaml:
--------------------------------------------------------------------------------
1 | - Fill in the blank to finish the sentence.
2 | - Fill in the missing words to conclude the sentence.
3 | - Fill in the blank with the correct ending.
4 | - Provide an appropriate ending to the sentence.
5 | - Write the last few words to complete the sentence.
6 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/gptneo2b7/imdb.yaml:
--------------------------------------------------------------------------------
1 | - What is the sentiment expressed in this movie review?
2 | - Does the reviewer have a positive or negative view of the movie?
3 | - Did the reviewer have a positive or negative experience watching the movie?
4 | - Does the movie review express a favorable or unfavorable sentiment toward the movie?
5 | - Was the reviewer's evaluation of the movie positive or negative?
6 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/gptneo2b7/nq_open.yaml:
--------------------------------------------------------------------------------
1 | - Respond to the following prompt with an answer.
2 | - Reply to the following prompt with an answer.
3 | - Provide a response to the following prompt.
4 | - Please provide a response to the following inquiry.
5 | - Answer the following prompt with a reply.
6 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/gptneo2b7/trivia_qa.yaml:
--------------------------------------------------------------------------------
1 | - There is a query that has been on my mind for some time now.
2 | - Answer the following question with an appropriate response.
3 | - Please provide an answer to the question below.
4 | - I've had a long-standing question that needs an answer.
5 | - Respond to the following question with a brief expression.
6 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/gptneo2b7/tweet_emotion.yaml:
--------------------------------------------------------------------------------
1 | - 'To receive full credit, choose the emotion that best fits the sentiment expressed
2 | in the tweet from the following options: anger, joy, optimism, or sadness.'
3 | - Among the emotions of anger, joy, optimism, and sadness, select the one that best
4 | describes the sentiment expressed in the tweet.
5 | - Among the emotions provided, anger, joy, optimism, or sadness, select the one that
6 | most closely matches the sentiment expressed in the tweet.
7 | - Indicate which of the emotions listed below, anger, joy, optimism, or sadness, is
8 | most appropriate for the tweet.
9 | - 'To score the maximum marks, categorize the tweet into one of the following emotions:
10 | anger, joy, optimism, or sadness.'
11 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/gptneox20b/ag_news.yaml:
--------------------------------------------------------------------------------
1 | - Which section of a newspaper would be the best fit for this news article - World
2 | News, Sports, Business or Science and Technology?
3 | - In which category of a newspaper is this article likely to be placed? World News,
4 | Sports, Business or Science and Technology?
5 | - In which part of a newspaper would this article most likely be included? World News,
6 | Sports, Business, or Science and Technology?
7 | - Which part of a newspaper do you think this article belongs to? World News, Sports,
8 | Business or Science and Technology?
9 | - 'Who do you think should read this article: a Politician, an Athlete, a Business
10 | Executive, or a Scientist?'
11 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/gptneox20b/anli.yaml:
--------------------------------------------------------------------------------
1 | - Does the hypothesis logically follow from the premise?
2 | - Based on the given information, can we affirm the hypothesis as correct, incorrect,
3 | or undetermined?
4 | - Does the hypothesis follow from the given premise?
5 | - Is it safe to say that the hypothesis is true based on the given data?
6 | - Does the hypothesis make sense in light of the given data?
7 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/gptneox20b/boolq.yaml:
--------------------------------------------------------------------------------
1 | - Determine whether the answer to a question can be found in the passage, and if so,
2 | decide if the answer is true or false.
3 | - Evaluate if a question can be answered using the information provided in the passage,
4 | and if so, determine if the answer is true or false.
5 | - Assess whether the question can be answered using only the information in the passage,
6 | and if so, determine if the answer is true or false.
7 | - Check whether a question can be answered using the information in the passage, and
8 | if so, determine if the answer is true or false.
9 | - Check if the answer to a question is supported by the information in the passage,
10 | and determine if it is true or false.
11 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/gptneox20b/cosmos_qa.yaml:
--------------------------------------------------------------------------------
1 | - Based on the information given below, determine the correct answer to the question.
2 | - Carefully consider the given information and select the most appropriate answer to the question.
3 | - Based on the information provided below, decide on the correct answer to the question.
4 | - Utilizing the information provided in the context, provide an answer to the question.
5 | - Read the text below and select the appropriate answer to the question.
6 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/gptneox20b/hellaswag.yaml:
--------------------------------------------------------------------------------
1 | - Fill in the blank with the correct ending.
2 | - Fill in the blank to finish the sentence.
3 | - Fill in the missing words to conclude the sentence.
4 | - Predict the final words of this sentence.
5 | - Predict the ending for this sentence.
6 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/gptneox20b/imdb.yaml:
--------------------------------------------------------------------------------
1 | - Does the reviewer have a positive or negative view of the movie?
2 | - Does the movie review express a favorable or unfavorable sentiment toward the movie?
3 | - Did the movie critic have a good or bad impression of the movie?
4 | - Was the movie reviewer's overall opinion of the movie favorable or unfavorable?
5 | - Was the reviewer's evaluation of the movie positive or negative?
6 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/gptneox20b/nq_open.yaml:
--------------------------------------------------------------------------------
1 | - Provide a response to the following prompt.
2 | - Please answer the following question.
3 | - Each question should be answered succinctly and precisely.
4 | - Respond to the following prompt with an answer.
5 | - Please provide a response to the following inquiry.
6 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/gptneox20b/trivia_qa.yaml:
--------------------------------------------------------------------------------
1 | - Please provide an answer to the question below.
2 | - There is a query that has been on my mind for some time now.
3 | - Answer the following question with an appropriate response.
4 | - Provide a brief response to the following questions.
5 | - Provide a brief answer for the following questions.
6 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/gptneox20b/tweet_emotion.yaml:
--------------------------------------------------------------------------------
1 | - Among the emotions of anger, joy, optimism, and sadness, select the one that best
2 | describes the sentiment expressed in the tweet.
3 | - Among the emotions provided, anger, joy, optimism, or sadness, select the one that
4 | most closely matches the sentiment expressed in the tweet.
5 | - 'To receive full credit, choose the emotion that best fits the sentiment expressed
6 | in the tweet from the following options: anger, joy, optimism, or sadness.'
7 | - 'To score the maximum marks, categorize the tweet into one of the following emotions:
8 | anger, joy, optimism, or sadness.'
9 | - Indicate which of the emotions listed below, anger, joy, optimism, or sadness, is
10 | most appropriate for the tweet.
11 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/llama13b/ag_news.yaml:
--------------------------------------------------------------------------------
1 | - In which part of a newspaper is this article likely to be published? World News,
2 | Sports, Business or Science and Technology?
3 | - Which section of a newspaper would be the best fit for this news article - World
4 | News, Sports, Business or Science and Technology?
5 | - 'Who do you think should read this article: a Politician, an Athlete, a Business
6 | Executive, or a Scientist?'
7 | - In which category of a newspaper is this article likely to be placed? World News,
8 | Sports, Business or Science and Technology?
9 | - In which part of a newspaper would this article most likely be included? World News,
10 | Sports, Business, or Science and Technology?
11 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/llama13b/anli.yaml:
--------------------------------------------------------------------------------
1 | - Does the premise support the hypothesis?
2 | - Does the hypothesis logically follow from the premise?
3 | - Is the hypothesis a logical deduction from the premise?
4 | - Does the hypothesis follow from the given premise?
5 | - Is the hypothesis consistent with the given information?
6 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/llama13b/boolq.yaml:
--------------------------------------------------------------------------------
1 | - Evaluate whether a statement is true or false based on the information provided
2 | in the passage.
3 | - Evaluate if a question can be answered using the information provided in the passage,
4 | and if so, determine if the answer is true or false.
5 | - Determine if a statement can be verified as true or false based on the information
6 | in the passage.
7 | - Decide if a true/false question can be answered using only the information provided
8 | in the passage.
9 | - Determine if the answer to a true/false question can be found in the passage and
10 | evaluate its truthfulness.
11 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/llama13b/cosmos_qa.yaml:
--------------------------------------------------------------------------------
1 | - Select the most appropriate response to the question based on the information provided below.
2 | - Based on the information given below, determine the correct answer to the question.
3 | - Based on the information provided below, decide on the correct answer to the question.
4 | - Your task is to read the context and determine the answer to the question.
5 | - Read the text below and select the appropriate answer to the question.
6 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/llama13b/hellaswag.yaml:
--------------------------------------------------------------------------------
1 | - Fill in the blank to finish the sentence.
2 | - Fill in the blank with the correct ending.
3 | - What is the likely conclusion of this sentence?
4 | - Fill in the missing words to conclude the sentence.
5 | - Add the final words to complete the sentence.
6 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/llama13b/imdb.yaml:
--------------------------------------------------------------------------------
1 | - Was the movie reviewer's overall opinion of the movie favorable or unfavorable?
2 | - Does the reviewer have a positive or negative view of the movie?
3 | - Is the reviewer's evaluation of the movie favorable or unfavorable?
4 | - Does the movie review express a favorable or unfavorable sentiment toward the movie?
5 | - What is the reviewer's take on the movie?
6 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/llama13b/nq_open.yaml:
--------------------------------------------------------------------------------
1 | - Please provide a response to the following inquiry.
2 | - Please answer the following question.
3 | - Each question should be answered succinctly and precisely.
4 | - Your objective is to predict an English answer for an English query, using information
5 | sourced from English Wikipedia.
6 | - Please reply to the following question.
7 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/llama13b/trivia_qa.yaml:
--------------------------------------------------------------------------------
1 | - Please provide an answer to the question below.
2 | - Answer the following question with an appropriate response.
3 | - There is a query that has been on my mind for some time now.
4 | - Please provide a response to the following inquiry.
5 | - I've had a long-standing question that needs an answer.
6 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/llama13b/tweet_emotion.yaml:
--------------------------------------------------------------------------------
1 | - 'To receive full credit, choose the emotion that best fits the sentiment expressed
2 | in the tweet from the following options: anger, joy, optimism, or sadness.'
3 | - Which one of the emotions provided, anger, joy, optimism, or sadness, is the best
4 | fit for the sentiment conveyed in the tweet?
5 | - From the list of emotions provided, anger, joy, optimism, or sadness, select the
6 | one that accurately characterizes the author's emotions in the tweet.
7 | - Among the emotions of anger, joy, optimism, and sadness, select the one that best
8 | describes the sentiment expressed in the tweet.
9 | - Among the emotions of anger, joy, optimism, and sadness, pick the one that best
10 | suits the author's emotions in the tweet.
11 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/llama7b/ag_news.yaml:
--------------------------------------------------------------------------------
1 | - In which part of a newspaper is this article likely to be published? World News,
2 | Sports, Business or Science and Technology?
3 | - 'Who do you think should read this article: a Politician, an Athlete, a Business
4 | Executive, or a Scientist?'
5 | - Which section of a newspaper would be the best fit for this news article - World
6 | News, Sports, Business or Science and Technology?
7 | - Which part of a newspaper do you think this article belongs to? World News, Sports,
8 | Business or Science and Technology?
9 | - In which category of a newspaper is this article likely to be placed? World News,
10 | Sports, Business or Science and Technology?
11 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/llama7b/anli.yaml:
--------------------------------------------------------------------------------
1 | - Does the hypothesis logically follow from the premise?
2 | - Does the premise support the hypothesis?
3 | - Is the hypothesis a logical deduction from the premise?
4 | - Can we conclude that the hypothesis is true, false, or uncertain based on the available
5 | information?
6 | - Does the hypothesis follow from the given premise?
7 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/llama7b/boolq.yaml:
--------------------------------------------------------------------------------
1 | - Determine whether the answer to a question can be found in the passage, and if so,
2 | decide if the answer is true or false.
3 | - Determine whether a given question can be answered using information from the passage,
4 | and if so, determine if the answer is true or false.
5 | - Assess whether the question can be answered using only the information in the passage,
6 | and if so, determine if the answer is true or false.
7 | - Evaluate if a question can be answered using the information provided in the passage,
8 | and if so, determine if the answer is true or false.
9 | - Evaluate whether a statement is true or false based on the information provided
10 | in the passage.
11 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/llama7b/cosmos_qa.yaml:
--------------------------------------------------------------------------------
1 | - Select the most appropriate response to the question based on the information provided below.
2 | - Based on the information given below, determine the correct answer to the question.
3 | - Based on the information provided below, decide on the correct answer to the question.
4 | - Your task is to read the context and determine the answer to the question.
5 | - Examine the provided context and choose the best answer to the question.
6 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/llama7b/hellaswag.yaml:
--------------------------------------------------------------------------------
1 | - Fill in the blank to finish the sentence.
2 | - Fill in the blank with the correct ending.
3 | - Fill in the missing words to conclude the sentence.
4 | - Provide an appropriate ending to the sentence.
5 | - Add the ending that brings the sentence to a close.
6 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/llama7b/imdb.yaml:
--------------------------------------------------------------------------------
1 | - Does the movie review express a favorable or unfavorable sentiment toward the movie?
2 | - Does the reviewer have a positive or negative view of the movie?
3 | - Was the movie reviewer's overall opinion of the movie favorable or unfavorable?
4 | - Is the reviewer's evaluation of the movie favorable or unfavorable?
5 | - What is the reviewer's emotional response to the movie?
6 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/llama7b/nq_open.yaml:
--------------------------------------------------------------------------------
1 | - Each question should be answered succinctly and precisely.
2 | - Please provide a response to the following inquiry.
3 | - Each question should be answered succinctly.
4 | - Provide a response to the following prompt.
5 | - Each question must be answered with a concise reply.
6 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/llama7b/trivia_qa.yaml:
--------------------------------------------------------------------------------
1 | - There is a query that has been on my mind for some time now.
2 | - Please provide an answer to the question below.
3 | - Answer the following question with an appropriate response.
4 | - Provide a brief response to the following questions.
5 | - Provide a brief answer for the following questions.
6 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/llama7b/tweet_emotion.yaml:
--------------------------------------------------------------------------------
1 | - Which one of the emotions provided, anger, joy, optimism, or sadness, is the best
2 | fit for the sentiment conveyed in the tweet?
3 | - 'To receive full credit, choose the emotion that best fits the sentiment expressed
4 | in the tweet from the following options: anger, joy, optimism, or sadness.'
5 | - Identify the emotion that the tweet conveys from the given options of anger, joy,
6 | optimism, or sadness.
7 | - From the list of emotions provided, anger, joy, optimism, or sadness, select the
8 | one that accurately characterizes the author's emotions in the tweet.
9 | - 'Identify the emotion best suited to describe the content of the following tweet:
10 | anger, joy, optimism, or sadness?'
11 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/opt13b/ag_news.yaml:
--------------------------------------------------------------------------------
1 | - 'Who do you think should read this article: a Politician, an Athlete, a Business
2 | Executive, or a Scientist?'
3 | - In which part of a newspaper is this article likely to be published? World News,
4 | Sports, Business or Science and Technology?
5 | - In which category of a newspaper is this article likely to be placed? World News,
6 | Sports, Business or Science and Technology?
7 | - Which section of a newspaper would be the best fit for this news article - World
8 | News, Sports, Business or Science and Technology?
9 | - Which part of a newspaper do you think this article belongs to? World News, Sports,
10 | Business or Science and Technology?
11 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/opt13b/anli.yaml:
--------------------------------------------------------------------------------
1 | - Is it reasonable to believe that the hypothesis is true based on the given evidence?
2 | - Is it reasonable to make the claim that the hypothesis is true based on the available
3 | evidence?
4 | - Is it reasonable to assume that the hypothesis is true based on the given evidence?
5 | - Can we infer that the hypothesis is true from the information provided?
6 | - Is it safe to say that the hypothesis is true based on the given data?
7 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/opt13b/boolq.yaml:
--------------------------------------------------------------------------------
1 | - Evaluate if a question can be answered using the information provided in the passage,
2 | and if so, determine if the answer is true or false.
3 | - Determine whether the answer to a question can be found in the passage, and if so,
4 | decide if the answer is true or false.
5 | - Assess whether the question can be answered using only the information in the passage,
6 | and if so, determine if the answer is true or false.
7 | - Determine whether a given question can be answered using information from the passage,
8 | and if so, determine if the answer is true or false.
9 | - Evaluate whether a statement is true or false based on the information provided
10 | in the passage.
11 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/opt13b/cosmos_qa.yaml:
--------------------------------------------------------------------------------
1 | - Evaluate the context below and choose the answer that best fits the question.
2 | - Based on the information given below, determine the correct answer to the question.
3 | - Read the text below and select the appropriate answer to the question.
4 | - Select the most appropriate response to the question based on the information provided below.
5 | - Utilizing the information provided in the context, provide an answer to the question.
6 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/opt13b/hellaswag.yaml:
--------------------------------------------------------------------------------
1 | - Fill in the missing words to conclude the sentence.
2 | - Fill in the blank with the correct ending.
3 | - Fill in the blank to finish the sentence.
4 | - Write the last few words to complete the sentence.
5 | - What is the likely conclusion of this sentence?
6 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/opt13b/imdb.yaml:
--------------------------------------------------------------------------------
1 | - Does the reviewer have a positive or negative view of the movie?
2 | - Did the reviewer have a positive or negative experience watching the movie?
3 | - Would you classify this review as positive or negative?
4 | - Is the reviewer's opinion of the movie positive or negative?
5 | - What is the reviewer's take on the movie?
6 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/opt13b/nq_open.yaml:
--------------------------------------------------------------------------------
1 | - Please answer the following question.
2 | - Respond to the following prompt with an answer.
3 | - Please provide a response to the following inquiry.
4 | - Each question must be answered with a concise reply.
5 | - Each question should be answered succinctly and precisely.
6 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/opt13b/trivia_qa.yaml:
--------------------------------------------------------------------------------
1 | - There is a query that has been on my mind for some time now.
2 | - Please provide an answer to the question below.
3 | - Answer the following question with an appropriate response.
4 | - I've had a long-standing question that needs an answer.
5 | - Provide a brief response to the following questions.
6 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/opt13b/tweet_emotion.yaml:
--------------------------------------------------------------------------------
1 | - 'To receive full credit, choose the emotion that best fits the sentiment expressed
2 | in the tweet from the following options: anger, joy, optimism, or sadness.'
3 | - Among the emotions of anger, joy, optimism, and sadness, select the one that best
4 | describes the sentiment expressed in the tweet.
5 | - 'To get a perfect score, select the correct emotion from the following options:
6 | anger, joy, optimism, or sadness.'
7 | - Which one of the emotions provided, anger, joy, optimism, or sadness, is the best
8 | fit for the sentiment conveyed in the tweet?
9 | - 'Identify the emotion best suited to describe the content of the following tweet:
10 | anger, joy, optimism, or sadness?'
11 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/opt1b3/ag_news.yaml:
--------------------------------------------------------------------------------
1 | - Which part of a newspaper do you think this article belongs to? World News, Sports,
2 | Business or Science and Technology?
3 | - 'Who do you think should read this article: a Politician, an Athlete, a Business
4 | Executive, or a Scientist?'
5 | - Which section of a newspaper would be the best fit for this news article - World
6 | News, Sports, Business or Science and Technology?
7 | - In which category of a newspaper is this article likely to be placed? World News,
8 | Sports, Business or Science and Technology?
9 | - 'Who would you advise to read this article: a Politician, an Athlete, a Business
10 | leader, or a Scientist?'
11 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/opt1b3/anli.yaml:
--------------------------------------------------------------------------------
1 | - Is it reasonable to make the claim that the hypothesis is true based on the available
2 | evidence?
3 | - Is it reasonable to assume that the hypothesis is true based on the given evidence?
4 | - Is it reasonable to believe that the hypothesis is true based on the given evidence?
5 | - Is the hypothesis supported by the available evidence?
6 | - Is it safe to say that the hypothesis is true based on the given data?
7 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/opt1b3/boolq.yaml:
--------------------------------------------------------------------------------
1 | - Determine whether the answer to a question can be found in the passage, and if so,
2 | decide if the answer is true or false.
3 | - Evaluate if a question can be answered using the information provided in the passage,
4 | and if so, determine if the answer is true or false.
5 | - Assess whether the question can be answered using only the information in the passage,
6 | and if so, determine if the answer is true or false.
7 | - Determine whether a given question can be answered using information from the passage,
8 | and if so, determine if the answer is true or false.
9 | - Ascertain if a given question can be answered using the information presented in
10 | the passage, and if so, evaluate if the answer is true or false.
11 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/opt1b3/cosmos_qa.yaml:
--------------------------------------------------------------------------------
1 | - Select the most appropriate response to the question based on the information provided below.
2 | - Familiarize yourself with the provided information and provide the answer to the question.
3 | - Based on the information provided below, decide on the correct answer to the question.
4 | - Based on the information given below, determine the correct answer to the question.
5 | - Evaluate the context below and choose the answer that best fits the question.
6 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/opt1b3/hellaswag.yaml:
--------------------------------------------------------------------------------
1 | - Fill in the blank to finish the sentence.
2 | - Fill in the blank with the correct ending.
3 | - Fill in the missing words to conclude the sentence.
4 | - Find an appropriate way to end the sentence.
5 | - Your objective is to create the concluding part of the sentence.
6 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/opt1b3/imdb.yaml:
--------------------------------------------------------------------------------
1 | - Does the reviewer have a positive or negative view of the movie?
2 | - Did the reviewer have a positive or negative experience watching the movie?
3 | - How did the reviewer feel about the movie?
4 | - Did the movie critic have a good or bad impression of the movie?
5 | - What is the reviewer's take on the movie?
6 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/opt1b3/nq_open.yaml:
--------------------------------------------------------------------------------
1 | - Respond to the following prompt with an answer.
2 | - Please answer the following question.
3 | - Please reply to the following question.
4 | - Each question must be answered with a concise reply.
5 | - Please provide a response to the following inquiry.
6 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/opt1b3/trivia_qa.yaml:
--------------------------------------------------------------------------------
1 | - There is a query that has been on my mind for some time now.
2 | - Please provide an answer to the question below.
3 | - I've had a long-standing question that needs an answer.
4 | - Provide a brief response to the following questions.
5 | - Respond to the subsequent question with an appropriate answer.
6 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/opt1b3/tweet_emotion.yaml:
--------------------------------------------------------------------------------
1 | - 'To receive full credit, choose the emotion that best fits the sentiment expressed
2 | in the tweet from the following options: anger, joy, optimism, or sadness.'
3 | - 'Categorize the tweet into one of the following emotions to receive full marks:
4 | anger, joy, optimism, or sadness.'
5 | - 'To get a perfect score, select the correct emotion from the following options:
6 | anger, joy, optimism, or sadness.'
7 | - 'Categorize the tweet into one of the following emotions: anger, joy, optimism,
8 | or sadness, to achieve maximum marks.'
9 | - 'To score the maximum marks, categorize the tweet into one of the following emotions:
10 | anger, joy, optimism, or sadness.'
11 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/opt2b7/ag_news.yaml:
--------------------------------------------------------------------------------
1 | - 'Who do you think should read this article: a Politician, an Athlete, a Business
2 | Executive, or a Scientist?'
3 | - Which section of a newspaper would be the best fit for this news article - World
4 | News, Sports, Business or Science and Technology?
5 | - Which part of a newspaper do you think this article belongs to? World News, Sports,
6 | Business or Science and Technology?
7 | - In which category of a newspaper is this article likely to be placed? World News,
8 | Sports, Business or Science and Technology?
9 | - 'Who would you advise to read this article: a Politician, an Athlete, a Business
10 | leader, or a Scientist?'
11 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/opt2b7/anli.yaml:
--------------------------------------------------------------------------------
1 | - Is it reasonable to assume that the hypothesis is true based on the given evidence?
2 | - Is it reasonable to make the claim that the hypothesis is true based on the available
3 | evidence?
4 | - Is it reasonable to believe that the hypothesis is true based on the given evidence?
5 | - Is it safe to say that the hypothesis is true based on the given data?
6 | - Can we conclude that the hypothesis is true, false, or uncertain based on the available
7 | information?
8 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/opt2b7/boolq.yaml:
--------------------------------------------------------------------------------
1 | - Determine whether the answer to a question can be found in the passage, and if so,
2 | decide if the answer is true or false.
3 | - Assess whether the question can be answered using only the information in the passage,
4 | and if so, determine if the answer is true or false.
5 | - Evaluate if a question can be answered using the information provided in the passage,
6 | and if so, determine if the answer is true or false.
7 | - Ascertain if a given question can be answered using the information presented in
8 | the passage, and if so, evaluate if the answer is true or false.
9 | - Determine whether a given question can be answered using information from the passage,
10 | and if so, determine if the answer is true or false.
11 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/opt2b7/cosmos_qa.yaml:
--------------------------------------------------------------------------------
1 | - Select the most appropriate response to the question based on the information provided below.
2 | - Based on the information given below, determine the correct answer to the question.
3 | - Read the text below and select the appropriate answer to the question.
4 | - Familiarize yourself with the provided information and provide the answer to the question.
5 | - Based on the information provided below, decide on the correct answer to the question.
6 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/opt2b7/hellaswag.yaml:
--------------------------------------------------------------------------------
1 | - Fill in the blank with the correct ending.
2 | - Fill in the blank to finish the sentence.
3 | - Fill in the missing words to conclude the sentence.
4 | - Predict the final words of this sentence.
5 | - What is the likely conclusion of this sentence?
6 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/opt2b7/imdb.yaml:
--------------------------------------------------------------------------------
1 | - Does the reviewer have a positive or negative view of the movie?
2 | - Did the reviewer have a positive or negative experience watching the movie?
3 | - Is the reviewer's opinion of the movie positive or negative?
4 | - Is the sentiment conveyed by the author of this movie review positive or negative?
5 | - Would you classify this review as positive or negative?
6 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/opt2b7/nq_open.yaml:
--------------------------------------------------------------------------------
1 | - Respond to the following prompt with an answer.
2 | - Please answer the following question.
3 | - Please provide a response to the following inquiry.
4 | - Provide a response to the following prompt.
5 | - Please reply to the following question.
6 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/opt2b7/trivia_qa.yaml:
--------------------------------------------------------------------------------
1 | - There is a query that has been on my mind for some time now.
2 | - Please provide an answer to the question below.
3 | - I've had a long-standing question that needs an answer.
4 | - Provide a brief response to the following questions.
5 | - Answer the following question with an appropriate response.
6 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/opt2b7/tweet_emotion.yaml:
--------------------------------------------------------------------------------
1 | - 'To receive full credit, choose the emotion that best fits the sentiment expressed
2 | in the tweet from the following options: anger, joy, optimism, or sadness.'
3 | - Among the emotions of anger, joy, optimism, and sadness, select the one that best
4 | describes the sentiment expressed in the tweet.
5 | - Among the emotions provided, anger, joy, optimism, or sadness, select the one that
6 | most closely matches the sentiment expressed in the tweet.
7 | - 'Identify the emotion best suited to describe the content of the following tweet:
8 | anger, joy, optimism, or sadness?'
9 | - Which one of the emotions provided, anger, joy, optimism, or sadness, is the best
10 | fit for the sentiment conveyed in the tweet?
11 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/opt6b7/ag_news.yaml:
--------------------------------------------------------------------------------
1 | - Which section of a newspaper would be the best fit for this news article - World
2 | News, Sports, Business or Science and Technology?
3 | - 'Who do you think should read this article: a Politician, an Athlete, a Business
4 | Executive, or a Scientist?'
5 | - In which part of a newspaper is this article likely to be published? World News,
6 | Sports, Business or Science and Technology?
7 | - Which part of a newspaper do you think this article belongs to? World News, Sports,
8 | Business or Science and Technology?
9 | - In which category of a newspaper is this article likely to be placed? World News,
10 | Sports, Business or Science and Technology?
11 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/opt6b7/anli.yaml:
--------------------------------------------------------------------------------
1 | - Is it safe to say that the hypothesis is true based on the given data?
2 | - Is it reasonable to assume that the hypothesis is true based on the given evidence?
3 | - Is it reasonable to make the claim that the hypothesis is true based on the available
4 | evidence?
5 | - Is it reasonable to believe that the hypothesis is true based on the given evidence?
6 | - Does the hypothesis logically follow from the premise?
7 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/opt6b7/boolq.yaml:
--------------------------------------------------------------------------------
1 | - Determine whether the answer to a question can be found in the passage, and if so,
2 | decide if the answer is true or false.
3 | - Evaluate if a question can be answered using the information provided in the passage,
4 | and if so, determine if the answer is true or false.
5 | - Determine whether a given question can be answered using information from the passage,
6 | and if so, determine if the answer is true or false.
7 | - Assess whether the question can be answered using only the information in the passage,
8 | and if so, determine if the answer is true or false.
9 | - Use the information provided in the passage to determine whether a true/false question
10 | can be answered with a true or false response.
11 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/opt6b7/cosmos_qa.yaml:
--------------------------------------------------------------------------------
1 | - Based on the information given below, determine the correct answer to the question.
2 | - Read the text below and select the appropriate answer to the question.
3 | - Based on the information provided below, decide on the correct answer to the question.
4 | - Select the most appropriate response to the question based on the information provided below.
5 | - Your task is to read the context and determine the answer to the question.
6 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/opt6b7/hellaswag.yaml:
--------------------------------------------------------------------------------
1 | - Fill in the blank to finish the sentence.
2 | - Fill in the missing words to conclude the sentence.
3 | - Fill in the blank with the correct ending.
4 | - Write the last few words to complete the sentence.
5 | - Find an appropriate way to end the sentence.
6 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/opt6b7/imdb.yaml:
--------------------------------------------------------------------------------
1 | - Is the sentiment conveyed by the author of this movie review positive or negative?
2 | - What is the sentiment expressed in this movie review?
3 | - Does the reviewer have a positive or negative view of the movie?
4 | - Did the movie critic have a good or bad impression of the movie?
5 | - Did the reviewer have a positive or negative experience watching the movie?
6 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/opt6b7/nq_open.yaml:
--------------------------------------------------------------------------------
1 | - Each question should be answered succinctly and precisely.
2 | - Please answer the following question.
3 | - Provide a response to the following prompt.
4 | - Respond to the following prompt with an answer.
5 | - Each question must be answered with a concise reply.
6 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/opt6b7/trivia_qa.yaml:
--------------------------------------------------------------------------------
1 | - There is a query that has been on my mind for some time now.
2 | - Please provide an answer to the question below.
3 | - I've had a long-standing question that needs an answer.
4 | - Answer the following question with an appropriate response.
5 | - Respond to the following question with a brief expression.
6 |
--------------------------------------------------------------------------------
/instructions/low_perplexity_prompts/opt6b7/tweet_emotion.yaml:
--------------------------------------------------------------------------------
1 | - 'To receive full credit, choose the emotion that best fits the sentiment expressed
2 | in the tweet from the following options: anger, joy, optimism, or sadness.'
3 | - 'To score the maximum marks, categorize the tweet into one of the following emotions:
4 | anger, joy, optimism, or sadness.'
5 | - 'Identify the emotion best suited to describe the content of the following tweet:
6 | anger, joy, optimism, or sadness?'
7 | - Among the emotions provided, anger, joy, optimism, or sadness, select the one that
8 | most closely matches the sentiment expressed in the tweet.
9 | - Which one of the emotions provided, anger, joy, optimism, or sadness, is the best
10 | fit for the sentiment conveyed in the tweet?
11 |
--------------------------------------------------------------------------------
/instructions/manual/ag_news.yaml:
--------------------------------------------------------------------------------
1 | - "What label best describes this news article?"
2 | - "Is this a piece of news regarding World Politics, Sports, Business, or Science and Technology?"
3 | - "Would you recommend the following article to a Politician, an Athlete, a Business executive or a Scientist?"
4 | - "Which of the following sections of a newspaper would this article likely appear in? World News, Sports, Business or Science and Technology"
5 | - "Which section of a newspaper would this article likely appear in?"
--------------------------------------------------------------------------------
/instructions/manual/anli.yaml:
--------------------------------------------------------------------------------
1 | - "Using only the description and what you know about the world, is the hypothesis definitely correct, incorrect, or inconclusive?"
2 | - "Given the premise should we assume that the hypothesis is true?"
3 | - "Given the premise, does the hypothesis follow?"
4 | - "Based on the passage, is the hypothesis true?"
5 | - "Are we justified in saying the hypothesis?"
--------------------------------------------------------------------------------
/instructions/manual/boolq.yaml:
--------------------------------------------------------------------------------
1 | - "Using only the passage and a question, is the answer true or false?"
2 | - "Given the passage, is the question true or false?"
3 | - "Using the given passage, answer the following true/false question."
4 | - "Based on the passage, is the answer true or false?"
5 | - "Using the passage, are we justified in saying the answer is true or false?"
--------------------------------------------------------------------------------
/instructions/manual/cosmos_qa.yaml:
--------------------------------------------------------------------------------
1 | - "Read the following context and answer the question."
2 | - "According to the below context, answer the following question."
3 | - "According to the below context, choose the best answer to the following question."
4 | - "Pick the best answer:"
5 | - "Based on the context, answer the question."
--------------------------------------------------------------------------------
/instructions/manual/hellaswag.yaml:
--------------------------------------------------------------------------------
1 | - "Complete the description with an appropriate ending:"
2 | - "Complete the sentence:"
3 | - "The task is to generate the ending for the sentence:"
4 | - "How does this sentence end?"
5 | - "How does the description likely end?"
--------------------------------------------------------------------------------
/instructions/manual/imdb.yaml:
--------------------------------------------------------------------------------
1 | - "The following movie review expresses what sentiment?"
2 | - "Did the reviewer find this movie positive or negative?"
3 | - "Is this review positive or negative?"
4 | - "How does this viewer feel about the movie?"
5 | - "What sentiment does the writer express for the movie?"
--------------------------------------------------------------------------------
/instructions/manual/nq_open.yaml:
--------------------------------------------------------------------------------
1 | - "The goal is to predict an English answer string for an input English question. All questions can be answered using the contents of English Wikipedia."
2 | - "I've always wondered:"
3 | - "Answer the following question."
4 | - "For the following questions, predict a short-form answer."
5 | - "Answer each question with a brief expression"
--------------------------------------------------------------------------------
/instructions/manual/trivia_qa.yaml:
--------------------------------------------------------------------------------
1 | - "The goal is to predict an English answer string for an input English question."
2 | - "Answer the following question."
3 | - "I've always wondered:"
4 | - "For the following questions, predict a short-form answer."
5 | - "Answer each question with a brief expression"
--------------------------------------------------------------------------------
/instructions/manual/tweet_emotion.yaml:
--------------------------------------------------------------------------------
1 | - "To get full credit in this exam, choose the correct emotion from the following choices: anger, joy, optimism, sadness"
2 | - "Which emotion among anger, joy, optimism, sadness best describes the feeling of the author of the following tweet?"
3 | - "Which emotion is best represented by the following tweet?"
4 | - "Categorize the tweet into one of the following options: anger, joy, optimism, sadness"
5 | - "What is the emotion of the text? Hint: anger, joy, optimism, sadness"
--------------------------------------------------------------------------------
/instructions/no_instruction/ag_news.yaml:
--------------------------------------------------------------------------------
1 | - ""
--------------------------------------------------------------------------------
/instructions/no_instruction/anli.yaml:
--------------------------------------------------------------------------------
1 | - ""
--------------------------------------------------------------------------------
/instructions/no_instruction/boolq.yaml:
--------------------------------------------------------------------------------
1 | - ""
--------------------------------------------------------------------------------
/instructions/no_instruction/cosmos_qa.yaml:
--------------------------------------------------------------------------------
1 | - ""
--------------------------------------------------------------------------------
/instructions/no_instruction/hellaswag.yaml:
--------------------------------------------------------------------------------
1 | - ""
--------------------------------------------------------------------------------
/instructions/no_instruction/imdb.yaml:
--------------------------------------------------------------------------------
1 | - ""
--------------------------------------------------------------------------------
/instructions/no_instruction/nq_open.yaml:
--------------------------------------------------------------------------------
1 | - ""
--------------------------------------------------------------------------------
/instructions/no_instruction/trivia_qa.yaml:
--------------------------------------------------------------------------------
1 | - ""
--------------------------------------------------------------------------------
/instructions/no_instruction/tweet_emotion.yaml:
--------------------------------------------------------------------------------
1 | - ""
--------------------------------------------------------------------------------
/instructions/rlprompt/ag_news.yaml:
--------------------------------------------------------------------------------
1 | - Tools undergradCam firmwareCam
2 | - Platformchool Tech VoterInstallation
3 | - Parser undergradTeen Arduino puberty
4 | - GitHub Transgender classroom Tech feedback
5 | - FlashBugAlphaBug undergrad
--------------------------------------------------------------------------------
/instructions/rlprompt/anli.yaml:
--------------------------------------------------------------------------------
1 | - TermEnterFixed relaxing Maybe
2 | - Definition Maybe({ Suppose Imagine
3 | - "TokensTokens gmaxwell_>_>"
4 | - acky Maybe hilarious Apparently Collider
5 | - DriverAbility Perhaps Coinbase guiActiveUn
--------------------------------------------------------------------------------
/instructions/rlprompt/boolq.yaml:
--------------------------------------------------------------------------------
1 | - "CTR Gaza ``( Reef pesticide"
2 | - Enabled Guam VPN recognise Canterbury
3 | - DevelopmentZenProgress EntityItem embodies
4 | - Assistant Honour Honour Honour Honour
5 | - enchantment Xavier Nether specialization Gohan
--------------------------------------------------------------------------------
/instructions/rlprompt/cosmos_qa.yaml:
--------------------------------------------------------------------------------
1 | - ConnectionSensor Participant Objective Petition
2 | - reassuredreportedCNN therapistsreported
3 | - Germany recognisedVIDEO Bundesliga recognised
4 | - NameSpirit recognise dreams charism
5 | - Dog adoptingKim AutismKim
--------------------------------------------------------------------------------
/instructions/rlprompt/hellaswag.yaml:
--------------------------------------------------------------------------------
1 | - mascara antioxidants closet billionaires pores
2 | - ThemeSoundSexualildo FIG
3 | - Parameters Supported Playoff Parameters Playoff
4 | - AccessoryKidsEnjoy rinkPIN
5 | - Sapphire benchmarksSilver SnapdragonISO
--------------------------------------------------------------------------------
/instructions/rlprompt/imdb.yaml:
--------------------------------------------------------------------------------
1 | - UniversalWorkingFilm alleges headlined
2 | - Certification Certification Applicantintendent Received
3 | - DriverRatedRatedRatedRated
4 | - Education Referred inclusive Locatedcedented
5 | - Statement told Direction stressed assured
--------------------------------------------------------------------------------
/instructions/rlprompt/tweet_emotion.yaml:
--------------------------------------------------------------------------------
1 | - FlashActivStopMotion foreclosure
2 | - sequence expr motions confronting namely
3 | - AttackBeginAnswer("{\\
4 | - "Expression Citation verb={ invoking"
5 | - Context [\' echo {" represents
--------------------------------------------------------------------------------
/metrics/__init__.py:
--------------------------------------------------------------------------------
1 | from metrics.metric import Metric
2 | from metrics.zero_shot_accuracy import ZeroShotAccuracyMetric
3 | from metrics.few_shot_accuracy import FewShotAccuracyMetric
4 | from metrics.perturbational_accuracy import PerturbationalAccuracyMetric
5 | from metrics.selectional_sensitivity import SelectionalSensitivityMetric
6 | from metrics.permutational_sensitivity import PermutationalSensitivityMetric
--------------------------------------------------------------------------------
/metrics/few_shot_accuracy.py:
--------------------------------------------------------------------------------
1 | import statistics
2 | from typing import Any, List, Tuple, Dict
3 |
4 | import datasets
5 | from tqdm import tqdm
6 |
7 | from data.dataset import Dataset
8 | from decoders.decoder import Decoder
9 | from metrics.metric import Metric
10 | from models.base import BaseModel
11 | from templates.few_shot_template import FewShotTemplate
12 |
13 |
14 | class FewShotAccuracyMetric(Metric):
15 |
16 | def __init__(
17 | self,
18 | model: BaseModel,
19 | dataset: Dataset,
20 | template: FewShotTemplate,
21 | decoder: Decoder,
22 | num_demonstrations: int,
23 | num_combinations: int,
24 | num_test_instances: int,
25 | ):
26 | """
27 | Metric for evaluating few-shot accuracy.
28 |
29 | model: model to evaluate.
30 | dataset: dataset to evaluate on.
31 | template: template to use for generating prompts.
32 | decoder: decoder to use for decoding.
33 | num_demonstrations: K for K-shot learning.
34 | num_combinations: number of combinations of K-shot learning to try.
35 | num_test_instances: number of test instances to evaluate on.
36 | """
37 |
38 | super().__init__(model, dataset, template, decoder)
39 | self.num_demonstrations = num_demonstrations
40 | self.num_combinations = num_combinations
41 | self.num_test_instances = num_test_instances
42 |
43 | def create_inputs(self) -> Tuple[List[List[Dict[str, Any]]], List[Dict[str, Any]]]:
44 | # create inputs for calculating few-shot accuracy
45 |
46 | demonstrations_list = []
47 | for seed in range(self.num_combinations):
48 | demonstration_instances = self.dataset.sample_instances("train", self.num_demonstrations, seed=seed)
49 | demonstrations_list.append(demonstration_instances)
50 |
51 | test_instances = self.dataset.sample_instances("test", self.num_test_instances)
52 | return (demonstrations_list, test_instances)
53 |
54 | def evaluate(
55 | self,
56 | inputs: Tuple[List[List[Dict[str, Any]]], List[Dict[str, Any]]]
57 | ) -> Dict[str, Any]:
58 |
59 | # unpack inputs
60 | demonstrations_list, test_instances = inputs
61 |
62 | # remove labels from test instances
63 | test_instances_no_label = datasets.Dataset.from_list(test_instances).remove_columns([self.dataset.label_key])
64 | test_instance_labels = [test_instance[self.dataset.label_key] for test_instance in test_instances]
65 |
66 | # compute accuracy for each combination of demonstrations
67 | accuracies = []
68 | for demonstrations in tqdm(demonstrations_list):
69 | predicted_outputs = [
70 | output["prediction"]
71 | for output in self.decoder.decode(
72 | self.model,
73 | demonstrations,
74 | test_instances_no_label,
75 | )
76 | ]
77 |
 78 |             # Correctness is judged with quasi-exact match (self.eq_metric)
79 | correctness_indicators = [
80 | self.eq_metric(predicted_output, gt_output)
81 | for gt_output, predicted_output in zip(
82 | test_instance_labels, predicted_outputs
83 | )
84 | ]
85 |
86 | # compute accuracy
87 | accuracies.append(sum(correctness_indicators) / len(correctness_indicators))
88 |
89 | # return mean few-shot accuracy, and all few-shot accuracies
90 | return {
91 | "few_shot_accuracy": statistics.mean(accuracies),
92 | "all_few_shot_accuracies": accuracies
93 | }
94 |
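
A minimal usage sketch for this metric. It assumes `model`, `dataset`, `template`, and `decoder` have already been built from the corresponding modules; the argument values are illustrative, not the defaults shipped under configs/metric/.

```python
# Sketch only: `model`, `dataset`, `template`, and `decoder` are assumed to be
# pre-built BaseModel / Dataset / FewShotTemplate / Decoder instances.
from metrics import FewShotAccuracyMetric

metric = FewShotAccuracyMetric(
    model=model,
    dataset=dataset,
    template=template,
    decoder=decoder,
    num_demonstrations=4,    # K in K-shot
    num_combinations=5,      # how many demonstration sets to sample
    num_test_instances=100,  # size of the evaluation sample
)
inputs = metric.create_inputs()   # (demonstration sets, test instances)
results = metric.evaluate(inputs)
print(results["few_shot_accuracy"], results["all_few_shot_accuracies"])
```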
--------------------------------------------------------------------------------
/metrics/metric.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Dict
2 |
3 | from data.dataset import Dataset
4 | from decoders.decoder import Decoder
5 | from metrics.utils import quasi_exact_match
6 | from models.base import BaseModel
7 | from templates.few_shot_template import FewShotTemplate
8 |
9 |
10 | class Metric:
11 |
12 | def __init__(
13 | self,
14 | model: BaseModel,
15 | dataset: Dataset,
16 | template: FewShotTemplate,
17 | decoder: Decoder,
18 | ):
19 |
20 | """
21 | Parent class for all metrics.
22 |
23 | model: model to evaluate.
24 | dataset: dataset to evaluate on.
25 | template: template to use for generating prompts.
26 | decoder: decoder to use for decoding.
27 | """
28 |
29 | self.model = model
30 | self.dataset = dataset
31 | self.template = template
32 | self.decoder = decoder
33 | self.eq_metric = quasi_exact_match
34 |
35 | def create_inputs(self) -> Any:
36 | # has to be implemented by child classes
37 | raise NotImplementedError
38 |
39 | def evaluate(self, inputs: Any) -> Dict:
40 | # has to be implemented by child classes
41 | raise NotImplementedError
42 |
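
Concrete metrics subclass Metric and implement the create_inputs / evaluate pair. Below is a hypothetical minimal subclass, sketched only to illustrate that contract; it is not part of the repository, and the sample_instances / decode call patterns are assumed from the sibling metrics in this package.

```python
from typing import Any, Dict, List

import datasets

from metrics.metric import Metric


class ZeroShotSketchMetric(Metric):
    """Illustrative zero-shot accuracy sketch; not a repository class."""

    def create_inputs(self) -> List[Dict[str, Any]]:
        # sample a fixed-size test set (signature assumed from sibling metrics)
        return self.dataset.sample_instances("test", 100)

    def evaluate(self, inputs: List[Dict[str, Any]]) -> Dict[str, Any]:
        labels = [ex[self.dataset.label_key] for ex in inputs]
        # strip labels before prompting, mirroring the other metrics
        no_label = datasets.Dataset.from_list(inputs).remove_columns([self.dataset.label_key])
        predictions = [
            out["prediction"]
            for out in self.decoder.decode(self.model, [], no_label)  # no demonstrations
        ]
        correct = [self.eq_metric(pred, gt) for pred, gt in zip(predictions, labels)]
        return {"accuracy": sum(correct) / len(correct)}
```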
--------------------------------------------------------------------------------
/metrics/permutational_sensitivity.py:
--------------------------------------------------------------------------------
1 | import copy
2 | import random
3 | import statistics
4 | from typing import Any, List, Tuple, Dict
5 |
6 | import datasets
7 | from tqdm import tqdm
8 |
9 | from data import Dataset
10 | from decoders.decoder import Decoder
11 | from metrics.metric import Metric
12 | from models import BaseModel
13 | from templates import FewShotTemplate
14 |
15 |
16 | class PermutationalSensitivityMetric(Metric):
17 | def __init__(
18 | self,
19 | model: BaseModel,
20 | dataset: Dataset,
21 | template: FewShotTemplate,
22 | decoder: Decoder,
23 | num_demonstrations: int,
24 | num_combinations: int,
25 | num_permutations: int,
26 | num_test_instances: int,
27 | ):
28 | """
29 | Metric for evaluating permutational sensitivity.
30 |
31 | model: model to evaluate.
32 | dataset: dataset to evaluate on.
33 | template: template to use for generating prompts.
34 | decoder: decoder to use for decoding.
35 | num_demonstrations: K for K-shot learning.
36 | num_combinations: number of combinations of K-shot learning to try.
37 | num_permutations: number of permutations to try for each combination.
38 | num_test_instances: number of test instances to evaluate on.
39 | """
40 |
41 | super().__init__(model, dataset, template, decoder)
42 | self.num_demonstrations = num_demonstrations
43 | self.num_combinations = num_combinations
44 | self.num_permutations = num_permutations
45 | self.num_test_instances = num_test_instances
46 |
47 | def create_inputs(self) -> Tuple[List[List[List[Dict[str, Any]]]], List[Dict[str, Any]]]:
 48 |         # create inputs for calculating permutational sensitivity
49 |
50 | combinations_list = []
51 | # for each combination of demonstrations
52 | for seed in range(self.num_combinations):
53 | demonstration_instances = self.dataset.sample_instances("train", self.num_demonstrations, seed=seed)
54 | permutations = []
55 | # create num_permutations permutations
56 | for _ in range(self.num_permutations):
57 | random.shuffle(demonstration_instances)
58 | permutations.append(copy.deepcopy(demonstration_instances))
59 | combinations_list.append(permutations)
60 |
61 | # sample test instances
62 | test_instances = self.dataset.sample_instances(split="test", sample_size=self.num_test_instances)
63 | return (combinations_list, # list of combinations; each combination is a list of permutations; each permutation is a list of demonstrations
64 | test_instances)
65 |
66 | def evaluate(
67 | self,
68 | inputs: Tuple[List[List[List[Dict[str, Any]]]], List[Dict[str, Any]]]
69 | ) -> Dict[str, Any]:
70 |
71 | # unpack inputs
72 | combinations_list, test_instances = inputs
73 |
74 | # remove labels from test instances
75 | test_instances_no_label = datasets.Dataset.from_list(test_instances).remove_columns([self.dataset.label_key])
76 | test_instance_labels = [test_instance[self.dataset.label_key] for test_instance in test_instances]
77 |
78 | # evaluate each combination of demonstrations
79 | permutation_stdevs = []
80 | for permutations_list in tqdm(combinations_list):
81 | permutationwise_accuracies = []
82 |             # evaluate each permutation on the same sampled test instances
83 | for demonstrations_list in permutations_list:
84 | predicted_outputs = [
85 | output["prediction"]
86 | for output in self.decoder.decode(
87 | self.model,
88 | demonstrations_list,
89 | test_instances_no_label,
90 | )
91 | ]
92 |             # This metric uses quasi-exact match for correctness (see Metric.__init__)
93 | correctness_indicators = [
94 | self.eq_metric(predicted_output, gt_output)
95 | for gt_output, predicted_output in zip(
96 | test_instance_labels, predicted_outputs
97 | )
98 | ]
99 | # calculate accuracy for this permutation
100 | permutationwise_accuracies.append(statistics.mean(correctness_indicators))
101 | permutation_stdevs.append(statistics.stdev(permutationwise_accuracies))
102 |
103 | # return mean permutational stdev, and all permutational stdevs
104 | return {
105 | "permutational_sensitivity": statistics.mean(permutation_stdevs),
106 | "all_permutational_stdevs": permutation_stdevs
107 | }
108 |
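To make the reported number concrete, a small worked example with made-up accuracies: evaluate takes the standard deviation of accuracy across permutations within each demonstration combination, then averages those standard deviations over combinations.

    import statistics

    # hypothetical per-permutation accuracies for two demonstration combinations
    accuracies_by_combination = [
        [0.62, 0.58, 0.70],  # combination 0, three permutations
        [0.55, 0.61, 0.59],  # combination 1, three permutations
    ]
    permutation_stdevs = [statistics.stdev(a) for a in accuracies_by_combination]
    print(statistics.mean(permutation_stdevs))  # ~0.046, the permutational_sensitivity value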
--------------------------------------------------------------------------------
/metrics/perturbational_accuracy.py:
--------------------------------------------------------------------------------
1 | from typing import List, Dict, Tuple, Any
2 | import copy
3 | import statistics
4 | from random import Random
5 |
6 | import datasets
7 | from helm.benchmark.augmentations.mild_mix_perturbation import MildMixPerturbation
8 | from tqdm import tqdm
9 |
10 | from data import Dataset
11 | from decoders import Decoder
12 | from metrics.metric import Metric
13 | from models.base import BaseModel
14 | from templates.few_shot_template import FewShotTemplate
15 |
16 |
17 | class PerturbationalAccuracyMetric(Metric):
18 | """From Holistic Evaluation of Language Models
19 | Credit to: https://github.com/stanford-crfm/helm
20 | """
21 |
22 | def __init__(
23 | self,
24 | model: BaseModel,
25 | dataset: Dataset,
26 | template: FewShotTemplate,
27 | decoder: Decoder,
28 | num_demonstrations: int,
29 | num_combinations: int,
30 | num_test_instances: int,
31 | seed: int = 0
32 | ):
33 | """
34 | Metric for evaluating few-shot perturbation accuracy.
35 |
36 | model: model to evaluate.
37 | dataset: dataset to evaluate on.
38 | template: template to use for generating prompts.
39 | decoder: decoder to use for decoding.
40 | num_demonstrations: K for K-shot learning.
41 | num_combinations: number of combinations of K-shot learning to try.
42 | num_test_instances: number of test instances to evaluate on.
43 | """
44 |
45 | super().__init__(model, dataset, template, decoder)
46 | self.num_demonstrations = num_demonstrations
47 | self.num_combinations = num_combinations
48 | self.num_test_instances = num_test_instances
49 |
50 | # initialize random number generator
51 | self.rng = Random()
52 | self.rng.seed(seed)
53 |
54 | # initialize HELM perturbation object
55 | self.mild_mix_perturbation = MildMixPerturbation()
56 |
57 | def _apply_perturbation(self, example: Dict[str, Any]) -> Dict[str, Any]:
58 | # apply perturbation to all text fields in an example
59 |
60 | example_copy = copy.deepcopy(example)
61 | for text_key in self.dataset.text_keys:
62 | example_copy[text_key] = self.mild_mix_perturbation.perturb(
63 | example_copy[text_key], self.rng
64 | )
65 |
66 | return example_copy
67 |
68 | def create_inputs(self) -> Tuple[List[List[Dict[str, Any]]], List[Dict[str, Any]]]:
69 | # create inputs for calculating perturbation accuracy
70 |
71 | demonstrations_list = []
72 | for seed in range(self.num_combinations):
73 | demonstration_instances = self.dataset.sample_instances("train", self.num_demonstrations, seed=seed)
74 | demonstrations_list.append(demonstration_instances)
75 |
76 | test_instances = self.dataset.sample_instances("test", self.num_test_instances)
77 | return (demonstrations_list, test_instances)
78 |
79 | def evaluate(
80 | self,
81 | inputs: Tuple[List[List[Dict[str, Any]]], List[Dict[str, Any]]]
82 | ) -> Dict[str, Any]:
83 |
84 | # unpack inputs
85 | demonstrations_list, test_instances = inputs
86 |
87 | # remove labels from test instances
88 | test_instances_no_label = datasets.Dataset.from_list(test_instances).remove_columns([self.dataset.label_key])
89 | test_instance_labels = [test_instance[self.dataset.label_key] for test_instance in test_instances]
90 |
91 | # apply perturbation to all text fields in test instances
92 | test_instances_perturbed_no_label = test_instances_no_label.map(self._apply_perturbation)
93 |
94 | # evaluate each combination of demonstrations on perturbed and unperturbed test instances
95 | accuracies_unperturbed = []
96 | accuracies_perturbed = []
97 | for demonstrations in tqdm(demonstrations_list):
98 | predicted_outputs_unperturbed = [
99 | output["prediction"]
100 | for output in self.decoder.decode(
101 | self.model,
102 | demonstrations,
103 | test_instances_no_label,
104 | )
105 | ]
106 | predicted_outputs_perturbed = [
107 | output["prediction"]
108 | for output in self.decoder.decode(
109 | self.model,
110 | demonstrations,
111 | test_instances_perturbed_no_label,
112 | )
113 | ]
114 |
115 |             # This metric uses quasi-exact match for correctness (see Metric.__init__)
116 | correctness_indicators_unperturbed = [
117 | self.eq_metric(predicted_output, gt_output)
118 | for gt_output, predicted_output in zip(
119 | test_instance_labels, predicted_outputs_unperturbed
120 | )
121 | ]
122 | correctness_indicators_perturbed = [
123 | self.eq_metric(predicted_output, gt_output)
124 | for gt_output, predicted_output in zip(
125 | test_instance_labels, predicted_outputs_perturbed
126 | )
127 | ]
128 |
129 | # compute accuracy
130 | accuracies_unperturbed.append(statistics.mean(correctness_indicators_unperturbed))
131 | accuracies_perturbed.append(statistics.mean(correctness_indicators_perturbed))
132 |
133 | # compute accuracy statistics
134 | mean_accuracy_unperturbed = statistics.mean(accuracies_unperturbed)
135 | mean_accuracy_perturbed = statistics.mean(accuracies_perturbed)
136 | mean_accuracy_drop = mean_accuracy_unperturbed - mean_accuracy_perturbed
137 |
138 | # return accuracies
139 | return {
140 | "unperturbed_accuracy": mean_accuracy_unperturbed,
141 | "perturbed_accuracy": mean_accuracy_perturbed,
142 | "perturbation_drop_in_accuracy": mean_accuracy_drop,
143 | "all_unperturbed_accuracies": accuracies_unperturbed,
144 | "all_perturbed_accuracies": accuracies_perturbed
145 | }
146 |
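For reference, the HELM perturbation can be exercised on its own. A minimal sketch, assuming crfm-helm is installed as pinned in requirements.txt; the exact perturbed string depends on the random seed.

    from random import Random

    from helm.benchmark.augmentations.mild_mix_perturbation import MildMixPerturbation

    perturbation = MildMixPerturbation()
    rng = Random(0)
    # lightly corrupts the text (casing, typos, contractions) while keeping its meaning
    print(perturbation.perturb("The movie was surprisingly good, and I would watch it again.", rng))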
--------------------------------------------------------------------------------
/metrics/selectional_sensitivity.py:
--------------------------------------------------------------------------------
1 | import copy
2 | import statistics
3 | from typing import Any, List, Tuple, Dict
4 |
5 | import datasets
6 | from tqdm import tqdm
7 |
8 | from data.dataset import Dataset
9 | from decoders.decoder import Decoder
10 | from metrics.metric import Metric
11 | from models.base import BaseModel
12 | from templates.few_shot_template import FewShotTemplate
13 |
14 |
15 | class SelectionalSensitivityMetric(Metric):
16 | def __init__(
17 | self,
18 | model: BaseModel,
19 | dataset: Dataset,
20 | template: FewShotTemplate,
21 | decoder: Decoder,
22 | num_demonstrations: int,
23 | num_combinations: int,
24 | num_test_instances: int,
25 | ):
26 |
27 | """
28 | Metric for evaluating selectional sensitivity.
29 |
30 | model: model to evaluate.
31 | dataset: dataset to evaluate on.
32 | template: template to use for generating prompts.
33 | decoder: decoder to use for decoding.
34 | num_demonstrations: K for K-shot learning.
35 | num_combinations: number of combinations of K-shot learning to try.
36 | num_test_instances: number of test instances to evaluate on.
37 | """
38 |
39 | super().__init__(model, dataset, template, decoder)
40 | self.num_demonstrations = num_demonstrations
41 | self.num_combinations = num_combinations
42 | self.num_test_instances = num_test_instances
43 |
44 | def create_inputs(self) -> Tuple[List[List[Dict[str, Any]]], List[Dict[str, Any]]]:
45 | # create inputs for calculating selectional sensitivity
46 |
47 | demonstrations_list = []
48 | for seed in range(self.num_combinations):
49 | demonstration_instances = self.dataset.sample_instances("train", self.num_demonstrations, seed=seed)
50 | demonstrations_list.append(demonstration_instances)
51 |
52 | test_instances = self.dataset.sample_instances("test", self.num_test_instances)
53 | return (demonstrations_list, test_instances)
54 |
55 | def evaluate(
56 | self,
57 | inputs: Tuple[List[List[Dict[str, Any]]], List[Dict[str, Any]]]
58 | ) -> Dict[str, Any]:
59 |
60 | # unpack inputs
61 | demonstrations_list, test_instances = inputs
62 |
63 | # remove labels from test instances
64 | test_instances_no_label = datasets.Dataset.from_list(test_instances).remove_columns([self.dataset.label_key])
65 | test_instance_labels = [test_instance[self.dataset.label_key] for test_instance in test_instances]
66 |
67 | # evaluate on each combination of demonstrations
68 | accuracies = []
69 | for demonstrations in tqdm(demonstrations_list):
70 | predicted_outputs = [
71 | output["prediction"]
72 | for output in self.decoder.decode(
73 | self.model,
74 | demonstrations,
75 | test_instances_no_label,
76 | )
77 | ]
78 |
79 |             # This metric uses quasi-exact match for correctness (see Metric.__init__)
80 | correctness_indicators = [
81 | self.eq_metric(predicted_output, gt_output)
82 | for gt_output, predicted_output in zip(
83 | test_instance_labels, predicted_outputs
84 | )
85 | ]
86 |
87 | # compute accuracy
88 | accuracies.append(statistics.mean(correctness_indicators))
89 |
90 |         # return the standard deviation of accuracy across demonstration selections, and all accuracies
91 | return {
92 | "selectional_sensitivity": statistics.stdev(accuracies),
93 | "all_selectional_accuracies": accuracies,
94 | }
95 |
--------------------------------------------------------------------------------
/metrics/utils.py:
--------------------------------------------------------------------------------
1 | import re
2 | import string
3 | from typing import Any, Union, List, Dict
4 |
5 |
6 | def exact_match_stripped(pred: Any, ground_truth: Union[Any, List[Any], Dict[Any, Any]]) -> bool:
7 | if isinstance(ground_truth, dict):
8 | # hotfix for trivia_qa
9 | return exact_match_stripped(pred, ground_truth["aliases"])
10 | if isinstance(ground_truth, list):
11 | return any(
12 | exact_match_stripped(pred, ground_truth_single)
13 | for ground_truth_single in ground_truth
14 | )
15 | else:
16 | return str(pred).strip() == str(ground_truth).strip()
17 |
18 |
19 | def exact_match(pred: Any, ground_truth: Union[Any, List[Any], Dict[Any, Any]]) -> bool:
20 | if isinstance(ground_truth, dict):
21 | # hotfix for trivia_qa
22 | return exact_match(pred, ground_truth["aliases"])
23 | if isinstance(ground_truth, list):
24 | return any(
25 | exact_match(pred, ground_truth_single)
26 | for ground_truth_single in ground_truth
27 | )
28 | else:
29 | return pred == ground_truth
30 |
31 |
32 | def _normalize_text(text: str) -> str:
33 | """Lower text and remove punctuation, articles and extra whitespace.
34 | Copied from the [QuAC](http://quac.ai/) evaluation script found at
35 | https://s3.amazonaws.com/my89public/quac/scorer.py"""
36 |
37 | def remove_articles(text: str) -> str:
38 | return re.sub(r"\b(a|an|the)\b", " ", text)
39 |
40 | def white_space_fix(text: str) -> str:
41 | return " ".join(text.split())
42 |
43 | def remove_punc(text: str) -> str:
44 | exclude = set(string.punctuation)
45 | return "".join(ch for ch in text if ch not in exclude)
46 |
47 | def lower(text: str) -> str:
48 | return text.lower()
49 |
50 | return white_space_fix(remove_articles(remove_punc(lower(text)))).strip()
51 |
52 |
53 | def quasi_exact_match(
54 |     pred: Any, ground_truth: Union[Any, List[Any], Dict[Any, Any]]
55 | ) -> bool:
56 | """From CRFM HELM
57 | https://github.com/stanford-crfm/helm/blob/main/src/helm/benchmark/metrics/basic_metrics.py
58 | """
59 | if isinstance(ground_truth, dict):
60 | # hotfix for trivia_qa
61 | return quasi_exact_match(pred, ground_truth["aliases"])
62 | if isinstance(ground_truth, list):
63 | return any(
64 | quasi_exact_match(pred, ground_truth_single)
65 | for ground_truth_single in ground_truth
66 | )
67 | else:
68 | return _normalize_text(str(ground_truth)) == _normalize_text(str(pred))
69 |
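A few illustrative calls (inputs invented for this example) showing how normalization makes quasi_exact_match more forgiving than exact_match:

    from metrics.utils import exact_match, quasi_exact_match

    print(exact_match("The Eiffel Tower", "eiffel tower"))         # False
    print(quasi_exact_match("The Eiffel Tower", "eiffel tower"))   # True: case, articles, punctuation stripped
    print(quasi_exact_match("Paris", ["paris", "Paris, France"]))  # True: matching any reference suffices
    print(quasi_exact_match("Paris", {"aliases": ["paris"]}))      # True: trivia_qa-style dicts use "aliases"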
--------------------------------------------------------------------------------
/metrics/zero_shot_accuracy.py:
--------------------------------------------------------------------------------
1 | import statistics
2 | from typing import Any, List, Dict
3 |
4 | import datasets
5 | from tqdm import tqdm
6 |
7 | from data.dataset import Dataset
8 | from decoders.decoder import Decoder
9 | from metrics.metric import Metric
10 | from models.base import BaseModel
11 | from templates.few_shot_template import FewShotTemplate
12 |
13 |
14 | class ZeroShotAccuracyMetric(Metric):
15 |
16 | def __init__(
17 | self,
18 | model: BaseModel,
19 | dataset: Dataset,
20 | template: FewShotTemplate,
21 | decoder: Decoder,
22 | num_test_instances: int,
23 | ):
24 | """
25 | Metric for evaluating zero-shot accuracy.
26 |
27 | model: model to evaluate.
28 | dataset: dataset to evaluate on.
29 | template: template to use for generating prompts.
30 | decoder: decoder to use for decoding.
31 | num_test_instances: number of test instances to evaluate on.
32 | """
33 |
34 | super().__init__(model, dataset, template, decoder)
35 | self.num_test_instances = num_test_instances
36 |
37 | def create_inputs(self) -> List[Dict[str, Any]]:
38 | # create inputs for calculating zero-shot accuracy
39 |
40 | test_instances = self.dataset.sample_instances("test", self.num_test_instances)
41 | return test_instances
42 |
43 | def evaluate(
44 | self,
45 | inputs: List[Dict[str, Any]]
46 | ) -> Dict[str, Any]:
47 |
48 | # unpack inputs
49 | test_instances = inputs
50 |
51 | # remove labels from test instances
52 | test_instances_no_label = datasets.Dataset.from_list(test_instances).remove_columns([self.dataset.label_key])
53 | test_instance_labels = [test_instance[self.dataset.label_key] for test_instance in test_instances]
54 |
55 | # get predictions
56 | predicted_outputs = [
57 | output["prediction"]
58 | for output in self.decoder.decode(
59 | self.model,
60 | [],
61 | test_instances_no_label,
62 | )
63 | ]
64 |
65 |         # This metric uses quasi-exact match for correctness (see Metric.__init__)
66 | correctness_indicators = [
67 | self.eq_metric(predicted_output, gt_output)
68 | for gt_output, predicted_output in zip(
69 | test_instance_labels, predicted_outputs
70 | )
71 | ]
72 |
73 | # compute accuracy
74 | accuracy = statistics.mean(correctness_indicators)
75 |
76 |         # return zero-shot accuracy
77 | return {
78 | "zero_shot_accuracy": accuracy
79 | }
80 |
--------------------------------------------------------------------------------
/models/__init__.py:
--------------------------------------------------------------------------------
1 | from models.base import BaseModel
2 | from models.bloom1b1 import Bloom1B1
3 | from models.bloom1b7 import Bloom1B7
4 | from models.bloom3b import Bloom3B
5 | from models.bloom7b1 import Bloom7B1
6 | from models.causal_lm import CausalLM
7 | from models.gptneo1b3 import GPTNeo1B3
8 | from models.gptneo2b7 import GPTNeo2B7
9 | from models.gptneox20b import GPTNeoX20B
10 | from models.llama7b import LLaMA7B
11 | from models.llama13b import LLaMA13B
12 | from models.masked_lm import MaskedLM
13 | from models.opt1b3 import OPT1B3
14 | from models.opt2b7 import OPT2B7
15 | from models.opt6b7 import OPT6B7
16 | from models.opt13b import OPT13B
17 | from models.stablelmbase3b import StableLMBase3B
18 | from models.stablelmbase7b import StableLMBase7B
19 | from models.stablelmtuned3b import StableLMTuned3B
20 | from models.stablelmtuned7b import StableLMTuned7B
21 |
--------------------------------------------------------------------------------
/models/base.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 |
4 | class BaseModel:
5 |
6 | def __init__(self, name: str, model, tokenizer, device: str):
7 | self.name = name
8 | self.hf_model = model
9 | self.tokenizer = tokenizer
10 | self.device = device
11 |
12 | def forward(self, *args, **kwargs):
13 | raise NotImplementedError
14 |
--------------------------------------------------------------------------------
/models/bloom1b1.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | from models.causal_lm import CausalLM
4 |
5 |
6 | class Bloom1B1(CausalLM):
7 |
8 | def __init__(self):
9 | super().__init__("bigscience/bloom-1b1")
10 |
--------------------------------------------------------------------------------
/models/bloom1b7.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | from models.causal_lm import CausalLM
4 |
5 |
6 | class Bloom1B7(CausalLM):
7 |
8 | def __init__(self):
9 | super().__init__("bigscience/bloom-1b7")
10 |
--------------------------------------------------------------------------------
/models/bloom3b.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | from models.causal_lm import CausalLM
4 |
5 |
6 | class Bloom3B(CausalLM):
7 |
8 | def __init__(self):
9 | super().__init__("bigscience/bloom-3b")
10 |
--------------------------------------------------------------------------------
/models/bloom7b1.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | from models.causal_lm import CausalLM
4 |
5 |
6 | class Bloom7B1(CausalLM):
7 |
8 | def __init__(self):
9 | super().__init__("bigscience/bloom-7b1")
10 |
--------------------------------------------------------------------------------
/models/causal_lm.py:
--------------------------------------------------------------------------------
1 | import collections
2 | from typing import Optional, Tuple
3 |
4 | import torch
5 | import transformers
6 | from tqdm import tqdm
7 |
8 | from models.base import BaseModel
9 |
10 |
11 | class CausalLM(BaseModel):
12 |
13 | def __init__(self, name: str):
14 | if torch.cuda.is_available():
15 | model = transformers.AutoModelForCausalLM.from_pretrained(
16 | name, device_map="auto", torch_dtype=torch.float16)
17 | device = f"cuda:{list(model.hf_device_map.values())[0]}"
18 | else:
19 | model = transformers.AutoModelForCausalLM.from_pretrained(name)
20 | device = "cpu"
21 |
22 | tokenizer = transformers.AutoTokenizer.from_pretrained(name)
23 | tokenizer.pad_token = tokenizer.eos_token
24 |
25 | super().__init__(name, model, tokenizer, device)
26 |
27 | def forward(self, *args, **kwargs):
28 | return self.hf_model.forward(*args, **kwargs)
29 |
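A brief usage sketch for the causal LM wrappers; Bloom1B1 is just the smallest concrete subclass, and instantiating it downloads the bigscience/bloom-1b1 weights on first use.

    import torch

    from models import Bloom1B1

    model = Bloom1B1()
    inputs = model.tokenizer("The capital of France is", return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.forward(**inputs)
    print(outputs.logits.shape)  # (batch, sequence_length, vocab_size)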
--------------------------------------------------------------------------------
/models/gptneo1b3.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | from models.causal_lm import CausalLM
4 |
5 |
6 | class GPTNeo1B3(CausalLM):
7 |
8 | def __init__(self):
9 | super().__init__("EleutherAI/gpt-neo-1.3B")
10 |
--------------------------------------------------------------------------------
/models/gptneo2b7.py:
--------------------------------------------------------------------------------
1 | from models.causal_lm import CausalLM
2 |
3 |
4 | class GPTNeo2B7(CausalLM):
5 |
6 | def __init__(self):
7 | super().__init__("EleutherAI/gpt-neo-2.7B")
8 |
--------------------------------------------------------------------------------
/models/gptneox20b.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | from models.causal_lm import CausalLM
4 |
5 |
6 | class GPTNeoX20B(CausalLM):
7 |
8 | def __init__(self):
9 | super().__init__("EleutherAI/gpt-neox-20b")
10 |
--------------------------------------------------------------------------------
/models/llama13b.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | from models.causal_lm import CausalLM
4 |
5 |
6 | class LLaMA13B(CausalLM):
7 |
8 | def __init__(self):
9 | super().__init__(name="/path/to/llama13b/")
10 |
--------------------------------------------------------------------------------
/models/llama7b.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | from models.causal_lm import CausalLM
4 |
5 |
6 | class LLaMA7B(CausalLM):
7 |
8 | def __init__(self):
9 | super().__init__(name="/path/to/llama7b/")
10 |
--------------------------------------------------------------------------------
/models/masked_lm.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import transformers
3 |
4 | from models.base import BaseModel
5 |
6 |
7 | class MaskedLM(BaseModel):
8 |
9 | def __init__(self, name: str):
10 |         if torch.cuda.is_available():
11 |             model = transformers.AutoModelForMaskedLM.from_pretrained(
12 |                 name, device_map="auto")
13 |             device = f"cuda:{list(model.hf_device_map.values())[0]}"
14 |         else:
15 |             model = transformers.AutoModelForMaskedLM.from_pretrained(name)
16 |             device = "cpu"
17 |
18 |         tokenizer = transformers.AutoTokenizer.from_pretrained(name)
19 |         tokenizer.pad_token = tokenizer.eos_token
20 |         super().__init__(name, model, tokenizer, device)
21 |
--------------------------------------------------------------------------------
/models/opt13b.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | from models.causal_lm import CausalLM
4 |
5 |
6 | class OPT13B(CausalLM):
7 |
8 | def __init__(self):
9 | super().__init__("facebook/opt-13b")
10 |
--------------------------------------------------------------------------------
/models/opt1b3.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | from models.causal_lm import CausalLM
4 |
5 |
6 | class OPT1B3(CausalLM):
7 |
8 | def __init__(self):
9 | super().__init__("facebook/opt-1.3b")
10 |
--------------------------------------------------------------------------------
/models/opt2b7.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | from models.causal_lm import CausalLM
4 |
5 |
6 | class OPT2B7(CausalLM):
7 |
8 | def __init__(self):
9 | super().__init__("facebook/opt-2.7b")
10 |
--------------------------------------------------------------------------------
/models/opt6b7.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | from models.causal_lm import CausalLM
4 |
5 |
6 | class OPT6B7(CausalLM):
7 |
8 | def __init__(self):
9 | super().__init__("facebook/opt-6.7b")
10 |
--------------------------------------------------------------------------------
/models/stablelmbase3b.py:
--------------------------------------------------------------------------------
1 | from models.causal_lm import CausalLM
2 |
3 |
4 | class StableLMBase3B(CausalLM):
5 |
6 | def __init__(self):
7 | super().__init__("StabilityAI/stablelm-base-alpha-3b")
8 |
--------------------------------------------------------------------------------
/models/stablelmbase7b.py:
--------------------------------------------------------------------------------
1 | from models.causal_lm import CausalLM
2 |
3 |
4 | class StableLMBase7B(CausalLM):
5 |
6 | def __init__(self):
7 | super().__init__("StabilityAI/stablelm-base-alpha-7b")
8 |
--------------------------------------------------------------------------------
/models/stablelmtuned3b.py:
--------------------------------------------------------------------------------
1 | from models.causal_lm import CausalLM
2 |
3 |
4 | class StableLMTuned3B(CausalLM):
5 |
6 | def __init__(self):
7 | super().__init__("StabilityAI/stablelm-tuned-alpha-3b")
8 |
--------------------------------------------------------------------------------
/models/stablelmtuned7b.py:
--------------------------------------------------------------------------------
1 | from models.causal_lm import CausalLM
2 |
3 |
4 | class StableLMTuned7B(CausalLM):
5 |
6 | def __init__(self):
7 | super().__init__("StabilityAI/stablelm-tuned-alpha-7b")
8 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | 2captcha-python==1.2.1
2 | absl-py==1.4.0
3 | accelerate==0.18.0
4 | aiodns==3.0.0
5 | aiohttp==3.8.4
6 | aiohttp-retry==2.8.3
7 | aiosignal==1.3.1
8 | aleph-alpha-client==2.14.0
9 | async-timeout==4.0.2
10 | attrs==23.1.0
11 | beautifulsoup4==4.12.2
12 | bert-score==0.3.13
13 | blanc==0.3.1
14 | blis==0.7.9
15 | boto3==1.28.2
16 | botocore==1.31.2
17 | bottle==0.12.25
18 | cachetools==5.3.1
19 | catalogue==2.0.8
20 | cattrs==22.2.0
21 | certifi==2023.5.7
22 | cffi==1.15.1
23 | charset-normalizer==3.2.0
24 | click==8.1.4
25 | colorama==0.4.6
26 | colorcet==3.0.1
27 | contourpy==1.1.0
28 | crfm-helm==0.2.2
29 | cycler==0.11.0
30 | cymem==2.0.7
31 | Cython==0.29.36
32 | dacite==1.6.0
33 | datasets==2.13.1
34 | dill==0.3.5.1
35 | emoji==2.6.0
36 | exceptiongroup==1.1.2
37 | filelock==3.12.2
38 | fonttools==4.40.0
39 | frozenlist==1.3.3
40 | fsspec==2023.6.0
41 | gdown==4.4.0
42 | gin-config==0.5.0
43 | google-api-core==2.11.1
44 | google-api-python-client==2.64.0
45 | google-auth==2.22.0
46 | google-auth-httplib2==0.1.0
47 | googleapis-common-protos==1.59.1
48 | gunicorn==20.1.0
49 | h11==0.14.0
50 | httplib2==0.22.0
51 | huggingface-hub==0.16.4
52 | icetk==0.0.4
53 | idna==3.4
54 | importlib-resources==5.10.4
55 | Jinja2==3.1.2
56 | jmespath==1.0.1
57 | joblib==1.3.1
58 | jsonlines==3.1.0
59 | kiwisolver==1.4.4
60 | langcodes==3.3.0
61 | llvmlite==0.39.1
62 | lxml==4.9.3
63 | Mako==1.2.4
64 | MarkupSafe==2.1.3
65 | matplotlib==3.6.3
66 | moverscore==1.0.3
67 | mpmath==1.3.0
68 | multidict==6.0.4
69 | multiprocess==0.70.13
70 | murmurhash==1.0.9
71 | networkx==3.1
72 | nltk==3.8.1
73 | numba==0.56.4
74 | numpy==1.23.5
75 | openai==0.27.8
76 | outcome==1.2.0
77 | packaging==23.1
78 | pandas==2.0.3
79 | param==1.13.0
80 | parameterized==0.9.0
81 | pathy==0.10.2
82 | Pillow==10.0.0
83 | portalocker==2.7.0
84 | preshed==3.0.8
85 | protobuf==3.20.3
86 | psutil==5.9.5
87 | pyarrow==12.0.1
88 | pyasn1==0.5.0
89 | pyasn1-modules==0.3.0
90 | pycares==4.3.0
91 | pycparser==2.21
92 | pyct==0.5.0
93 | pydantic==1.8.2
94 | pyemd==0.5.1
95 | pyext==0.7
96 | pyhocon==0.3.60
97 | pymongo==4.2.0
98 | pyparsing==3.1.0
99 | PySocks==1.7.1
100 | python-dateutil==2.8.2
101 | pytorch-pretrained-bert==0.6.2
102 | pytrec-eval==0.5
103 | pytz==2023.3
104 | PyYAML==6.0
105 | regex==2023.6.3
106 | requests==2.31.0
107 | responses==0.18.0
108 | retrying==1.3.4
109 | revChatGPT==0.1.1
110 | rouge-score==0.1.2
111 | rsa==4.9
112 | s3transfer==0.6.1
113 | sacrebleu==2.2.1
114 | sacremoses==0.0.53
115 | safetensors==0.3.1
116 | scikit-learn==1.1.3
117 | scipy==1.9.3
118 | seaborn==0.11.2
119 | selenium==4.10.0
120 | sentencepiece==0.1.99
121 | six==1.16.0
122 | smart-open==6.3.0
123 | sniffio==1.3.0
124 | sortedcontainers==2.4.0
125 | soupsieve==2.4.1
126 | spacy==3.2.6
127 | spacy-legacy==3.0.12
128 | spacy-loggers==1.0.4
129 | sqlitedict==1.7.0
130 | srsly==2.4.6
131 | stanza==1.5.0
132 | summ-eval==0.892
133 | sympy==1.11.1
134 | tabulate==0.9.0
135 | thinc==8.0.17
136 | threadpoolctl==3.1.0
137 | tls-client==0.2.1
138 | tokenizers==0.13.3
139 | tqdm==4.64.1
140 | transformers==4.30.2
141 | trio==0.22.1
142 | trio-websocket==0.10.3
143 | typer==0.4.2
144 | typing==3.7.4.3
145 | typing_extensions==4.5.0
146 | tzdata==2023.3
147 | uncertainty-calibration==0.1.4
148 | undetected-chromedriver==3.5.0
149 | uritemplate==4.1.1
150 | urllib3==1.26.16
151 | wasabi==0.10.1
152 | websocket-client==1.3.3
153 | websockets==11.0.3
154 | wsproto==1.2.0
155 | xlrd==2.0.1
156 | xxhash==3.2.0
157 | yarl==1.9.2
158 | zipp==3.16.0
159 | zstandard==0.18.0
160 |
--------------------------------------------------------------------------------
/templates/__init__.py:
--------------------------------------------------------------------------------
1 | from templates.few_shot_template import FewShotTemplate
2 | from templates.instruction_based_fs_template import \
3 | InstructionBasedFewShotTemplate
4 |
--------------------------------------------------------------------------------
/templates/few_shot_template.py:
--------------------------------------------------------------------------------
1 | from typing import Optional, Dict, Any, List
2 | from jinja2 import Environment
3 |
4 | from data import Dataset, get_dataset
5 |
6 |
7 | class FewShotTemplate:
8 |
9 | def __init__(self,
10 | jinja2_file_path: Optional[str] = None,
11 | jinja2_string: Optional[str] = None):
12 | """
13 | General few shot template class.
14 |
15 | jinja2_file_path: path to a jinja2 template file.
16 | jinja2_string: string containing a jinja2 template.
17 |
18 | Accepts either a path to a jinja2 template file or a string containing the template.
19 |
20 |         - The template must also define `dataset_name`, the name of the
21 |           dataset that is passed to data.get_dataset.
22 | - To reference few shot examples, the template should use `demonstrations`.
23 | - To reference the test example, the template should use `test_example`.
24 | """
25 |
26 | if not (jinja2_file_path or jinja2_string):
27 |             raise ValueError("Neither a jinja2 template file path nor a jinja2 template string was provided.")
28 | elif jinja2_file_path and jinja2_string:
29 |             raise ValueError("Specify only one of jinja2_file_path or jinja2_string, not both.")
30 | elif jinja2_file_path:
31 | jinja2_string = open(jinja2_file_path, "r").read()
32 |
33 | self.template = Environment().from_string(jinja2_string)
34 | module = self._get_dummy_module()
35 |
36 | if "dataset_name" not in dir(module):
37 | raise ValueError("You must ensure your jinja2 template sets a `dataset_name`.")
38 | self.dataset_name = module.dataset_name
39 |
40 | # label_map is only expected for classification datasets
41 | if "label_map" in dir(module):
42 | self.label_map = module.label_map
43 |
44 |
45 | def _get_dummy_module(self) -> Any:
46 | # dummy context to extract the label map and the dataset name
47 | # `endings` is used in MCQ datasets
48 | dummy_context = {
49 | "dataset_name": None,
50 | "label_map": None,
51 | "label": None,
52 | "demonstrations": [],
53 | "test_example": {
54 | "": "",
55 | "endings": [],
56 | "answer": None,
57 | "label": None
58 | }
59 | }
60 | module = self.template.make_module(dummy_context)
61 | return module
62 |
63 | def get_dataset(self) -> Dataset:
64 | # Return the dataset specified in the prompt template
65 | return get_dataset(self.dataset_name)
66 |
67 | def render(self,
68 | demonstrations: List[Dict[str, Any]],
69 | test_example: Optional[Dict[str, Any]] = None
70 | ) -> str:
71 | # Render an open prompt using a list of demonstrations and a test example.
72 | return self.template.render(demonstrations=demonstrations,
73 | test_example=test_example)
74 |
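A small end-to-end example with an inline template. The template text and field names below are illustrative only, but they satisfy the requirements documented above: the template sets dataset_name and references demonstrations and test_example.

    from templates import FewShotTemplate

    template_string = (
        '{% set dataset_name = "imdb" %}'
        "{% for d in demonstrations %}Review: {{ d.text }}\nSentiment: {{ d.label }}\n\n{% endfor %}"
        "Review: {{ test_example.text }}\nSentiment:"
    )
    template = FewShotTemplate(jinja2_string=template_string)
    prompt = template.render(
        demonstrations=[{"text": "Loved it.", "label": "positive"}],
        test_example={"text": "Terrible pacing."},
    )
    print(prompt)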
--------------------------------------------------------------------------------
/templates/instruction_based_fs_template.py:
--------------------------------------------------------------------------------
1 | from typing import Optional
2 |
3 | from templates.few_shot_template import FewShotTemplate
4 |
5 |
6 | class InstructionBasedFewShotTemplate(FewShotTemplate):
7 |
8 | def __init__(self,
9 | instruction: str,
10 | jinja2_file_path: Optional[str] = None,
11 | jinja2_string: Optional[str] = None):
12 | """
13 | Few shot template class supporting instructions.
14 |
15 | instruction: string containing the instruction.
16 | jinja2_file_path: path to a jinja2 template file.
17 | jinja2_string: string containing a jinja2 template.
18 |
19 | Accepts either a path to a jinja2 template file or a string containing the template.
20 |
21 |         - The template must also define `dataset_name`, the name of the
22 |           dataset that is passed to data.get_dataset.
23 | - To reference few shot examples, the template should use `demonstrations`.
24 | - To reference the test example, the template should use `test_example`.
25 | """
26 |
27 | if not (jinja2_file_path or jinja2_string):
28 |             raise ValueError("Neither a jinja2 template file path nor a jinja2 template string was provided.")
29 | elif jinja2_file_path and jinja2_string:
30 |             raise ValueError("Specify only one of jinja2_file_path or jinja2_string, not both.")
31 | elif jinja2_file_path:
32 | jinja2_string = open(jinja2_file_path, "r").read()
33 |
34 | if "{{instruction}}" not in jinja2_string:
35 | raise ValueError("Your prompt template must contain the placeholder {{instruction}}.")
36 |
37 | jinja2_string = jinja2_string.replace("{{instruction}}", instruction)
38 | super().__init__(jinja2_string=jinja2_string)
39 |
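The instruction-based variant only splices the instruction string into the {{instruction}} placeholder before the template is compiled. A minimal illustrative example, with invented template text and field names:

    from templates import InstructionBasedFewShotTemplate

    template = InstructionBasedFewShotTemplate(
        instruction="Classify the sentiment of the review as positive or negative.",
        jinja2_string=(
            '{% set dataset_name = "imdb" %}'
            "{{instruction}}\n\n"
            "Review: {{ test_example.text }}\nSentiment:"
        ),
    )
    print(template.render(demonstrations=[], test_example={"text": "A joyless slog."}))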
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Dict
2 | import argparse
3 | import hashlib
4 | import json
5 | import os
6 | import re
7 | import unicodedata
8 |
9 | from data import *
10 | from decoders import *
11 | from metrics import *
12 | from models import *
13 | from templates import *
14 |
15 |
16 | def slugify(value: Any, allow_unicode=False) -> str:
17 | value = str(value)
18 | if allow_unicode:
19 | value = unicodedata.normalize("NFKC", value)
20 | else:
21 | value = (unicodedata.normalize("NFKD",
22 | value).encode("ascii",
23 | "ignore").decode("ascii"))
24 | value = re.sub(r"[^\w\s-]", "", value.lower())
25 | return re.sub(r"[-\s]+", "-", value).strip("-_")
26 |
27 |
28 | def dict2namespace(config: dict) -> argparse.Namespace:
29 | namespace = argparse.Namespace()
30 | for key, value in config.items():
31 | if isinstance(value, dict):
32 | new_value = dict2namespace(value)
33 | else:
34 | new_value = value
35 | setattr(namespace, key, new_value)
36 | return namespace
37 |
38 | def hash_dict(dictionary: Dict[Any, Any]) -> str:
39 | dict_string = "\n".join([f"{key}: {value}" for key, value in dictionary.items()])
40 | sha = hashlib.sha256()
41 | sha.update(dict_string.encode())
42 | hashed_dict = sha.hexdigest()[:16]
43 | return hashed_dict
44 |
45 | def get_filename_from_metadata(metadata: Dict[str, Any]) -> str:
46 | hashed_metadata = hash_dict(metadata)
47 | return f"{hashed_metadata}.json"
48 |
49 | def write_results(results_dir: str, filename: str, metadata: Dict[str, Any], results: Dict[str, Any]) -> None:
50 | if not os.path.exists(results_dir):
51 | os.makedirs(results_dir)
52 | result_path = os.path.join(results_dir, filename)
53 | print(f"Writing results to {result_path}...")
54 |
55 | log_dict = {
56 | "metadata": metadata,
57 | "results": results
58 | }
59 |
60 | with open(result_path, "w", encoding='utf-8') as f:
61 | json.dump(log_dict, f, ensure_ascii=False, indent=4)
62 |
63 |
64 | def default_decoder_name(task_type: str) -> str:
65 | if task_type == "CLS":
66 | return "constrained_label_generation"
67 | elif task_type == "MCQ":
68 | return "constrained_per_example_label_generation"
69 | elif task_type == "GQA":
70 | return "greedy_generation"
71 | else:
72 | raise KeyError(f"Unrecognized task type {task_type}")
73 |
74 |
75 | def get_model(model_name: str) -> BaseModel:
76 | model_name = slugify(model_name)
77 | model_to_class_map = {
78 | "gptneo1b3": GPTNeo1B3,
79 | "gptneo2b7": GPTNeo2B7,
80 | "gptneox20b": GPTNeoX20B,
81 | "bloom1b1": Bloom1B1,
82 | "bloom1b7": Bloom1B7,
83 | "bloom3b": Bloom3B,
84 | "bloom7b1": Bloom7B1,
85 | "llama7b": LLaMA7B,
86 | "llama13b": LLaMA13B,
87 | "opt1b3": OPT1B3,
88 | "opt2b7": OPT2B7,
89 | "opt6b7": OPT6B7,
90 | "opt13b": OPT13B,
91 | "stablelmbase3b": StableLMBase3B,
92 | "stablelmbase7b": StableLMBase7B,
93 | "stablelmtuned3b": StableLMTuned3B,
94 | "stablelmtuned7b": StableLMTuned7B,
95 | }
96 | if model_name not in model_to_class_map:
97 | raise KeyError(f"Unrecognized model {model_name}")
98 |
99 | return model_to_class_map[model_name]()
100 |
101 |
102 | def get_decoder(decoder_name: str, template: FewShotTemplate, dataset: Dataset) -> Decoder:
103 | decoder_name = slugify(decoder_name)
104 | if decoder_name == "constrained_label_generation":
105 | return ConstrainedLabelGeneration(template)
106 | elif decoder_name == "nucleus_generation":
107 | return NucleusGeneration(template)
108 | elif decoder_name == "greedy_generation":
109 | return GreedyGeneration(template)
110 | elif decoder_name == "constrained_per_example_label_generation":
111 | return ConstrainedPerExampleLabelGeneration(template, dataset)
112 | else:
113 |         raise KeyError(f"Unrecognized decoder {decoder_name}")
114 |
115 |
116 | def get_metric(
117 | metric_name: str,
118 | model: BaseModel,
119 | dataset: Dataset,
120 | template: FewShotTemplate,
121 | decoder: Decoder,
122 | metric_config: dict
123 | ) -> Metric:
124 | metric_name = slugify(metric_name)
125 | metric_to_class_map = {
126 | "zero_shot_accuracy": ZeroShotAccuracyMetric,
127 | "few_shot_accuracy": FewShotAccuracyMetric,
128 | "perturbational_accuracy": PerturbationalAccuracyMetric,
129 | "selectional_sensitivity": SelectionalSensitivityMetric,
130 | "permutational_sensitivity": PermutationalSensitivityMetric
131 | }
132 | if metric_name not in metric_to_class_map:
133 | raise KeyError(f"Unrecognized metric {metric_name}")
134 |
135 | metric_class = metric_to_class_map[metric_name]
136 | return metric_class(
137 | model=model,
138 | dataset=dataset,
139 | template=template,
140 | decoder=decoder,
141 | **metric_config[metric_name],
142 | )
143 |
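How result files get their names, sketched with a hypothetical metadata dict (utils pulls in the data, decoder, metric and model registries, so the import assumes the package environment is set up):

    from utils import get_filename_from_metadata, hash_dict, slugify

    metadata = {"model": "bloom1b1", "dataset": "imdb", "metric": "zero_shot_accuracy"}
    print(hash_dict(metadata))                   # 16-hex-character digest, stable for a given configuration
    print(get_filename_from_metadata(metadata))  # "<digest>.json"
    print(slugify("Zero-Shot Accuracy"))         # "zero-shot-accuracy"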
--------------------------------------------------------------------------------