├── .DS_Store
├── .gitignore
├── LICENSE
├── README.md
├── eval
    ├── Coding
    │   ├── human_eval
    │   │   ├── data
    │   │   │   └── HumanEval.jsonl.gz
    │   │   └── evaluate_human_eval.py
    │   ├── leetcode
    │   │   ├── data.py
    │   │   ├── evaluate_leetcode.py
    │   │   ├── evaluation.py
    │   │   ├── execution.py
    │   │   ├── leetcode-test.json
    │   │   └── test.py
    │   └── mbpp
    │   │   ├── evaluate_mbpp.py
    │   │   └── new_mbpp.json
    ├── Ins-Following
    │   └── if_eval
    │   │   ├── evaluate_if_eval.py
    │   │   ├── evaluation_main.py
    │   │   ├── input_data.jsonl
    │   │   ├── instructions.py
    │   │   ├── instructions_registry.py
    │   │   └── instructions_util.py
    ├── Math
    │   ├── math
    │   │   ├── evaluate_math_cot.py
    │   │   ├── evaluate_math_pot.py
    │   │   └── math_test_cleaned.json
    │   ├── subset
    │   │   ├── data
    │   │   │   ├── SVAMP.json
    │   │   │   ├── asdiv.json
    │   │   │   └── gsmplus_test.json
    │   │   ├── evaluate_subset_cot.py
    │   │   └── evaluate_subset_pot.py
    │   └── theorem_qa
    │   │   ├── evaluate_theorem_qa_cot.py
    │   │   ├── evaluate_theorem_qa_pot.py
    │   │   ├── theorem_qa.json
    │   │   └── util.py
    ├── README.md
    ├── Reasoning
    │   └── bbh
    │   │   ├── evaluate_bbh.py
    │   │   └── test_prompts.json
    ├── mmlu
    │   ├── dev
    │   │   ├── abstract_algebra_dev.csv
    │   │   ├── anatomy_dev.csv
    │   │   ├── astronomy_dev.csv
    │   │   ├── business_ethics_dev.csv
    │   │   ├── clinical_knowledge_dev.csv
    │   │   ├── college_biology_dev.csv
    │   │   ├── college_chemistry_dev.csv
    │   │   ├── college_computer_science_dev.csv
    │   │   ├── college_mathematics_dev.csv
    │   │   ├── college_medicine_dev.csv
    │   │   ├── college_physics_dev.csv
    │   │   ├── computer_security_dev.csv
    │   │   ├── conceptual_physics_dev.csv
    │   │   ├── econometrics_dev.csv
    │   │   ├── electrical_engineering_dev.csv
    │   │   ├── elementary_mathematics_dev.csv
    │   │   ├── formal_logic_dev.csv
    │   │   ├── global_facts_dev.csv
    │   │   ├── high_school_biology_dev.csv
    │   │   ├── high_school_chemistry_dev.csv
    │   │   ├── high_school_computer_science_dev.csv
    │   │   ├── high_school_european_history_dev.csv
    │   │   ├── high_school_geography_dev.csv
    │   │   ├── high_school_government_and_politics_dev.csv
    │   │   ├── high_school_macroeconomics_dev.csv
    │   │   ├── high_school_mathematics_dev.csv
    │   │   ├── high_school_microeconomics_dev.csv
    │   │   ├── high_school_physics_dev.csv
    │   │   ├── high_school_psychology_dev.csv
    │   │   ├── high_school_statistics_dev.csv
    │   │   ├── high_school_us_history_dev.csv
    │   │   ├── high_school_world_history_dev.csv
    │   │   ├── human_aging_dev.csv
    │   │   ├── human_sexuality_dev.csv
    │   │   ├── international_law_dev.csv
    │   │   ├── jurisprudence_dev.csv
    │   │   ├── logical_fallacies_dev.csv
    │   │   ├── machine_learning_dev.csv
    │   │   ├── management_dev.csv
    │   │   ├── marketing_dev.csv
    │   │   ├── medical_genetics_dev.csv
    │   │   ├── miscellaneous_dev.csv
    │   │   ├── moral_disputes_dev.csv
    │   │   ├── moral_scenarios_dev.csv
    │   │   ├── nutrition_dev.csv
    │   │   ├── philosophy_dev.csv
    │   │   ├── prehistory_dev.csv
    │   │   ├── professional_accounting_dev.csv
    │   │   ├── professional_law_dev.csv
    │   │   ├── professional_medicine_dev.csv
    │   │   ├── professional_psychology_dev.csv
    │   │   ├── public_relations_dev.csv
    │   │   ├── security_studies_dev.csv
    │   │   ├── sociology_dev.csv
    │   │   ├── us_foreign_policy_dev.csv
    │   │   ├── virology_dev.csv
    │   │   └── world_religions_dev.csv
    │   ├── evaluate_mmlu.py
    │   ├── mmlu_test.json
    │   └── test
    │   │   ├── abstract_algebra_test.csv
    │   │   ├── anatomy_test.csv
    │   │   ├── astronomy_test.csv
    │   │   ├── business_ethics_test.csv
    │   │   ├── clinical_knowledge_test.csv
    │   │   ├── college_biology_test.csv
    │   │   ├── college_chemistry_test.csv
    │   │   ├── college_computer_science_test.csv
    │   │   ├── college_mathematics_test.csv
    │   │   ├── college_medicine_test.csv
    │   │   ├── college_physics_test.csv
    │   │   ├── computer_security_test.csv
    │   │   ├── conceptual_physics_test.csv
    │   │   ├── econometrics_test.csv
    │   │   ├── electrical_engineering_test.csv
    │   │   ├── elementary_mathematics_test.csv
    │   │   ├── formal_logic_test.csv
    │   │   ├── global_facts_test.csv
    │   │   ├── high_school_biology_test.csv
    │   │   ├── high_school_chemistry_test.csv
    │   │   ├── high_school_computer_science_test.csv
    │   │   ├── high_school_european_history_test.csv
    │   │   ├── high_school_geography_test.csv
    │   │   ├── high_school_government_and_politics_test.csv
    │   │   ├── high_school_macroeconomics_test.csv
    │   │   ├── high_school_mathematics_test.csv
    │   │   ├── high_school_microeconomics_test.csv
    │   │   ├── high_school_physics_test.csv
    │   │   ├── high_school_psychology_test.csv
    │   │   ├── high_school_statistics_test.csv
    │   │   ├── high_school_us_history_test.csv
    │   │   ├── high_school_world_history_test.csv
    │   │   ├── human_aging_test.csv
    │   │   ├── human_sexuality_test.csv
    │   │   ├── international_law_test.csv
    │   │   ├── jurisprudence_test.csv
    │   │   ├── logical_fallacies_test.csv
    │   │   ├── machine_learning_test.csv
    │   │   ├── management_test.csv
    │   │   ├── marketing_test.csv
    │   │   ├── medical_genetics_test.csv
    │   │   ├── miscellaneous_test.csv
    │   │   ├── moral_disputes_test.csv
    │   │   ├── moral_scenarios_test.csv
    │   │   ├── nutrition_test.csv
    │   │   ├── philosophy_test.csv
    │   │   ├── prehistory_test.csv
    │   │   ├── professional_accounting_test.csv
    │   │   ├── professional_law_test.csv
    │   │   ├── professional_medicine_test.csv
    │   │   ├── professional_psychology_test.csv
    │   │   ├── public_relations_test.csv
    │   │   ├── security_studies_test.csv
    │   │   ├── sociology_test.csv
    │   │   ├── us_foreign_policy_test.csv
    │   │   ├── virology_test.csv
    │   │   └── world_religions_test.csv
    ├── requirements.txt
    ├── run.sh
    └── utils
    │   ├── __init__.py
    │   ├── data.py
    │   ├── evaluation.py
    │   ├── execution.py
    │   ├── grader.py
    │   ├── math_equivalence.py
    │   ├── math_normalize.py
    │   ├── python_interpreter.py
    │   └── util.py
├── figures
    ├── .DS_Store
    ├── Eurus-logo.png
    ├── lc_tqa.png
    ├── leetcode_vs_theoremqa-1.png
    ├── leetcode_vs_theoremqa-2.png
    ├── main_exp.png
    ├── rm_exp.png
    ├── rm_loss.png
    ├── stats.png
    ├── tree-1.png
    ├── tree.png
    ├── ui-1.png
    └── ui_example.png
└── paper.pdf


/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenBMB/Eurus/fd72db1861ef009fede0dbe5bcc9893c9012dacf/.DS_Store


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | share/python-wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | MANIFEST
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .nox/
 43 | .coverage
 44 | .coverage.*
 45 | .cache
 46 | nosetests.xml
 47 | coverage.xml
 48 | *.cover
 49 | *.py,cover
 50 | .hypothesis/
 51 | .pytest_cache/
 52 | cover/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | .pybuilder/
 76 | target/
 77 | 
 78 | # Jupyter Notebook
 79 | .ipynb_checkpoints
 80 | 
 81 | # IPython
 82 | profile_default/
 83 | ipython_config.py
 84 | 
 85 | # pyenv
 86 | #   For a library or package, you might want to ignore these files since the code is
 87 | #   intended to run in multiple environments; otherwise, check them in:
 88 | # .python-version
 89 | 
 90 | # pipenv
 91 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 92 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 93 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 94 | #   install all needed dependencies.
 95 | #Pipfile.lock
 96 | 
 97 | # poetry
 98 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
 99 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
100 | #   commonly ignored for libraries.
101 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 | 
104 | # pdm
105 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | #   in version control.
109 | #   https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 | 
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 | 
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 | 
119 | # SageMath parsed files
120 | *.sage.py
121 | 
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 | 
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 | 
135 | # Rope project settings
136 | .ropeproject
137 | 
138 | # mkdocs documentation
139 | /site
140 | 
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 | 
146 | # Pyre type checker
147 | .pyre/
148 | 
149 | # pytype static type analyzer
150 | .pytype/
151 | 
152 | # Cython debug symbols
153 | cython_debug/
154 | 
155 | # PyCharm
156 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
159 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 | 


--------------------------------------------------------------------------------
/eval/Coding/human_eval/data/HumanEval.jsonl.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenBMB/Eurus/fd72db1861ef009fede0dbe5bcc9893c9012dacf/eval/Coding/human_eval/data/HumanEval.jsonl.gz


--------------------------------------------------------------------------------
/eval/Coding/human_eval/evaluate_human_eval.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import re
  3 | import argparse
  4 | import pandas as pd
  5 | from datasets import Dataset
  6 | 
  7 | 
  8 | 
  9 | 
 10 | import sys
 11 | sys.path.append("../..")
 12 | from utils.data import write_jsonl, read_problems, HUMAN_EVAL
 13 | from utils.evaluation import evaluate_functional_correctness
 14 | 
 15 | 
# Command-line options for the HumanEval generation + scoring script.
parser = argparse.ArgumentParser()
# Model path/identifier; passed to both the tokenizer and the vLLM engine.
parser.add_argument("--model", type=str, default="./eurus-7b-kto-hf")
# Directory that receives samples.jsonl.
parser.add_argument("--save_dir",default="./" ,type=str)
# NOTE(review): not read anywhere in the visible script; sampling uses n=1.
parser.add_argument("--num-samples-per-task", type=int, default=1)
# NOTE(review): not read anywhere in the visible script.
parser.add_argument("--model_type", type=str, default='mistral')
# for pass@1
# https://github.com/bigcode-project/bigcode-evaluation-harness/blob/c326b51eef25f96ca9b8d22300612b64f3253992/docs/README.md?plain=1#L44
parser.add_argument("--temperature", type=float, default=0.2)
args = parser.parse_args()

# Loaded with read_problems()' default problem file; converted to a Dataset
# further down in this script.
problems = read_problems()
# https://github.com/bigcode-project/bigcode-evaluation-harness/blob/c326b51eef25f96ca9b8d22300612b64f3253992/bigcode_eval/tasks/humaneval.py#L54C13-L54C87
# NOTE(review): defined but not passed to SamplingParams in this script
# (generation stops on the tokenizer's EOS token only).
STOP_WORDS =["\nassert", "assert"]
 29 | 
 30 | from vllm import LLM, SamplingParams
 31 | import torch
 32 | 
 33 | def generate_sample_batch(question_list):
 34 |     llm = LLM(
 35 |         model=args.model,
 36 |         trust_remote_code=True,
 37 |         tensor_parallel_size=torch.cuda.device_count()
 38 |     )
 39 |     global EOS_TOKEN
 40 |     sampling_params = SamplingParams(max_tokens=1024,
 41 |                                     temperature=args.temperature,
 42 |                                     n=1,
 43 |                                     stop=[EOS_TOKEN],)
 44 |     
 45 |     outputs = llm.generate(question_list, sampling_params, use_tqdm=False)
 46 |     #completions = [match_code(output.outputs[0].text) for output in outputs]
 47 |     completions = [output.outputs[0].text.split('```')[0] for output in outputs]
 48 |     return completions
 49 | 
 50 | def make_signature(example):
 51 |     signature = re.search(
 52 |                 rf"def\s+({example['entry_point']}.*?):\s*\n", example["prompt"]
 53 |             ).group(1)
 54 |     return signature
 55 | 
 56 | from transformers import AutoTokenizer
# Tokenizer for the evaluated model; make_conv uses its chat template.
tokenizer = AutoTokenizer.from_pretrained(args.model)
# `global` at module scope has no effect; EOS_TOKEN is an ordinary module-level name.
global EOS_TOKEN
# Used as the sole stop string in generate_sample_batch.
EOS_TOKEN = tokenizer.eos_token
 60 | def make_conv(example):
 61 |     signature = re.search(
 62 |                 rf"def\s+({example['entry_point']}.*?):\s*\n", example["prompt"]
 63 |             ).group(1)
 64 |     description = "\n".join(
 65 |                 [
 66 |                     line.strip()
 67 |                     for line in re.search(
 68 |                         rf"(?:\"\"\"|''')(.*?)(?:\"\"\"|''')", example["prompt"], re.DOTALL
 69 |                     )
 70 |                     .group(1)
 71 |                     .split("\n")
 72 |                 ]
 73 |             )
 74 |     prompt = (
 75 |                 f"Write Python code to solve the task.\n"
 76 |                 f"Write a Python function `{signature}` to solve the following problem: Present code in ```python```\n"
 77 |                 f"```python\n"
 78 |                 # f"{description}\n"
 79 |                 f"{example['prompt']}"
 80 |                 f"```\n"
 81 |             )
 82 | 
 83 |     msg =  [{"role": "user", "content": prompt}]
 84 |     # if "eurus-70b" not in args.model.lower():
 85 |     #msg.append({"role": "assistant", "content": "```Python\n"})
 86 |     out = tokenizer.apply_chat_template(msg, tokenize=False, add_generation_prompt=True)
 87 |     # else:
 88 |     #      out = tokenizer.apply_chat_template(msg, tokenize=False, add_generation_prompt=False)
 89 |     #out = out.rstrip(tokenizer.eos_token).strip()
 90 |     return out + " ```python\ndef"
 91 |     # return out
 92 | 
 93 | 
 94 | 
 95 | def entry_point(
 96 |     sample_file: str,
 97 |     k: str = "1,10,100",
 98 |     n_workers: int = 4,
 99 |     timeout: float = 3.0,
100 |     problem_file: str = HUMAN_EVAL,
101 | ):
102 |     """
103 |     Evaluates the functional correctness of generated samples, and writes
104 |     results to f"{sample_file}_results.jsonl.gz"
105 |     """
106 |     k = list(map(int, k.split(",")))
107 |     results = evaluate_functional_correctness(sample_file, k=k, n_workers=n_workers, timeout=timeout, problem_file=problem_file)
108 |     results = {k:v*100 for k,v in results.items()}
109 |     print(results)
110 | 
111 | 
# ---- Script body: build prompts, generate, save samples, score ----
samples = []
# `problems` is a dict keyed by task id; transpose so each task becomes a row.
problems = Dataset.from_pandas(pd.DataFrame(problems).T)
# Both map calls name the same cache file and disable cache loading.
# NOTE(review): "../../cache" is relative to the working directory — confirm it exists.
problems = problems.map(lambda x: {"signature": make_signature(x)}, cache_file_name="../../cache/human_eval", load_from_cache_file=False)
problems = problems.map(lambda x: {"instruction": make_conv(x)}, cache_file_name="../../cache/human_eval", load_from_cache_file=False)

completions = generate_sample_batch(problems["instruction"])
problems = problems.add_column("completion", completions)
# The prompt ended with "def", so restore the keyword in front of each completion.
problems = problems.map(lambda x: {"completion": "def "+ x["completion"]})
# problems = problems.map(lambda x: {"completion": x["prompt"] + x["completion"]})
samples = problems.to_pandas().to_dict(orient="records")

os.makedirs(args.save_dir, exist_ok=True)
output_filepath = os.path.join(args.save_dir, "samples.jsonl")
write_jsonl(output_filepath, samples)

# Score the file just written, with entry_point's default k / workers / timeout.
entry_point(output_filepath)
128 | 
129 | 


--------------------------------------------------------------------------------
/eval/Coding/leetcode/data.py:
--------------------------------------------------------------------------------
 1 | from typing import Iterable, Dict
 2 | import gzip
 3 | import json
 4 | import os
 5 | 
 6 | 
 7 | ROOT = os.path.dirname(os.path.abspath(__file__))
 8 | HUMAN_EVAL = os.path.join(ROOT, "..", "data", "HumanEval.jsonl.gz")
 9 | 
10 | 
def read_problems(evalset_file: str = HUMAN_EVAL) -> Dict[str, Dict]:
    """Load every record of `evalset_file` into a dict keyed by its "task_id".

    If a task id repeats, the later record replaces the earlier one.
    """
    by_task_id = {}
    for task in stream_jsonl(evalset_file):
        by_task_id[task["task_id"]] = task
    return by_task_id
13 | 
14 | 
def stream_jsonl(filename: str) -> Iterable[Dict]:
    """
    Parses each jsonl line and yields it as a dictionary

    Files whose name ends in ".gz" are decompressed on the fly. Lines that
    are empty or contain only whitespace are skipped.

    Text is decoded explicitly as UTF-8, matching the encoding write_jsonl
    uses, instead of relying on the platform default encoding.
    """
    # One code path for both cases: only the opener differs.
    opener = gzip.open if filename.endswith(".gz") else open
    with opener(filename, "rt", encoding="utf-8") as fp:
        for line in fp:
            if line.strip():
                yield json.loads(line)
30 | 
31 | 
def write_jsonl(filename: str, data: Iterable[Dict], append: bool = False):
    """
    Serialize each dictionary in `data` as one UTF-8 JSON line in `filename`.

    `~` in `filename` is expanded. The file is opened in binary append mode
    when `append` is true, otherwise it is truncated. Names ending in ".gz"
    are written through a gzip stream layered on that file handle.
    """
    file_mode = 'ab' if append else 'wb'
    filename = os.path.expanduser(filename)
    with open(filename, file_mode) as raw:
        if filename.endswith(".gz"):
            with gzip.GzipFile(fileobj=raw, mode='wb') as sink:
                for record in data:
                    sink.write((json.dumps(record) + "\n").encode('utf-8'))
        else:
            for record in data:
                raw.write((json.dumps(record) + "\n").encode('utf-8'))
50 | 


--------------------------------------------------------------------------------
/eval/Coding/leetcode/evaluate_leetcode.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import re
  3 | import time
  4 | import openai
  5 | import argparse
  6 | import traceback
  7 | import pandas as pd
  8 | from tqdm import tqdm
  9 | from typing import List
 10 | from datasets import Dataset
 11 | import json
 12 | 
 13 | import sys
 14 | sys.path.append("../..")
 15 | from utils.data import write_jsonl
 16 | 
# Command-line options for the LeetCode generation script.
parser = argparse.ArgumentParser()
# Model path/identifier; passed to both the tokenizer and the vLLM engine.
parser.add_argument("--model", type=str, default="./eurus-7b-kto-hf")
# Directory that receives samples.jsonl.
parser.add_argument("--save_dir", type=str,default="./")
# NOTE(review): not read anywhere in the visible script; sampling uses n=1.
parser.add_argument("--num-samples-per-task", type=int, default=1)
# NOTE(review): not read anywhere in the visible script.
parser.add_argument("--model_type", type=str, default='mistral')
# for pass@1
# https://github.com/bigcode-project/bigcode-evaluation-harness/blob/c326b51eef25f96ca9b8d22300612b64f3253992/docs/README.md?plain=1#L44
parser.add_argument("--temperature", type=float, default=0.)
args = parser.parse_args()

# Read as JSON Lines from the current working directory.
problems = pd.read_json("leetcode-test.json", lines=True)
# https://github.com/bigcode-project/bigcode-evaluation-harness/blob/c326b51eef25f96ca9b8d22300612b64f3253992/bigcode_eval/tasks/humaneval.py#L54C13-L54C87
# NOTE(review): defined but not passed to SamplingParams in this script.
STOP_WORDS =["\nassert", "assert"]
 30 | 
 31 | from vllm import LLM, SamplingParams
 32 | import torch
 33 | 
 34 | def match_code(s):
 35 |     pattern = r'```python(.*?)```'
 36 |     sol = re.findall(pattern, s, re.DOTALL)
 37 |     if len(sol) > 0:
 38 |         return sol[0]
 39 |     
 40 |     pattern = r'```(.*?)```'
 41 |     sol = re.findall(pattern, s, re.DOTALL)
 42 |     if len(sol) > 0:
 43 |         return sol[0]
 44 |     
 45 |     return s.split('```')[0]
 46 | 
 47 | def generate_sample_batch(question_list):
 48 |     llm = LLM(
 49 |         model=args.model,
 50 |         trust_remote_code=True,
 51 |         tensor_parallel_size=torch.cuda.device_count(),
 52 |     )
 53 |     global EOS_TOKEN
 54 |     sampling_params = SamplingParams(max_tokens=1024,
 55 |                                     temperature=args.temperature,
 56 |                                     n=1,
 57 |                                     stop=[EOS_TOKEN],)
 58 |     
 59 |     outputs = llm.generate(question_list, sampling_params, use_tqdm=False)
 60 |     completions = ["```python\n" +  match_code(output.outputs[0].text) + "\n```" for output in outputs]
 61 |     return completions
 62 | 
 63 | # from transformers import AutoTokenizer
 64 | # tokenizer = AutoTokenizer.from_pretrained(args.model)
 65 | # def make_conv(example):
 66 | #     prompt = example["prompt_sft"] + "\nYou need first write a step-by-step outline and then write the code."
 67 | #     msg =  [{"role": "user", "content": prompt},]
 68 | #     msg.append({"role": "assistant", "content": "```python\n" if "eurus-70b" not in args.model.lower() else ""})
 69 | 
 70 | #     out = tokenizer.apply_chat_template(msg, tokenize=False, add_generation_prompt=False)
 71 | #     out = out.rstrip(tokenizer.eos_token).strip()
 72 | #     return out
 73 | 
 74 | from transformers import AutoTokenizer
# Tokenizer for the evaluated model; passed to make_conv_hf for chat formatting.
tokenizer = AutoTokenizer.from_pretrained(args.model)
# `global` at module scope has no effect; EOS_TOKEN is an ordinary module-level name.
global EOS_TOKEN
# Used as the sole stop string in generate_sample_batch.
EOS_TOKEN = tokenizer.eos_token
 78 | def make_conv_hf(example, tokenizer):
 79 |     # msg = [
 80 |     #     {"role": "user", "content": example["prompt_sft"] + "\nYou need first write a step-by-step outline and then write the code."}
 81 |     # ]
 82 |     msg = [
 83 |         {"role": "user", "content": example["prompt_sft"]}
 84 |     ]
 85 |     chat = tokenizer.apply_chat_template(msg, tokenize=False, add_generation_prompt=True)
 86 |     return chat + "```python"
 87 | 
 88 | 
# ---- Script body: build prompts, generate, save samples ----
samples = []
# Drop the "start_time" column before the rows are serialized.
del problems["start_time"]
problems["instruction"] = problems.apply(lambda row: make_conv_hf(row, tokenizer), axis=1)

completions = generate_sample_batch(problems["instruction"])
# Each completion is already wrapped in a python fence by generate_sample_batch.
problems["output"] = completions

#problems.to_json(os.path.join(args.save_dir, "completions.jsonl"), orient="records", indent=4)

samples = problems.to_dict(orient="records")

# NOTE(review): unlike the HumanEval script, save_dir is not created here — confirm it exists.
output_filepath = os.path.join(args.save_dir, "samples.jsonl")
write_jsonl(output_filepath, samples)
102 | 


--------------------------------------------------------------------------------
/eval/Coding/leetcode/test.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import os
 3 | import json
 4 | from pathlib import Path
 5 | from collections import defaultdict
 6 | from evaluation import evaluate_functional_correctness
 7 | 
# NOTE(review): `version` is not referenced in the visible part of this file.
version = "20240121-Jul"

# `data` directory next to this file.
# NOTE(review): not referenced in the visible part of this file.
DATA_DIR = Path(__file__).parent / "data"
11 | 
def extract_python_code(generation: str):
    """Extract the Python source from a model generation.

    ``[PYTHON]`` / ``[/PYTHON]`` tags are first rewritten to markdown fences.
    Then:

    * If a "```python" fence followed by a newline is present, the body of
      the first such block is returned. A block with no closing fence (for
      example, output truncated by the token limit) yields everything up to
      the end of the text instead of raising IndexError.
    * Otherwise the text before the first line starting with ``def``,
      ``class``, ``if``, ``#`` or ``print`` is returned.

    Args:
        generation: Raw text produced by the model.

    Returns:
        The extracted code as a string.
    """
    generation = generation.replace("[PYTHON]", '```python').replace("[/PYTHON]", '```')
    if '```python' in generation:
        # `(?:\n```|$)` accepts either a proper closing fence or end of text.
        fenced = re.search(r'```python\n(.*?)(?:\n```|$)', generation, flags=re.DOTALL)
        if fenced is not None:
            return fenced.group(1)
    codelist = re.split("\ndef|\nclass|\nif|\n#|\nprint", generation)
    return codelist[0]
21 |     
def evaluate_main(generation_path: str, result_path: str, temp_dir: str):
    """Normalize generations, score them, and print overall and per-difficulty pass rates.

    Args:
        generation_path: JSONL file of model outputs. Each record needs a
            ``task_id`` (or an ``index`` into the problem list) and one of
            ``generation``, ``output`` or ``response``.
        result_path: Directory; the normalized records are written to
            ``<result_path>/samples.jsonl``, which is then passed to the
            scorer as both input and result path.
        temp_dir: Scratch directory forwarded to the scorer.

    Problems are read from ``leetcode-test.json`` in the current working
    directory. The difficulty breakdown reads a ``passed`` field from each
    record of the result file after scoring.
    """
    problem_path = "leetcode-test.json"
    result_path = os.path.join(result_path, "samples.jsonl")
    print(problem_path)
    # Files are opened with `with` so their handles are released promptly.
    with open(problem_path, 'r') as problem_file:
        problems = [json.loads(line) for line in problem_file]

    id2problems = { x['task_id']: x for x in problems }

    with open(generation_path, 'r') as generation_file:
        results = [json.loads(line) for line in generation_file]
    for result in results:
        if 'task_id' not in result:
            result['task_id'] = problems[result['index']]['task_id']

        if 'generation' not in result:
            try:
                if 'output' not in result:
                    result['output'] = result['response']
                if result['output'].startswith("\n        "):
                    # The output is only an indented method body: prepend the
                    # code extracted from the prompt.
                    func_code = extract_python_code(result['prompt_sft']).strip()
                    result['generation'] = func_code + '\n' + result['output']
                else:
                    result['generation'] = extract_python_code(result['output'])
            except Exception:
                # Best effort: fall back to the raw output. Ctrl-C and
                # interpreter exit are no longer swallowed here.
                result['generation'] = result['output']

    with open(result_path, 'w') as fr:
        for result in results:
            fr.write(json.dumps(result) + "\n")

    score = evaluate_functional_correctness(
        input_file=result_path,
        tmp_dir=temp_dir,
        problem_file=problem_path,
        result_path=result_path
    )

    hardness_results = defaultdict(int)
    with open(result_path, 'r') as scored_file:
        scored_results = [json.loads(line) for line in scored_file]
    for result in scored_results:
        problem = id2problems[result['task_id']]

        hardness = problem['meta']['difficulty']
        hardness_results[hardness] += 1
        hardness_results[hardness + "_correct"] += result['passed']

    print("="*100)
    print("Evaluate {} over.".format(generation_path))
    print("Pass@1: {:.3f}".format(score["pass@1"]))
    for key in ["Easy", "Medium", "Hard"]:
        total = hardness_results[key]
        solved = hardness_results[key+"_correct"]
        # A difficulty with no problems reports 0.000(0/0) instead of
        # raising ZeroDivisionError.
        acc = solved / total if total else 0.0
        print("{}: {:.3f}({}/{})".format(key, acc, solved, total))
74 | 
# Command-line entry point: score the samples.jsonl found under --generation_path.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    # Joined with "samples.jsonl" below, so this is used as a directory.
    parser.add_argument("--generation_path", type=str, required=True)
    # evaluate_main joins this with "samples.jsonl", so it is also used as a directory.
    parser.add_argument("--result_path", type=str)
    parser.add_argument("--temp_dir", type=str, default="output/temp")
    args = parser.parse_args()

    if args.result_path is None:
        # NOTE(review): if generation_path has no ".jsonl" in it, the replace
        # is a no-op and result_path equals generation_path, so the input
        # samples.jsonl is overwritten — confirm this is intended.
        args.result_path = args.generation_path.replace(".jsonl", "_result.jsonl")
    output_filepath = os.path.join(args.generation_path, "samples.jsonl")
    evaluate_main(output_filepath, args.result_path, temp_dir=args.temp_dir)
    pass
88 | 


--------------------------------------------------------------------------------
/eval/Coding/mbpp/evaluate_mbpp.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | from datasets import Dataset
  3 | import pandas as pd
  4 | import torch
  5 | from tqdm import tqdm
  6 | import os
  7 | import torch
  8 | import openai
  9 | import argparse
 10 | from vllm import LLM, SamplingParams
 11 | import time
 12 | import re
 13 | 
# Process-wide environment settings, applied at import time.
os.environ["NCCL_IGNORE_DISABLED_P2P"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "true"

import sys
# NOTE(review): relative to the working directory; nothing in the visible
# part of this file imports from it.
sys.path.append("./scripts/eval")

# Command-line options for the MBPP generation + scoring script.
parser = argparse.ArgumentParser()
# Model path/identifier; passed to both the tokenizer and the vLLM engine.
parser.add_argument("--model", type=str, default="./eurus-7b-kto-hf")
# JSON file with the MBPP problems (read with lines=False in the main block).
parser.add_argument("--input_data", type=str, default="./new_mbpp.json")
parser.add_argument("--save_dir", type=str, default="./")
# NOTE(review): not read anywhere in the visible part of this file.
parser.add_argument("--model_type", type=str, default='mistral')

args = parser.parse_args()

# Same two assignments as at the top of the file (repeated, no additional effect).
os.environ["NCCL_IGNORE_DISABLED_P2P"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "true"

# Stop strings for generation; the main block appends one
# "\n\nprint(<function name>" entry per problem signature.
STOP_WORDS =["\nassert", "assert", "\ndef "]
 32 | 
 33 | def match_code(s):
 34 |     pattern = r'```python(.*?)```'
 35 |     sol = re.findall(pattern, s, re.DOTALL)
 36 |     if len(sol) > 0:
 37 |         return sol[0]
 38 |     
 39 |     pattern = r'```(.*?)```'
 40 |     sol = re.findall(pattern, s, re.DOTALL)
 41 |     if len(sol) > 0:
 42 |         return sol[0]
 43 |     
 44 |     return s.split('```')[0]
 45 | 
 46 | def generate_sample_batch(question_list):
 47 |     llm = LLM(
 48 |         model=args.model,
 49 |         trust_remote_code=True,
 50 |         tensor_parallel_size=torch.cuda.device_count(),
 51 |     )
 52 |     sampling_params = SamplingParams(max_tokens=512,
 53 |                                     temperature=0.0,
 54 |                                     n=1,
 55 |                                     stop=STOP_WORDS)
 56 | 
 57 |     outputs = llm.generate(question_list, sampling_params, use_tqdm=False)
 58 |     completions = []
 59 |     for completion in [output.outputs[0].text.split('```')[0] for output in outputs]:
 60 |         completion = completion.split('```')[0]
 61 |         completions.append(completion)
 62 |     return completions
 63 | 
 64 | def make_signature(code):
 65 |     signature = [line for line in code.split("\n") if line.strip().startswith("def ")][0]
 66 |     signature = signature.lstrip("def ").replace(" ", "").rstrip(":").strip().replace(",", ", ")
 67 |     assert ":" not in signature
 68 |     return signature
 69 | 
 70 | from transformers import AutoTokenizer
# Tokenizer for the evaluated model; make_conv uses its chat template.
tokenizer = AutoTokenizer.from_pretrained(args.model)
 72 | def make_conv(signature, description, test_list):
 73 |     description = description.split(" https://www.")[0]
 74 |     #testcase = "\n>>> ".join(test_list)
 75 |     testcase = test_list[0]
 76 |     prompt = (
 77 |                 f"Write Python code to solve the task.\n"
 78 |                 f"Write a Python function `{signature}` to solve the following problem: Present code in ```python```\n"
 79 |                 #f"```python\n"
 80 |                 f"{description}\n"
 81 |                 f">>> {testcase}\n"
 82 |                 #f"```\n"
 83 |             )
 84 |     msg =  [{"role": "user", "content": prompt}]
 85 |     # if "eurus-70b" not in args.model.lower():
 86 |     #msg.append({"role": "assistant", "content": " ```python\ndef"})
 87 |     #     out = tokenizer.apply_chat_template(msg, tokenize=False, add_generation_prompt=False)
 88 |     # else:
 89 |     out = tokenizer.apply_chat_template(msg, tokenize=False, add_generation_prompt=True)
 90 |     # out = out.lstrip(tokenizer.bos_token).strip()
 91 |     # out = out.rstrip(tokenizer.eos_token).strip()
 92 |     return out+"```python\ndef"
 93 | 
 94 | import contextlib
 95 | import signal
 96 | class TimeoutException(Exception):
 97 |     pass
 98 | @contextlib.contextmanager
 99 | def time_limit(seconds: float):
100 |     def signal_handler(signum, frame):
101 |         raise TimeoutException("Timed out!")
102 |     signal.setitimer(signal.ITIMER_REAL, seconds)
103 |     signal.signal(signal.SIGALRM, signal_handler)
104 |     try:
105 |         yield
106 |     finally:
107 |         signal.setitimer(signal.ITIMER_REAL, 0)
108 | 
def exec_helper(code):
    """Compile and execute `code` in this module's globals within 3 seconds.

    Names defined by `code` are written into the module namespace returned by
    `globals()`, so they remain visible to later calls.

    NOTE(security): this runs arbitrary text through `exec`; the only guard
    visible here is the time limit.
    """
    with time_limit(3):
        program = compile(code, filename="mbpp", mode='exec')
        exec(program, globals())
112 | 
def evaluate(dataset):
    """Execute every completion against its test cases and tally the outcomes.

    Args:
        dataset: Table-like object supporting `len()` and
            `to_dict(orient="records")`; each record provides "completion"
            (generated code) and "test_list" (list of test statements).

    Returns:
        A tuple `(accuracy, exec_error, format_error)` of percentages over the
        whole dataset. All three are 0.0 for an empty dataset.
    """
    total = len(dataset)
    if total == 0:
        # Nothing to score; avoid dividing by zero below.
        return 0.0, 0.0, 0.0

    correct = 0
    format_error = 0
    exec_error = 0

    for example in dataset.to_dict(orient="records"):
        completion = example["completion"]
        # Remove surrounding text: keep only indented lines and lines that
        # start with "def". Unindented lines (including blank ones) are dropped.
        function = "\n".join(
            line for line in completion.split("\n")
            if line != line.lstrip() or line.startswith("def")
        )
        # Test cases may contain JSON-escaped slashes ("\/"); undo the escaping.
        test_cases = "\n".join(example["test_list"]).replace("\\/", "/")
        test_run = "\n".join([function, test_cases])

        # Step 1: the extracted code must at least define without error.
        try:
            exec_helper(function)
        except (Exception, SystemExit):
            # SystemExit is caught too so generated code calling exit() cannot
            # abort the whole evaluation; KeyboardInterrupt still propagates.
            format_error += 1
            continue

        # Step 2: run the tests, first against the definitions left in the
        # module namespace by step 1, then together with a fresh definition.
        try:
            exec_helper(test_cases)
            exec_helper(test_run)
        except (Exception, SystemExit):
            exec_error += 1
            continue
        else:
            correct += 1
    return 100 * correct / total, 100 * exec_error / total, 100 * format_error / total
153 | 
154 | 
if __name__ == "__main__":
    # Load the benchmark and derive one signature per reference solution.
    dataset = pd.read_json(args.input_data, lines=False)
    dataset["signature"] = dataset.apply(lambda record: make_signature(record["code"]), axis=1)

    # Stop generation when the model starts printing a call to the target function.
    for signature in dataset["signature"]:
        function_name = signature.split("(")[0].strip()
        STOP_WORDS.append("\n\nprint(" + function_name)

    # Build prompts, generate, and store the raw completions.
    dataset["prompt"] = dataset.apply(
        lambda record: make_conv(record["signature"], record["prompt"], record["test_list"]),
        axis=1,
    )
    dataset["completion"] = generate_sample_batch(dataset["prompt"].tolist())
    del dataset["source_file"]

    # Prepend "def" when the completion text contains no "def" at all.
    # NOTE(review): the check is a substring test anywhere in the completion — confirm that is intended.
    dataset["completion"] = dataset.apply(
        lambda record: record["completion"] if "def" in record["completion"] else "def" + record["completion"],
        axis=1,
    )
    dataset.to_json(os.path.join(args.save_dir, "mbpp_completion.json"))

    # Score the completions and report to both stdout and result.txt.
    accuracy, exec_error, format_error = evaluate(dataset)
    with open(os.path.join(args.save_dir, "result.txt"), "w") as f:
        print({"accuracy": accuracy, "exec_error": exec_error, "format_error": format_error})
        print({"accuracy": accuracy, "exec_error": exec_error, "format_error": format_error}, file=f)
175 | 


--------------------------------------------------------------------------------
/eval/Ins-Following/if_eval/evaluate_if_eval.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import time
 3 | import json
 4 | import openai
 5 | import argparse
 6 | import traceback
 7 | import pandas as pd
 8 | from tqdm import tqdm
 9 | from typing import List
10 | from datasets import Dataset
11 | 
# Command-line options for the IFEval generation script.
parser = argparse.ArgumentParser()
# Model path/name; passed to both the vLLM engine and the tokenizer loader below.
parser.add_argument("--model", type=str, default="./eurus-7b-kto-hf")
# JSON-lines file of prompts (read with lines=True in the main block).
parser.add_argument("--input_data", type=str, default="./input_data.jsonl")
# Destination file for prompts plus generated responses.
parser.add_argument("--save_path", type=str, default="./input_response_data.jsonl")
# Accepted for CLI compatibility; not referenced anywhere else in this script.
parser.add_argument("--model_type", type=str, default='mistral')

# Parsed at import time, so importing this module consumes sys.argv.
args = parser.parse_args()
19 | 
20 | 
21 | from vllm import LLM, SamplingParams
22 | import torch
def generate_sample_batch(question_list):
    """Generate one sampled response per prompt with vLLM.

    A new engine is built from `args.model` on every call, using one
    tensor-parallel shard per CUDA device reported by torch.

    Args:
        question_list: Prompt strings to complete.

    Returns:
        List of whitespace-stripped response texts, one per generation result.
    """
    engine = LLM(
        model=args.model,
        trust_remote_code=True,
        tensor_parallel_size=torch.cuda.device_count()
    )
    decoding = SamplingParams(
        max_tokens=1024,
        temperature=0.7,
        n=1,
        stop=["Question:"],
    )
    results = engine.generate(question_list, decoding, use_tqdm=False)
    return [result.outputs[0].text.strip() for result in results]
37 | 
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(args.model)
def make_conv(question):
    """Render a single user turn with the tokenizer's chat template.

    Args:
        question: The user prompt text.

    Returns:
        The rendered prompt (generation prompt included) with one leading BOS
        token string removed, if the template emitted one.
    """
    msg = [{"role": "user", "content": question}]
    out = tokenizer.apply_chat_template(msg, tokenize=False, add_generation_prompt=True)
    # Remove the BOS token as an exact prefix. str.lstrip(bos) would treat the
    # token as a set of characters and could eat the start of the prompt
    # itself, and lstrip(None) would strip whitespace when no BOS is defined.
    # NOTE(review): presumably stripped because the engine adds BOS again when tokenizing — confirm.
    bos = tokenizer.bos_token
    if bos and out.startswith(bos):
        out = out[len(bos):]
    return out
45 | 
if __name__ == "__main__":
    # Load prompts, render them through the chat template, generate, and save.
    dataset = Dataset.from_pandas(pd.read_json(args.input_data, lines=True))
    dataset = dataset.map(
        lambda example: {"instruction": make_conv(example["prompt"])},
        cache_file_name="../../cache/if_eval",
        load_from_cache_file=False,
    )
    completions = generate_sample_batch(dataset["instruction"])
    dataset = dataset.add_column("response", completions)

    # `dataset` is rebound to whatever to_json() returns, as in the original script.
    dataset = dataset.to_json(args.save_path)
56 | 


--------------------------------------------------------------------------------
/eval/Ins-Following/if_eval/instructions_registry.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # Copyright 2023 The Google Research Authors.
  3 | #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | #     http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | 
 16 | """Registry of all instructions."""
 17 | import instructions
 18 | 
# Group prefixes for instruction ids. Each id used in the tables below is built
# as "<group prefix><instruction name>", e.g. "keywords:existence".
_KEYWORD = "keywords:"

_LANGUAGE = "language:"

_LENGTH = "length_constraints:"

_CONTENT = "detectable_content:"

_FORMAT = "detectable_format:"

# Only referenced from commented-out entries in this module.
_MULTITURN = "multi-turn:"

_COMBINATION = "combination:"

_STARTEND = "startend:"

_CHANGE_CASES = "change_case:"

_PUNCTUATION = "punctuation:"
 38 | 
# Maps each instruction id ("<group>:<name>") to the checker class from the
# `instructions` module registered for it. Commented-out entries are
# instructions that are currently disabled (see the TODOs).
INSTRUCTION_DICT = {
    _KEYWORD + "existence": instructions.KeywordChecker,
    _KEYWORD + "frequency": instructions.KeywordFrequencyChecker,
    # TODO(jeffreyzhou): make a proper set of sentences to choose from
    # _KEYWORD + "key_sentences": instructions.KeySentenceChecker,
    _KEYWORD + "forbidden_words": instructions.ForbiddenWords,
    _KEYWORD + "letter_frequency": instructions.LetterFrequencyChecker,
    _LANGUAGE + "response_language": instructions.ResponseLanguageChecker,
    _LENGTH + "number_sentences": instructions.NumberOfSentences,
    _LENGTH + "number_paragraphs": instructions.ParagraphChecker,
    _LENGTH + "number_words": instructions.NumberOfWords,
    _LENGTH + "nth_paragraph_first_word": instructions.ParagraphFirstWordCheck,
    _CONTENT + "number_placeholders": instructions.PlaceholderChecker,
    _CONTENT + "postscript": instructions.PostscriptChecker,
    _FORMAT + "number_bullet_lists": instructions.BulletListChecker,
    # TODO(jeffreyzhou): Pre-create paragraph or use prompt to replace
    # _CONTENT + "rephrase_paragraph": instructions.RephraseParagraph,
    _FORMAT + "constrained_response": instructions.ConstrainedResponseChecker,
    _FORMAT + "number_highlighted_sections": (
        instructions.HighlightSectionChecker),
    _FORMAT + "multiple_sections": instructions.SectionChecker,
    # TODO(tianjianlu): Re-enable rephrasing with preprocessing the message.
    # _FORMAT + "rephrase": instructions.RephraseChecker,
    _FORMAT + "json_format": instructions.JsonFormat,
    _FORMAT + "title": instructions.TitleChecker,
    # TODO(tianjianlu): Re-enable with specific prompts.
    # _MULTITURN + "constrained_start": instructions.ConstrainedStartChecker,
    _COMBINATION + "two_responses": instructions.TwoResponsesChecker,
    _COMBINATION + "repeat_prompt": instructions.RepeatPromptThenAnswer,
    _STARTEND + "end_checker": instructions.EndChecker,
    _CHANGE_CASES
    + "capital_word_frequency": instructions.CapitalWordFrequencyChecker,
    _CHANGE_CASES
    + "english_capital": instructions.CapitalLettersEnglishChecker,
    _CHANGE_CASES
    + "english_lowercase": instructions.LowercaseLettersEnglishChecker,
    _PUNCTUATION + "no_comma": instructions.CommaChecker,
    _STARTEND + "quotation": instructions.QuotationChecker,
}
 78 | 
# Maps each instruction id to the set of instruction ids it conflicts with.
# Several entries are derived from the full key set of INSTRUCTION_DICT, either
# as-is or minus a few explicitly excluded ids. The table as written is not
# symmetric; `conflict_make` below can be used to symmetrize it.
INSTRUCTION_CONFLICTS = {
    _KEYWORD + "existence": {_KEYWORD + "existence"},
    _KEYWORD + "frequency": {_KEYWORD + "frequency"},
    # TODO(jeffreyzhou): make a proper set of sentences to choose from
    # _KEYWORD + "key_sentences": instructions.KeySentenceChecker,
    _KEYWORD + "forbidden_words": {_KEYWORD + "forbidden_words"},
    _KEYWORD + "letter_frequency": {_KEYWORD + "letter_frequency"},
    _LANGUAGE
    + "response_language": {
        _LANGUAGE + "response_language",
        _FORMAT + "multiple_sections",
        _KEYWORD + "existence",
        _KEYWORD + "frequency",
        _KEYWORD + "forbidden_words",
        _STARTEND + "end_checker",
        _CHANGE_CASES + "english_capital",
        _CHANGE_CASES + "english_lowercase",
    },
    _LENGTH + "number_sentences": {_LENGTH + "number_sentences"},
    # "nth_paragraph_first_word" is listed twice below; the set literal
    # collapses the duplicate.
    _LENGTH + "number_paragraphs": {
        _LENGTH + "number_paragraphs",
        _LENGTH + "nth_paragraph_first_word",
        _LENGTH + "number_sentences",
        _LENGTH + "nth_paragraph_first_word",
    },
    _LENGTH + "number_words": {_LENGTH + "number_words"},
    _LENGTH + "nth_paragraph_first_word": {
        _LENGTH + "nth_paragraph_first_word",
        _LENGTH + "number_paragraphs",
    },
    _CONTENT + "number_placeholders": {_CONTENT + "number_placeholders"},
    _CONTENT + "postscript": {_CONTENT + "postscript"},
    _FORMAT + "number_bullet_lists": {_FORMAT + "number_bullet_lists"},
    # TODO(jeffreyzhou): Pre-create paragraph or use prompt to replace
    # _CONTENT + "rephrase_paragraph": instructions.RephraseParagraph,
    # Conflicts with every registered instruction.
    _FORMAT + "constrained_response": set(INSTRUCTION_DICT.keys()),
    _FORMAT
    + "number_highlighted_sections": {_FORMAT + "number_highlighted_sections"},
    _FORMAT
    + "multiple_sections": {
        _FORMAT + "multiple_sections",
        _LANGUAGE + "response_language",
        _FORMAT + "number_highlighted_sections",
    },
    # TODO(tianjianlu): Re-enable rephrasing with preprocessing the message.
    # _FORMAT + "rephrase": instructions.RephraseChecker,
    # Conflicts with everything except the ids removed via difference().
    _FORMAT
    + "json_format": set(INSTRUCTION_DICT.keys()).difference(
        {_KEYWORD + "forbidden_words", _KEYWORD + "existence"}
    ),
    _FORMAT + "title": {_FORMAT + "title"},
    # TODO(tianjianlu): Re-enable with specific prompts.
    # _MULTITURN + "constrained_start": instructions.ConstrainedStartChecker,
    _COMBINATION
    + "two_responses": set(INSTRUCTION_DICT.keys()).difference({
        _KEYWORD + "forbidden_words",
        _KEYWORD + "existence",
        _LANGUAGE + "response_language",
        _FORMAT + "title",
        _PUNCTUATION + "no_comma"
    }),
    _COMBINATION + "repeat_prompt": set(INSTRUCTION_DICT.keys()).difference({
        _KEYWORD + "existence",
        _FORMAT + "title",
        _PUNCTUATION + "no_comma"
    }),
    _STARTEND + "end_checker": {_STARTEND + "end_checker"},
    _CHANGE_CASES + "capital_word_frequency": {
        _CHANGE_CASES + "capital_word_frequency",
        _CHANGE_CASES + "english_lowercase",
        _CHANGE_CASES + "english_capital",
    },
    _CHANGE_CASES + "english_capital": {_CHANGE_CASES + "english_capital"},
    _CHANGE_CASES + "english_lowercase": {
        _CHANGE_CASES + "english_lowercase",
        _CHANGE_CASES + "english_capital",
    },
    _PUNCTUATION + "no_comma": {_PUNCTUATION + "no_comma"},
    _STARTEND + "quotation": {_STARTEND + "quotation", _FORMAT + "title"},
}
159 | 
160 | 
def conflict_make(conflicts):
  """Symmetrizes a conflict table in place.

  Args:
    conflicts: Dictionary mapping an instruction id to the set of
      instruction ids it conflicts with. Every id appearing in a set must
      also be a key of the dictionary.

  Returns:
    The same dictionary object, updated so that every instruction conflicts
    with itself and, whenever A lists B, B also lists A.
  """
  for instruction_id, partners in conflicts.items():
    # Mirror each declared conflict onto the partner's own set.
    for partner in partners:
      conflicts[partner].add(instruction_id)
    # Every instruction conflicts with itself.
    partners.add(instruction_id)
  return conflicts


--------------------------------------------------------------------------------
/eval/Math/math/evaluate_math_cot.py:
--------------------------------------------------------------------------------
  1 | # Adapt from https://github.com/hendrycks/math/blob/main/modeling/evaluate_gpt3.py
  2 | 
  3 | import os
  4 | import time
  5 | import traceback
  6 | import openai
  7 | import argparse
  8 | import numpy as np
  9 | import operator
 10 | import json
 11 | import tqdm
 12 | import pandas as pd
 13 | from collections import defaultdict
 14 | from vllm import LLM, SamplingParams
 15 | import torch
 16 | import re
 17 | 
 18 | import math
 19 | 
 20 | 
# Command-line options for the MATH chain-of-thought evaluation.
parser = argparse.ArgumentParser()
# Directory containing math_test_cleaned.json.
parser.add_argument("--data_dir", "-d", type=str, default="./")
# Directory that receives results.txt and results.json.
parser.add_argument("--save_dir", "-s", type=str, default="./")
# Model path/name; passed to both the vLLM engine and the tokenizer loader.
parser.add_argument("--model", type=str, default="./eurus-7b-kto-hf")
# Forwarded to make_conv(), which does not use it.
parser.add_argument("--model_type", type=str, default='mistral')
# Parsed at import time, so importing this module consumes sys.argv.
args = parser.parse_args()
 27 | 
 28 | 
import sys
# Make the directory two levels up importable so `utils` can be resolved
# (relative to the current working directory, not to this file).
sys.path.append("../..")
from utils import evaluate_math

# Environment switches set before any model is loaded.
os.environ["NCCL_IGNORE_DISABLED_P2P"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "true"
 35 | 
 36 | def generate_sample_batch(question_list):
 37 |     llm = LLM(
 38 |         model=args.model,
 39 |         trust_remote_code=True,
 40 |         tensor_parallel_size=torch.cuda.device_count(),
 41 |     )
 42 |     sampling_params = SamplingParams(max_tokens=1024,
 43 |                                     temperature=0,
 44 |                                     stop=["\n###\nProblem: "],)
 45 |     outputs = llm.generate(question_list, sampling_params, use_tqdm=False)
 46 |     completions = [output.outputs[0].text for output in outputs]
 47 |     return completions
 48 | 
 49 | from transformers import AutoTokenizer
 50 | tokenizer = AutoTokenizer.from_pretrained(args.model)
 51 | def make_conv(question, model_type):
 52 |     prompt = "Solve the following math problem step-by-step.\n" + "Simplify your answer as much as possible. Present your final answer as \\boxed{Your Answer}.\n" + question
 53 |     # add question
 54 |     msg =  [{"role": "user", "content": prompt},]
 55 |     out = tokenizer.apply_chat_template(msg, tokenize=False, add_generation_prompt=True)
 56 |     return out
 57 | 
 58 |     
 59 | 
def run(args, max=-1):
    """Generate, grade and report chain-of-thought answers for the MATH test set.

    Reads "math_test_cleaned.json" from `args.data_dir`, generates one
    completion per problem, grades each with `evaluate_math`, and writes a
    per-problem log plus accuracy breakdowns to "results.txt" and
    "results.json" in `args.save_dir`. Summaries are also printed to stdout.

    Args:
        args: Parsed CLI namespace; `data_dir`, `save_dir` and `model_type`
            are read here.
        max: Not used by this function.
    """
    # Per-problem records, kept in dataset order.
    outputs = []
    answers = []
    types = []
    levels = []
    matches = []
    # Despite its name, this stores the per-problem `equiv` value.
    fnames_list = []

    # Correctness flags grouped by (level, subject), by subject, and by level.
    cors = {}
    subject_cors = {}
    level_cors = {}
    correct = 0
    total = 0

    
    all_problems = pd.read_json(os.path.join(args.data_dir, "math_test_cleaned.json")).to_dict(orient="records")
    completions = generate_sample_batch([make_conv(problem_data["problem"], args.model_type) for problem_data in all_problems])


    for problem_data, model_output in zip(all_problems, completions):

        prob_level = problem_data["level"]
        prob_type = problem_data["type"]
        # Levels look like "Level 3"; anything that does not parse becomes None.
        try:
            prob_level = int(prob_level.split("Level ")[1])
        except:
            prob_level = None

        answer = problem_data["expected_answer"]

        levels.append(prob_level)
        types.append(prob_type)
        # `model_output` is rebound to the third value returned by evaluate_math.
        is_matched, equiv, model_output = evaluate_math(model_output, answer)
        matches.append(is_matched)
        outputs.append(model_output)
        answers.append(answer)

        fnames_list.append(equiv)
        if (prob_level, prob_type) in cors:
            cors[(prob_level, prob_type)].append(equiv)
        else:
            cors[(prob_level, prob_type)] = [equiv]
        # Problems whose level/type is None are left out of the per-level and
        # per-subject groupings (they still count towards `correct`).
        if prob_level in level_cors:
            level_cors[prob_level].append(equiv)
        else:
            if prob_level is not None:
                level_cors[prob_level] = [equiv]
        if prob_type in subject_cors:
            subject_cors[prob_type].append(equiv)
        else:
            if prob_type is not None:
                subject_cors[prob_type] = [equiv]
        if equiv:
            correct += 1
    
    output_file = os.path.join(args.save_dir, "results.txt")
    
    # Structured mirror of the text report; dumped to results.json at the end.
    output_dict = {
        "outputs": [],
        "accuracy_by_subject_and_level": defaultdict(list),
        "accuracy_by_level": [],
        "accuracy_by_subject": [],
    }
    print("Match rate: ", np.mean(matches))
    with open(output_file, "w+") as f:
        # Section 1: one line per problem.
        for k, (output, answer, prob_type, prob_level, match, equiv) in enumerate(zip(outputs, answers, types, levels, matches, fnames_list)):
            f.write("{} TYPE: {} | LEVEL: {} | OUTPUT: {} | ANSWER: {} | MATCH: {} | CORRECT: {}\n".format(k, prob_type, prob_level, output, answer, match, equiv))
            output_dict["outputs"].append({
                "type": prob_type,
                "level": prob_level,
                "output": output,
                "answer": answer,
                "match": match,
                "equiv": equiv
            })
        

        f.write("#####################\n")
        # also get accuracies for each
        # Section 2: accuracy per (subject, level) pair.
        for subject in ['Prealgebra', 'Algebra', 'Number Theory', 'Counting & Probability', 'Geometry', 'Intermediate Algebra', 'Precalculus']:
            for level in range(1, 6):
                key = (level, subject)
                if key not in cors.keys():
                    print("Skipping", key)
                    continue
                cors_list = cors[key]
                print("{} Level {} Accuracy = {}/{} = {:.3f}".format(subject, level, np.sum(cors_list), len(cors_list), np.mean(cors_list)))
                f.write("{} Level {} Accuracy = {}/{} = {:.3f}\n".format(subject, level, np.sum(cors_list), len(cors_list), np.mean(cors_list)))
                
                output_dict["accuracy_by_subject_and_level"][subject].append({
                    "level": level,
                    "num_correct": np.sum(cors_list),
                    "num_total": len(cors_list),
                    "accuracy": np.mean(cors_list)
                })

        print("#####################")
        f.write("#####################\n")
        # Section 3: accuracy per level. The loop iterates over level_cors
        # itself, so the membership check below never triggers.
        for level in sorted(level_cors):
            if level not in level_cors.keys():
                print("Skipping", level)
                continue
            cors_list = level_cors[level]
            print("Level {} Accuracy = {}/{} = {:.3f}".format(level, np.sum(cors_list), len(cors_list), np.mean(cors_list)))
            f.write("Level {} Accuracy = {}/{} = {:.3f}\n".format(level, np.sum(cors_list), len(cors_list), np.mean(cors_list)))
            output_dict["accuracy_by_level"].append({
                "level": level,
                "num_correct": np.sum(cors_list),
                "num_total": len(cors_list),
                "accuracy": np.mean(cors_list)
            })

        print("#####################")
        f.write("#####################\n")
        # Section 4: accuracy per subject.
        for subject in ['Prealgebra', 'Algebra', 'Number Theory', 'Counting & Probability', 'Geometry', 'Intermediate Algebra', 'Precalculus']:
            if subject not in subject_cors.keys():
                print("Skipping", subject)
                continue
            cors_list = subject_cors[subject]
            print("{} Accuracy = {}/{} = {:.3f}".format(subject, np.sum(cors_list), len(cors_list), np.mean(cors_list)))
            f.write("{} Accuracy = {}/{} = {:.3f}\n".format(subject, np.sum(cors_list), len(cors_list), np.mean(cors_list)))
            output_dict["accuracy_by_subject"].append({
                "subject": subject,
                "num_correct": np.sum(cors_list),
                "num_total": len(cors_list),
                "accuracy": np.mean(cors_list)
            })
        print("#####################")
        f.write("#####################\n")
        # Section 5: overall accuracy. The text report shows a percentage,
        # while results.json stores the plain fraction.
        total = len(all_problems)
        print("Overall Accuracy = {}/{} = {:.3f}".format(correct, total, correct/total * 100))
        f.write("Overall Accuracy = {}/{} = {:.3f}\n".format(correct, total, correct/total * 100))
        output_dict["overall_accuracy"] = {
            "num_correct": correct,
            "num_total": total,
            "accuracy": correct/total
        }
        # Converts np.int64 values to plain int for JSON; every other type
        # falls through to the default encoder.
        class JSONEncoder(json.JSONEncoder):
            def default(self, obj):
                if isinstance(obj, np.int64):
                    return int(obj)
                return super(JSONEncoder, self).default(obj)
        with open(os.path.join(args.save_dir, "results.json"), "w") as jf:
            json.dump(output_dict, jf, cls=JSONEncoder)
204 | 
if __name__ == "__main__":
    # Script entry point: run the whole evaluation with the CLI arguments
    # parsed at import time.

    
    run(args)
209 | 


--------------------------------------------------------------------------------
/eval/Math/math/evaluate_math_pot.py:
--------------------------------------------------------------------------------
  1 | # Adapt from https://github.com/hendrycks/math/blob/main/modeling/evaluate_gpt3.py
  2 | 
  3 | import os
  4 | import time
  5 | import traceback
  6 | import openai
  7 | import argparse
  8 | import numpy as np
  9 | import operator
 10 | import json
 11 | from tqdm import tqdm
 12 | import pandas as pd
 13 | from collections import defaultdict
 14 | from vllm import LLM, SamplingParams
 15 | import torch
 16 | import re
 17 | import math
 18 | 
# Command-line options for the MATH program-of-thought evaluation.
parser = argparse.ArgumentParser()
# Directory containing math_test_cleaned.json.
parser.add_argument("--data_dir", "-d", type=str, default="./")
# Directory that receives results.txt and results.json.
parser.add_argument("--save_dir", "-s", type=str, default="./")
# Model path/name; passed to both the vLLM engine and the tokenizer loader.
parser.add_argument("--model", type=str, default="./eurus-7b-kto-hf")
# Accepted for CLI compatibility; not referenced anywhere else in this script.
parser.add_argument("--model_type", type=str, default='mistral')
# Parsed at import time, so importing this module consumes sys.argv.
args = parser.parse_args()
 25 | 
 26 | 
import sys
# Make the directory two levels up importable so `utils` can be resolved
# (relative to the current working directory, not to this file).
sys.path.append("../..")
from utils import evaluate_math
from utils.python_interpreter import postprocess_completions

# Environment switches set before any model is loaded.
os.environ["NCCL_IGNORE_DISABLED_P2P"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "true"
 34 | 
 35 | def generate_sample_batch(question_list):
 36 |     llm = LLM(
 37 |         model=args.model,
 38 |         trust_remote_code=True,
 39 |         tensor_parallel_size=torch.cuda.device_count(),
 40 |     )
 41 |     sampling_params = SamplingParams(max_tokens=2048,
 42 |                                     n=1,
 43 |                                     temperature=0,
 44 |                                     stop=["\n###\nProblem: "],)
 45 |     outputs = llm.generate(question_list, sampling_params, use_tqdm=False)
 46 |     completions = [output.outputs[0].text.strip() for output in outputs]
 47 |     completions = postprocess_completions(completions)
 48 |     return completions
 49 | 
 50 | 
 51 | from transformers import AutoTokenizer
 52 | tokenizer = AutoTokenizer.from_pretrained(args.model)
 53 | def make_conv(question):
 54 |     prompt = "Tool available:\n[1] Python interpreter\nWhen you send a message containing Python code to python, it will be executed in a stateful Jupyter notebook environment. Present code in ```python```\n"
 55 |     prompt += "Solve the following math problem step-by-step.\nSimplify your answer as much as possible.\n"
 56 |     # prompt = ""
 57 |     prompt += question
 58 |     # add question
 59 |     # msg =  [{"role": "user", "content": prompt},{"role": "assistant", "content": "Let's think step by step."}]
 60 |     msg =  [{"role": "user", "content": prompt}]
 61 | 
 62 |     out = tokenizer.apply_chat_template(msg, tokenize=False, add_generation_prompt=True)
 63 |     return out
 64 | 
 65 |     
 66 | 
def run(args, max=-1):
    """Generate, grade and report program-of-thought answers for the MATH test set.

    Reads "math_test_cleaned.json" from `args.data_dir`, generates one
    completion per problem, grades each with `evaluate_math`, and writes a
    per-problem log plus accuracy breakdowns to "results.txt" and
    "results.json" in `args.save_dir`. Summaries are also printed to stdout.

    Args:
        args: Parsed CLI namespace; `data_dir` and `save_dir` are read here.
        max: Not used by this function.
    """
    # Per-problem records, kept in dataset order.
    outputs = []
    answers = []
    types = []
    levels = []

    # Despite its name, this stores the per-problem `equiv` value.
    fnames_list = []

    # Correctness flags grouped by (level, subject), by subject, and by level.
    cors = {}
    subject_cors = {}
    level_cors = {}
    correct = 0
    total = 0

    
    all_problems = pd.read_json(os.path.join(args.data_dir, "math_test_cleaned.json")).to_dict(orient="records")
    completions = generate_sample_batch([make_conv(problem_data["problem"]) for problem_data in all_problems])


    for problem_data, model_output in tqdm(zip(all_problems, completions), total=len(all_problems), desc="Matching"):
        prob_level = problem_data["level"]
        prob_type = problem_data["type"]
        # Levels look like "Level 3"; anything that does not parse becomes None.
        try:
            prob_level = int(prob_level.split("Level ")[1])
        except:
            prob_level = None

        answer = problem_data["expected_answer"]
        # `model_output` is rebound to the third value returned by
        # evaluate_math; the first value (`is_matched`) is not used afterwards.
        is_matched, equiv, model_output = evaluate_math(model_output, answer)
        levels.append(prob_level)
        types.append(prob_type)
        outputs.append(model_output)
        answers.append(answer)

        fnames_list.append(equiv)
        if (prob_level, prob_type) in cors:
            cors[(prob_level, prob_type)].append(equiv)
        else:
            cors[(prob_level, prob_type)] = [equiv]
        # Problems whose level/type is None are left out of the per-level and
        # per-subject groupings (they still count towards `correct`).
        if prob_level in level_cors:
            level_cors[prob_level].append(equiv)
        else:
            if prob_level is not None:
                level_cors[prob_level] = [equiv]
        if prob_type in subject_cors:
            subject_cors[prob_type].append(equiv)
        else:
            if prob_type is not None:
                subject_cors[prob_type] = [equiv]
        if equiv:
            correct += 1
    
    output_file = os.path.join(args.save_dir, "results.txt")
    
    # Structured mirror of the text report; dumped to results.json at the end.
    output_dict = {
        "outputs": [],
        "accuracy_by_subject_and_level": defaultdict(list),
        "accuracy_by_level": [],
        "accuracy_by_subject": [],
    }
    with open(output_file, "w+") as f:
        # Section 1: one line per problem.
        for k, (output, answer, prob_type, prob_level, equiv) in enumerate(zip(outputs, answers, types, levels, fnames_list)):
            f.write("{} TYPE: {} | LEVEL: {} | OUTPUT: {} | ANSWER: {} | CORRECT: {}\n".format(k, prob_type, prob_level, output, answer, equiv))
            output_dict["outputs"].append({
                "type": prob_type,
                "level": prob_level,
                "output": output,
                "answer": answer,
                "equiv": equiv
            })

        f.write("#####################\n")
        # also get accuracies for each
        # Section 2: accuracy per (subject, level) pair.
        for subject in ['Prealgebra', 'Algebra', 'Number Theory', 'Counting & Probability', 'Geometry', 'Intermediate Algebra', 'Precalculus']:
            for level in range(1, 6):
                key = (level, subject)
                if key not in cors.keys():
                    print("Skipping", key)
                    continue
                cors_list = cors[key]
                print("{} Level {} Accuracy = {}/{} = {:.3f}".format(subject, level, np.sum(cors_list), len(cors_list), np.mean(cors_list)))
                f.write("{} Level {} Accuracy = {}/{} = {:.3f}\n".format(subject, level, np.sum(cors_list), len(cors_list), np.mean(cors_list)))
                
                output_dict["accuracy_by_subject_and_level"][subject].append({
                    "level": level,
                    "num_correct": np.sum(cors_list),
                    "num_total": len(cors_list),
                    "accuracy": np.mean(cors_list)
                })

        print("#####################")
        f.write("#####################\n")
        # Section 3: accuracy per level. The loop iterates over level_cors
        # itself, so the membership check below never triggers.
        for level in sorted(level_cors):
            if level not in level_cors.keys():
                print("Skipping", level)
                continue
            cors_list = level_cors[level]
            print("Level {} Accuracy = {}/{} = {:.3f}".format(level, np.sum(cors_list), len(cors_list), np.mean(cors_list)))
            f.write("Level {} Accuracy = {}/{} = {:.3f}\n".format(level, np.sum(cors_list), len(cors_list), np.mean(cors_list)))
            output_dict["accuracy_by_level"].append({
                "level": level,
                "num_correct": np.sum(cors_list),
                "num_total": len(cors_list),
                "accuracy": np.mean(cors_list)
            })

        print("#####################")
        f.write("#####################\n")
        # Section 4: accuracy per subject.
        for subject in ['Prealgebra', 'Algebra', 'Number Theory', 'Counting & Probability', 'Geometry', 'Intermediate Algebra', 'Precalculus']:
            if subject not in subject_cors.keys():
                print("Skipping", subject)
                continue
            cors_list = subject_cors[subject]
            print("{} Accuracy = {}/{} = {:.3f}".format(subject, np.sum(cors_list), len(cors_list), np.mean(cors_list)))
            f.write("{} Accuracy = {}/{} = {:.3f}\n".format(subject, np.sum(cors_list), len(cors_list), np.mean(cors_list)))
            output_dict["accuracy_by_subject"].append({
                "subject": subject,
                "num_correct": np.sum(cors_list),
                "num_total": len(cors_list),
                "accuracy": np.mean(cors_list)
            })
        print("#####################")
        f.write("#####################\n")
        # Section 5: overall accuracy. The text report shows a percentage,
        # while results.json stores the plain fraction.
        total = len(all_problems)
        print("Overall Accuracy = {}/{} = {:.3f}".format(correct, total, correct/total * 100))
        f.write("Overall Accuracy = {}/{} = {:.3f}\n".format(correct, total, correct/total * 100))
        output_dict["overall_accuracy"] = {
            "num_correct": correct,
            "num_total": total,
            "accuracy": correct/total
        }
        # Converts np.int64 values to plain int for JSON; every other type
        # falls through to the default encoder.
        class JSONEncoder(json.JSONEncoder):
            def default(self, obj):
                if isinstance(obj, np.int64):
                    return int(obj)
                return super(JSONEncoder, self).default(obj)
        with open(os.path.join(args.save_dir, "results.json"), "w") as jf:
            json.dump(output_dict, jf, cls=JSONEncoder)
205 | 
206 | if __name__ == "__main__":
207 |     # Script entry point. `args` is not defined in this block; presumably it is
208 |     # parsed at module level earlier in the file (TODO confirm).
209 |     run(args)
210 | 


--------------------------------------------------------------------------------
/eval/Math/theorem_qa/evaluate_theorem_qa_cot.py:
--------------------------------------------------------------------------------
  1 | from typing import Dict, Any
  2 | import os
  3 | import json
  4 | from tqdm import tqdm
  5 | from datetime import datetime
  6 | import openai
  7 | from time import sleep
  8 | import argparse
  9 | from util import *
 10 | from datasets import Dataset
 11 | import pandas as pd
 12 | from vllm import LLM, SamplingParams
 13 | import torch
 14 | import time
 15 | 
 16 | 
 17 | 
 18 | parser = argparse.ArgumentParser()
 19 | parser.add_argument("--model", type=str, default="./eurus-7b-kto-hf")
 20 | parser.add_argument("--input_data", type=str, default="./theorem_qa.json")
 21 | parser.add_argument("--save_dir", type=str, default="./")
 22 | parser.add_argument("--model_type", type=str, default='mistral')
 23 | 
 24 | args = parser.parse_args()
 25 | 
 26 | 
 27 | def generate_sample_batch(question_list):
 28 |     llm = LLM(
 29 |         model=args.model,
 30 |         trust_remote_code=True,
 31 |         tensor_parallel_size=torch.cuda.device_count(),
 32 |     )
 33 |     sampling_params = SamplingParams(max_tokens=1024,
 34 |                                     temperature=0.0,
 35 |                                     top_p=1,
 36 |                                     n=1,
 37 |                                     stop=["Question:"],)
 38 |     outputs = llm.generate(question_list, sampling_params, use_tqdm=False)
 39 |     completions = [output.outputs[0].text for output in outputs]
 40 |     return completions
 41 | 
 42 | 
 43 | def create_reader_request(example: Dict[str, Any]) -> str:
 44 |     string = f"Question: {example['Question']}"
 45 |     return string
 46 | 
 47 | 
 48 | from transformers import AutoTokenizer
 49 | tokenizer = AutoTokenizer.from_pretrained(args.model)
 50 | def make_conv(question, model_type):
 51 |     prompt = "Solve the following math problem step-by-step.\n" + "Simplify your answer as much as possible. Present your final answer as \\boxed{Your Answer}.\n" + question
 52 |     # add question
 53 |     msg =  [{"role": "user", "content": prompt},]
 54 |     out = tokenizer.apply_chat_template(msg, tokenize=False, add_generation_prompt=True)
 55 |     return out
 56 | 
 57 | 
 58 | if __name__ == "__main__":
 59 | 
 60 |     
 61 |     test_set = pd.read_json(args.input_data)
 62 |     test_set["prompt"] = test_set.apply(lambda row: make_conv(create_reader_request(row),args.model_type), axis=1)
 63 |     completions = generate_sample_batch(test_set["prompt"].tolist())
 64 |     test_set["completion"] = completions
 65 |     test_set.to_json(os.path.join(args.save_dir, "theorem_qa_completion.json"))
 66 |     
 67 |     answered_set = dict()
 68 |     correct, wrong = 0, 0
 69 | 
 70 |     output_filename = os.path.join(args.save_dir, "theorem_qa_output.json")
 71 |     writer = open(output_filename, 'w')
 72 |     accuracy = []
 73 | 
 74 |     for example in test_set.to_dict(orient="records"):
 75 |         
 76 |         if answered_set and example['id'] in answered_set:
 77 |             writer.write(answered_set[example['id']] + '\n')
 78 |             continue
 79 | 
 80 |         result = example["completion"]
 81 |         _, prediction = match_answer(result)
 82 |         prediction = postprocess_number(prediction)
 83 | 
 84 |         verifier = TheoremqaTask(id=example["id"], 
 85 |                                 prompt=example["Question"], 
 86 |                                 reference=example["Answer"], 
 87 |                                 answer_type=example["Answer_type"])
 88 |         acc = verifier.success(prediction)
 89 |         tmp = {
 90 |             'id': example['id'],
 91 |             'question': example['Question'],
 92 |             'prediction': prediction,
 93 |             'answer': example['Answer'],
 94 |             'rationale': result,
 95 |             'answer_type': example['Answer_type'],
 96 |             "is_correct": acc,
 97 |             }
 98 |         writer.write(json.dumps(tmp) + '\n')
 99 |         
100 |         
101 |         accuracy.append(acc)
102 | 
103 |     writer.close()
104 |     print()
105 | 
106 |     accuracy = sum([1 if acc else 0 for acc in accuracy]) / len(accuracy)
107 |     with open(os.path.join(args.save_dir, "result.txt"), "w") as f:
108 |         print({"accuracy": accuracy * 100})
109 |         print({"accuracy": accuracy * 100}, file=f)
110 | 
111 | 


--------------------------------------------------------------------------------
/eval/Math/theorem_qa/evaluate_theorem_qa_pot.py:
--------------------------------------------------------------------------------
  1 | from typing import Dict, Any
  2 | import os
  3 | import json
  4 | from tqdm import tqdm
  5 | from datetime import datetime
  6 | import openai
  7 | from time import sleep
  8 | import argparse
  9 | from util import *
 10 | from datasets import Dataset
 11 | import pandas as pd
 12 | from vllm import LLM, SamplingParams
 13 | import torch
 14 | import time
 15 | 
 16 | parser = argparse.ArgumentParser()
 17 | parser.add_argument("--model", type=str, default="./eurus-7b-kto-hf")
 18 | parser.add_argument("--input_data", type=str, default="./theorem_qa.json")
 19 | parser.add_argument("--save_dir", type=str, default="./")
 20 | parser.add_argument("--model_type", type=str, default='mistral')
 21 | args = parser.parse_args()
 22 | 
 23 | 
 24 | import sys
 25 | sys.path.append("../..")
 26 | from utils.python_interpreter import postprocess_completions
 27 | 
 28 | os.environ["TOKENIZERS_PARALLELISM"] = "true"
 29 | 
 30 | 
 31 | def generate_sample_batch(question_list):
 32 |     llm = LLM(
 33 |         model=args.model,
 34 |         trust_remote_code=True,
 35 |         tensor_parallel_size=torch.cuda.device_count(),
 36 |     )
 37 |     sampling_params = SamplingParams(max_tokens=1024,
 38 |                                     temperature=0.0,
 39 |                                     top_p=1,
 40 |                                     n=1,
 41 |                                     stop=["Question:"],)
 42 |     outputs = llm.generate(question_list, sampling_params, use_tqdm=False)
 43 |     outputs = [output.outputs[0].text.strip() for output in outputs]
 44 |     completions = postprocess_completions(outputs)
 45 |     return outputs, completions
 46 | 
 47 | 
 48 | def create_reader_request(example: Dict[str, Any]) -> str:
 49 |     string = f"Question: {example['Question']}"
 50 |     return string
 51 | 
 52 | 
 53 | from transformers import AutoTokenizer
 54 | tokenizer = AutoTokenizer.from_pretrained(args.model)
 55 | def make_conv(question):
 56 |     prompt = "Tool available:\n[1] Python interpreter\nWhen you send a message containing Python code to python, it will be executed in a stateful Jupyter notebook environment.\n"
 57 |     prompt += """Solve the following math problem step-by-step.\nSimplify your answer as much as possible. The answer can only be one of the following forms:
 58 | 1. a numerical value like 0.1, no symbol at all.
 59 | 2. a list of number like [2, 3, 4].
 60 | 3. True/False.
 61 | 4. an option like (a), (b), (c), (d)\n
 62 | """
 63 |     # add question
 64 |     msg =  [{"role": "user", "content": prompt + question},]
 65 |     out = tokenizer.apply_chat_template(msg, tokenize=False, add_generation_prompt=True)
 66 |     return out
 67 | 
 68 | if __name__ == "__main__":
 69 | 
 70 |     test_set = pd.read_json(args.input_data)
 71 |     test_set["prompt"] = test_set.apply(lambda row: make_conv(create_reader_request(row)), axis=1)
 72 |     outputs, completions = generate_sample_batch(test_set["prompt"].tolist())
 73 |     test_set["output"] = outputs
 74 |     test_set["completion"] = completions
 75 |     test_set.to_json(os.path.join(args.save_dir, "theorem_qa_completion.json"))
 76 |     
 77 |     answered_set = dict()
 78 |     correct, wrong = 0, 0
 79 | 
 80 |     output_filename = os.path.join(args.save_dir, "theorem_qa_output.json")
 81 |     writer = open(output_filename, 'w')
 82 |     accuracy = []
 83 | 
 84 |     for example in test_set.to_dict(orient="records"):
 85 |         
 86 |         if answered_set and example['id'] in answered_set:
 87 |             writer.write(answered_set[example['id']] + '\n')
 88 |             continue
 89 | 
 90 |         result = example["completion"]
 91 |         prediction = extract_answer(result)
 92 |         prediction = postprocess_number(prediction)
 93 | 
 94 |         # print(result)
 95 |         # print(prediction, ' $$$$$$$$$ ', example['Answer'])
 96 |         if "exit()" in prediction:
 97 |             acc = False
 98 |         else:
 99 |             verifier = TheoremqaTask(id=example["id"], 
100 |                                     prompt=example["Question"], 
101 |                                     reference=example["Answer"], 
102 |                                     answer_type=example["Answer_type"])
103 |             acc = verifier.success(prediction)
104 |         tmp = {
105 |             'id': example['id'],
106 |             'question': example['Question'],
107 |             'prediction': prediction,
108 |             'answer': example['Answer'],
109 |             'output': example["output"],
110 |             'completion': example['completion'],
111 |             'answer_type': example['Answer_type'],
112 |             "is_correct": acc,
113 |             }
114 |         writer.write(json.dumps(tmp) + '\n')
115 |         
116 |         
117 |         accuracy.append(acc)
118 | 
119 |     writer.close()
120 |     print()
121 | 
122 |     accuracy = sum([1 if acc else 0 for acc in accuracy]) / len(accuracy)
123 |     with open(os.path.join(args.save_dir, "result.txt"), "w") as f:
124 |         print({"accuracy": accuracy * 100})
125 |         print({"accuracy": accuracy * 100}, file=f)
126 | 
127 | 


--------------------------------------------------------------------------------
/eval/README.md:
--------------------------------------------------------------------------------
  1 | # EVAL
  2 | 
  3 | ## Coding
  4 | 
  5 | ### human_eval
  6 | 
  7 | ```bash
  8 | cd Coding/human_eval
  9 | python evaluate_human_eval.py \
 10 |   --model ../../../models/eurus-7b-kto \
 11 |   --save_dir ./ \
 12 |   --num-samples-per-task 1 \
 13 |   --model_type mistral \
 14 |   --temperature 0.2
 15 | ```
 16 | 
 17 | ### leetcode
 18 | 
 19 | ```bash
 20 | cd Coding/leetcode
 21 | python evaluate_leetcode.py \
 22 |   --model ../../../models/eurus-7b-kto \
 23 |   --save_dir ./ \
 24 |   --num-samples-per-task 1 \
 25 |   --model_type mistral \
 26 |   --temperature 0.
 27 | ```
 28 | 
 29 | ### mbpp
 30 | 
 31 | ```bash
 32 | cd Coding/mbpp
 33 | python evaluate_mbpp.py \
 34 |   --model ../../../models/eurus-7b-kto \
 35 |   --input_data new_mbpp.json \
 36 |   --save_dir ./ \
 37 |   --model_type mistral 
 38 | ```
 39 | 
 40 | ## Math
 41 | 
 42 | ### math
 43 | 
 44 | ```bash
 45 | cd Math/math
 46 | python evaluate_math_cot.py \
 47 |   --data_dir ./ \
 48 |   --save_dir ./ \
 49 |   --model_type mistral \
 50 |   --model ../../../models/eurus-7b-kto 
 51 | ```
 52 | 
 53 | 
 54 | 
 55 | ```bash
 56 | cd Math/math
 57 | python evaluate_math_pot.py \
 58 |   --data_dir ./ \
 59 |   --save_dir ./ \
 60 |   --model_type mistral \
 61 |   --model ../../../models/eurus-7b-kto 
 62 | ```
 63 | 
 64 | ### theorem_qa
 65 | 
 66 | ```bash
 67 | cd Math/theorem_qa
 68 | python evaluate_theorem_qa_pot.py \
 69 |   --model ../../../models/eurus-7b-kto \
 70 |   --input_data ./theorem_qa.json \
 71 |   --model_type mistral \
 72 |   --save_dir ./
 73 | ```
 74 | 
 75 | 
 76 | 
 77 | ```bash
 78 | cd Math/theorem_qa
 79 | python evaluate_theorem_qa_cot.py \
 80 |   --model ../../../models/eurus-7b-kto \
 81 |   --input_data ./theorem_qa.json \
 82 |   --model_type mistral \
 83 |   --save_dir ./
 84 | ```
 85 | 
 86 | ### SVAMP&ASDiv&GSM-Plus
 87 | 
 88 | ```bash
 89 | cd Math/subset
 90 | python evaluate_subset_cot.py \
 91 |   --data_dir ./data \
 92 |   --save_dir ./result \
 93 |   --model_type mistral \
 94 |   --model ../../../models/eurus-7b-kto 
 95 | ```
 96 | 
 97 | 
 98 | 
 99 | ```bash
100 | cd Math/subset
101 | python evaluate_subset_pot.py \
102 |   --data_dir ./data \
103 |   --save_dir ./ui_results \
104 |   --model_type mistral \
105 |   --model ../../../models/eurus-7b-kto 
106 | ```
107 | 
108 | 
109 | 
110 | ## Reasoning
111 | 
112 | ### BBH
113 | 
114 | ```bash
115 | cd Reasoning/bbh
116 | python evaluate_bbh.py \
117 |   --model ../../../models/eurus-7b-kto \
118 |   --data_filepath ./test_prompts.json \
119 |   --output_filepath ./res.jsonl \
120 |   --model_type mistral \
121 |   --n_processes 8
122 | ```
123 | 
124 | 
125 | 
126 | ## Ins-Following
127 | 
128 | ### if_eval
129 | 
130 | ```bash
131 | cd Ins-Following/if_eval
132 | python evaluate_if_eval.py \
133 |   --model ../../../models/eurus-7b-kto \
134 |   --input_data ./input_data.jsonl \
135 |   --save_path ./input_response_data.jsonl \
136 |   --model_type mistral 
137 | ```
138 | 
139 | 


--------------------------------------------------------------------------------
/eval/Reasoning/bbh/evaluate_bbh.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import json
  3 | import time
  4 | import traceback
  5 | import openai
  6 | import pandas as pd
  7 | import argparse
  8 | from tqdm import tqdm
  9 | from transformers import AutoTokenizer
 10 | 
 11 | tqdm.pandas()
 12 | 
 13 | parser = argparse.ArgumentParser()
 14 | parser.add_argument("--model", type=str, default="./eurus-7b-kto-hf")
 15 | parser.add_argument('--data_filepath', type=str, default="test_prompts.json")
 16 | parser.add_argument('--output_filepath', type=str, default="./res.jsonl")
 17 | parser.add_argument("--model_type", type=str, default='mistral')
 18 | parser.add_argument('--is_cot', action='store_true')
 19 | parser.add_argument('--n_processes', type=int, default=8)
 20 | args = parser.parse_args()
 21 | 
 22 | 
 23 | assert args.data_filepath.endswith('.json')
 24 | 
 25 | df = pd.read_json(args.data_filepath, lines=True, orient='records')
 26 | print(f"Loaded {len(df)} examples.")
 27 | 
 28 | from vllm import LLM, SamplingParams
 29 | import torch
 30 | def generate_sample_batch(question_list):
 31 |     llm = LLM(
 32 |         model=args.model,
 33 |         trust_remote_code=True,
 34 |         tensor_parallel_size=torch.cuda.device_count()
 35 |     )
 36 |     sampling_params = SamplingParams(max_tokens=2048,
 37 |                                     temperature=0.0,
 38 |                                     n=1,
 39 |                                     stop=["\nQ:"],)
 40 |     
 41 |     outputs = llm.generate(question_list, sampling_params, use_tqdm=False)
 42 |     completions = [output.outputs[0].text.strip() for output in outputs]
 43 |     return completions
 44 | 
 45 | from transformers import AutoTokenizer
 46 | tokenizer = AutoTokenizer.from_pretrained(args.model)
 47 | def make_conv_hf(prompt, tokenizer):
 48 |     
 49 |     general_instruction = "Follow the given examples and answer the question.\n\n"
 50 |     prompt_list = prompt.split("\n\nQ: ") # [task_instruction, (Q\nA), (Q\nA)]
 51 |     task_instruction = prompt_list.pop(0)
 52 | 
 53 |     assert all("A: Let's think step by step." in p for p in prompt_list), (prompt, prompt_list)
 54 | 
 55 |     msg = []
 56 |     prompt_list = [p.split("A: Let's think step by step.") for p in prompt_list]
 57 |     for sample in prompt_list:
 58 |         assert len(sample) == 2, (sample, len(sample))
 59 |         q = "Q: " + sample[0]
 60 |         a = "A: Let's think step by step." + sample[-1]
 61 |         msg.append({"role": "user", "content": q.strip()})
 62 |         msg.append({"role": "assistant", "content": a.strip()})
 63 |     msg[0]["content"] = general_instruction + task_instruction + "\n" + msg[0]["content"]
 64 |     # chat = tokenizer.apply_chat_template(msg, tokenize=False, add_generation_prompt=False)
 65 |     chat = tokenizer.apply_chat_template(msg[:-1], tokenize=False, add_generation_prompt=True)
 66 |     # chat = chat.lstrip(tokenizer.bos_token).strip()
 67 |     if "eurus-70b" not in args.model.lower():
 68 |         chat = chat.rstrip(tokenizer.eos_token).strip()
 69 |     return chat
 70 | 
 71 | 
 72 | df["prompt"] = df.apply(lambda row: make_conv_hf(row["text"], tokenizer), axis=1)
 73 | df["generation"] = generate_sample_batch(df["prompt"])
 74 | 
 75 | 
 76 | if args.is_cot:
 77 |     def check_cot_match(generation, reference) -> bool:
 78 |         generation = generation.lstrip().split("Q:")[0].strip()
 79 |         reference = reference.strip()
 80 |         return reference in generation
 81 |     df["match"] = df.apply(lambda row: check_cot_match(row["generation"], row["reference"]), axis=1)
 82 | else:
 83 |     def check_match(generation, reference) -> bool:
 84 |         generation = generation.lstrip()
 85 |         reference = reference.lstrip()
 86 |         return generation.startswith(reference)
 87 |     df["match"] = df.apply(lambda row: check_match(row["generation"], row["reference"]), axis=1)
 88 | 
 89 | exact_match_by_task = df.groupby("task_name")["match"].mean()
 90 | exact_match = df["match"].mean() * 100
 91 | 
 92 | df.to_json(args.output_filepath + ".outputs.jsonl", lines=True, orient='records')
 93 | 
 94 | with open(args.output_filepath, "w") as f:
 95 |     f.write(json.dumps({
 96 |         "exact_match": exact_match,
 97 |         "exact_match_by_task": exact_match_by_task.to_dict()
 98 |     }))
 99 | 
100 | print("Exact match: ", exact_match)
101 | print("Exact match by task: ", exact_match_by_task)
102 | 


--------------------------------------------------------------------------------
/eval/mmlu/dev/abstract_algebra_dev.csv:
--------------------------------------------------------------------------------
1 | Find all c in Z_3 such that Z_3[x]/(x^2 + c) is a field.,0,1,2,3,B
2 | "Statement 1 | If aH is an element of a factor group, then |aH| divides |a|. Statement 2 | If H and K are subgroups of G then HK is a subgroup of G.","True, True","False, False","True, False","False, True",B
3 | Statement 1 | Every element of a group generates a cyclic subgroup of the group. Statement 2 | The symmetric group S_10 has 10 elements.,"True, True","False, False","True, False","False, True",C
4 | Statement 1| Every function from a finite set onto itself must be one to one. Statement 2 | Every subgroup of an abelian group is abelian.,"True, True","False, False","True, False","False, True",A
5 | Find the characteristic of the ring 2Z.,0,3,12,30,A
6 | 


--------------------------------------------------------------------------------
/eval/mmlu/dev/anatomy_dev.csv:
--------------------------------------------------------------------------------
1 | What is the embryological origin of the hyoid bone?,The first pharyngeal arch,The first and second pharyngeal arches,The second pharyngeal arch,The second and third pharyngeal arches,D
2 | Which of these branches of the trigeminal nerve contain somatic motor processes?,The supraorbital nerve,The infraorbital nerve,The mental nerve,None of the above,D
3 | The pleura,have no sensory innervation.,are separated by a 2 mm space.,extend into the neck.,are composed of respiratory epithelium.,C
4 | In Angle's Class II Div 2 occlusion there is,excess overbite of the upper lateral incisors.,negative overjet of the upper central incisors.,excess overjet of the upper lateral incisors.,excess overjet of the upper central incisors.,C
5 | Which of the following is the body cavity that contains the pituitary gland?,Abdominal,Cranial,Pleural,Spinal,B
6 | 


--------------------------------------------------------------------------------
/eval/mmlu/dev/astronomy_dev.csv:
--------------------------------------------------------------------------------
1 | You are pushing a truck along a road. Would it be easier to accelerate this truck on Mars? Why? (Assume there is no friction),It would be harder since the truck is heavier on Mars.,It would be easier since the truck is lighter on Mars.,It would be harder since the truck is lighter on Mars.,It would be the same no matter where you are.,D
2 | Where do most short-period comets come from and how do we know?,The Kuiper belt; short period comets tend to be in the plane of the solar system just like the Kuiper belt.,The Kuiper belt; short period comets tend to come from random directions indicating a spherical distribution of comets called the Kuiper belt.,The asteroid belt; short period comets have orbital periods similar to asteroids like Vesta and are found in the plane of the solar system just like the asteroid belt.,The Oort cloud; short period comets tend to be in the plane of the solar system just like the Oort cloud.,A
3 | Say the pupil of your eye has a diameter of 5 mm and you have a telescope with an aperture of 50 cm. How much more light can the telescope gather than your eye?,10000 times more,100 times more,1000 times more,10 times more,A
4 | Why isn't there a planet where the asteroid belt is located?,A planet once formed here but it was broken apart by a catastrophic collision.,There was not enough material in this part of the solar nebula to form a planet.,There was too much rocky material to form a terrestrial planet but not enough gaseous material to form a jovian planet.,Resonance with Jupiter prevented material from collecting together to form a planet.,D
5 | Why is Mars red?,"Because the surface is covered with heavily oxidized (""rusted"") minerals.",Because the atmosphere scatters more light at bluer wavelengths transmitting mostly red light.,Because Mars is covered with ancient lava flows which are red in color.,Because flowing water on Mars's surface altered the surface minerals several billion years ago.,A
6 | 


--------------------------------------------------------------------------------
/eval/mmlu/dev/business_ethics_dev.csv:
--------------------------------------------------------------------------------
1 | "Beyond the business case for engaging in CSR there are a number of moral arguments relating to: negative _______, the _______that corporations possess and the ________ of business and society.","Externalities, Power, Independence","Publicity, Insubstantial resources, Mutual dependence","Publicity, Power, Independence","Externalities, Power, Mutual dependence",D
2 | "_______ is the direct attempt to formally or informally manage ethical issues or problems, through specific policies, practices and programmes.",Corporate social responsibility,Business ethics management,Sustainability,Environmental management,B
3 | "To ensure the independence of the non-executive board members, they are a number of steps which can be taken, which include non-executives being drawn from _______ the company, being appointed for a _________ time period as well as being appointed _________.","Outside, Limited, Independently","Inside, Limited, Intermittently","Outside, Unlimited, Intermittently","Inside, Unlimited, Independently",A
4 | "Three contrasting tactics that CSO's can engage in to meet their aims are ________ which typically involves research and communication, ________, which may involve physically attacking a company's operations or ________, often involving some form of _______.","Non-violent direct action, Violent direct action, Indirect action, Boycott","Indirect action, Instrumental action, Non-violent direct action, Information campaign","Indirect action, Violent direct action, Non-violent direct-action Boycott","Non-violent direct action, Instrumental action, Indirect action, Information campaign",C
5 | "In contrast to _______, _______ aim to reward favourable behaviour by companies. The success of such campaigns have been heightened through the use of ___________, which allow campaigns to facilitate the company in achieving _________ .","Buycotts, Boycotts, Blockchain technology, Charitable donations","Buycotts, Boycotts, Digital technology, Increased Sales","Boycotts, Buyalls, Blockchain technology, Charitable donations","Boycotts, Buycotts, Digital technology, Increased Sales",D
6 | 


--------------------------------------------------------------------------------
/eval/mmlu/dev/clinical_knowledge_dev.csv:
--------------------------------------------------------------------------------
1 | The energy for all forms of muscle contraction is provided by:,ATP.,ADP.,phosphocreatine.,oxidative phosphorylation.,A
2 | What is the difference between a male and a female catheter?,Male and female catheters are different colours.,Male catheters are longer than female catheters.,Male catheters are bigger than female catheters.,Female catheters are longer than male catheters.,B
3 | In the assessment of the hand function which of the following is true?,Abduction of the thumb is supplied by spinal root T2,Opposition of the thumb by opponens policis is supplied by spinal root T1,Finger adduction is supplied by the median nerve,Finger abduction is mediated by the palmar interossei,B
4 | "How many attempts should you make to cannulate a patient before passing the job on to a senior colleague, according to the medical knowledge of 2020?",4,3,2,1,C
5 | Glycolysis is the name given to the pathway involving the conversion of:,glycogen to glucose-1-phosphate.,glycogen or glucose to fructose.,glycogen or glucose to pyruvate or lactate.,glycogen or glucose to pyruvate or acetyl CoA.,C


--------------------------------------------------------------------------------
/eval/mmlu/dev/college_biology_dev.csv:
--------------------------------------------------------------------------------
1 | Which of the following represents an accurate statement concerning arthropods?,They possess an exoskeleton composed primarily of peptidoglycan.,They possess an open circulatory system with a dorsal heart.,They are members of a biologically unsuccessful phylum incapable of exploiting diverse habitats and nutrition sources.,"They lack paired, jointed appendages.",B
2 | "In a given population, 1 out of every 400 people has a cancer caused by a completely recessive allele, b. Assuming the population is in Hardy-Weinberg equilibrium, which of the following is the expected proportion of individuals who carry the b allele but are not expected to develop the cancer?",1/400,19/400,20/400,38/400,D
3 | "The presence of homologous structures in two different organisms, such as the humerus in the front limb of a human and a bird, indicates that",the human and bird are polyphyletic species,a human's and bird's evolution is convergent,the human and bird belong to a clade,the human and bird developed by analogy,C
4 | "According to the pressure-flow model of movement of phloem contents, photosynthate movement from source to sink is driven by",an ATP-dependent pressure-flow pump,a water-pressure potential gradient,transpiration,apoplastic diffusion,B
5 | Which of the following contain DNA sequences required for the segregation of chromosomes in mitosis and meiosis?,Telomeres,Centromeres,Nucleosomes,Spliceosomes,B
6 | 


--------------------------------------------------------------------------------
/eval/mmlu/dev/college_chemistry_dev.csv:
--------------------------------------------------------------------------------
1 | Which of the following statements about the lanthanide elements is NOT true?,The most common oxidation state for the lanthanide elements is +3.,Lanthanide complexes often have high coordination numbers (> 6).,All of the lanthanide elements react with aqueous acid to liberate hydrogen.,The atomic radii of the lanthanide elements increase across the period from La to Lu.,D
2 | A 0.217 g sample of HgO (molar mass = 217 g) reacts with excess iodide ions according to the reaction shown above. Titration of the resulting solution requires how many mL of 0.10 M HCl to reach equivalence point?,1.0 mL,10 mL,20 mL,50 mL,C
3 | "Predict the number of lines in the EPR spectrum of a solution of 13C-labelled methyl radical (13CH3•), assuming the lines do not overlap.",4,3,6,24,A
4 | "3 Cl−(aq) + 4 CrO_4^2−(aq) + 23 H+(aq) → 3 HClO2(aq) + 4 Cr3+(aq) + 10 H2O(l). In the reaction shown above, Cl−(aq) behaves as",an acid,a base,a catalyst,a reducing agent,D
5 | "Which of the following lists the hydrides of group-14 elements in order of thermal stability, from lowest to highest?",PbH4 < SnH4 < GeH4 < SiH4 < CH4,PbH4 < SnH4 < CH4 < GeH4 < SiH4,CH4 < SiH4 < GeH4 < SnH4 < PbH4,CH4 < PbH4 < GeH4 < SnH4 < SiH4,A
6 | 


--------------------------------------------------------------------------------
/eval/mmlu/dev/college_computer_science_dev.csv:
--------------------------------------------------------------------------------
 1 | Which of the following regular expressions is equivalent to (describes the same set of strings as) (a* + b)*(c + d)?,a*(c + d)+ b(c + d),a*(c + d)* + b(c + d)*,a*(c + d)+ b*(c + d),(a + b)*c +(a + b)*d,D
 2 | "A certain pipelined RISC machine has 8 general-purpose registers R0, R1, . . . , R7 and supports the following operations.
 3 | ADD Rs1, Rs2, Rd Add Rs1 to Rs2 and put the sum in Rd
 4 | MUL Rs1, Rs2, Rd Multiply Rs1 by Rs2 and put the product in Rd
 5 | An operation normally takes one cycle; however, an operation takes two cycles if it produces a result required by the immediately following operation in an operation sequence. Consider the expression AB + ABC + BC, where variables A, B, C are located in registers R0, R1, R2. If the contents of these three registers must not be modified, what is the minimum number of clock cycles required for an operation sequence that computes the value of AB + ABC + BC?",5,6,7,8,B
 6 | "The Singleton design pattern is used to guarantee that only a single instance of a class may be instantiated. Which of the following is (are) true of this design pattern?
 7 | I. The Singleton class has a static factory method to provide its instance.
 8 | II. The Singleton class can be a subclass of another class.
 9 | III. The Singleton class has a private constructor.",I only,II only,III only,"I, II, and III",D
10 | "A compiler generates code for the following assignment statement.
11 | G := (A + B) * C - (D + E) * F
12 | The target machine has a single accumulator and a single-address instruction set consisting of instructions load, store, add, subtract, and multiply. For the arithmetic operations, the left operand is taken from the accumulator and the result appears in the accumulator. The smallest possible number of instructions in the resulting code is",5,6,7,9,D
13 | "Consider a computer design in which multiple processors, each with a private cache memory, share global memory using a single bus. This bus is the critical system resource. Each processor can execute one instruction every 500 nanoseconds as long as memory references are satisfied by its local cache. When a cache miss occurs, the processor is delayed for an additional 2,000 nanoseconds. During half of this additional delay, the bus is dedicated to serving the cache miss. During the other half, the processor cannot continue, but the bus is free to service requests from other processors. On average, each instruction requires 2 memory references. On average, cache misses occur on 1 percent of references. What proportion of the capacity of the bus would a single processor consume, ignoring delays due to competition from other processors?",1/50,1/27,1/25,2/27,B
14 | 


--------------------------------------------------------------------------------
/eval/mmlu/dev/college_mathematics_dev.csv:
--------------------------------------------------------------------------------
1 | "Let V be the set of all real polynomials p(x). Let transformations T, S be defined on V by T:p(x) -> xp(x) and S:p(x) -> p'(x) = d/dx p(x), and interpret (ST)(p(x)) as S(T(p(x))). Which of the following is true?",ST = 0,ST = T,ST = TS,ST - TS is the identity map of V onto itself.,D
2 | "A tank initially contains a salt solution of 3 grams of salt dissolved in 100 liters of water. A salt solution containing 0.02 grams of salt per liter of water is sprayed into the tank at a rate of 4 liters per minute. The sprayed solution is continually mixed with the salt solution in the tank, and the mixture flows out of the tank at a rate of 4 liters per minute. If the mixing is instantaneous, how many grams of salt are in the tank after 100 minutes have elapsed?",2,2 - e^-2,2 + e^-2,2 + e^-4,D
3 | "Let A be a real 2x2 matrix. Which of the following statements must be true?
4 | I. All of the entries of A^2 are nonnegative.
5 | II. The determinant of A^2 is nonnegative.
6 | III. If A has two distinct eigenvalues, then A^2 has two distinct eigenvalues.",I only,II only,III only,II and III only,B
7 | "Suppose that f(1 + x) = f(x) for all real x. If f is a polynomial and f(5) = 11, then f(15/2)",-11,0,11,33/2,C
8 | "Let A be the set of all ordered pairs of integers (m, n) such that 7m + 12n = 22. What is the greatest negative number in the set B = {m + n : (m, n) \in A}?",-5,-4,-3,-2,B
9 | 


--------------------------------------------------------------------------------
/eval/mmlu/dev/college_medicine_dev.csv:
--------------------------------------------------------------------------------
1 | Glucose is transported into the muscle cell:,via protein transporters called GLUT4.,only in the presence of insulin.,via hexokinase.,via monocarboxylic acid transporters.,A
2 | Which of the following is not a true statement?,Muscle glycogen is broken down enzymatically to glucose-1-phosphate,Elite endurance runners have a high proportion of Type I fibres in their leg muscles,Liver glycogen is important in the maintenance of the blood glucose concentration,Insulin promotes glucose uptake by all tissues in the body,D
3 | "In a genetic test of a newborn, a rare genetic disorder is found that has X-linked recessive transmission. Which of the following statements is likely true regarding the pedigree of this disorder?",All descendants on the maternal side will have the disorder.,Females will be approximately twice as affected as males in this family.,All daughters of an affected male will be affected.,There will be equal distribution of males and females affected.,C
4 | "A high school science teacher fills a 1 liter bottle with pure nitrogen and seals the lid. The pressure is 1.70 atm, and the room temperature is 25°C. Which two variables will both increase the pressure of the system, if all other variables are held constant?","Increasing temperature, increasing moles of gas","Increasing temperature, increasing volume","Decreasing volume, decreasing temperature","Decreasing moles of gas, increasing volume",A
5 | An expected side effect of creatine supplementation is:,muscle weakness.,gain in body mass.,muscle cramps.,loss of electrolytes.,B
6 | 


--------------------------------------------------------------------------------
/eval/mmlu/dev/college_physics_dev.csv:
--------------------------------------------------------------------------------
1 | A refracting telescope consists of two converging lenses separated by 100 cm. The eye-piece lens has a focal length of 20 cm. The angular magnification of the telescope is,4,5,6,20,A
2 | For which of the following thermodynamic processes is the increase in the internal energy of an ideal gas equal to the heat added to the gas?,Constant temperature,Constant volume,Constant pressure,Adiabatic,B
3 | "One end of a Nichrome wire of length 2L and cross-sectional area A is attached to an end of another Nichrome wire of length L and cross-sectional area 2A. If the free end of the longer wire is at an electric potential of 8.0 volts, and the free end of the shorter wire is at an electric potential of 1.0 volt, the potential at the junction of the two wires is most nearly equal to",2.4 V,3.3 V,4.5 V,5.7 V,A
4 | A refracting telescope consists of two converging lenses separated by 100 cm. The eye-piece lens has a focal length of 20 cm. The angular magnification of the telescope is,4,5,6,20,A
5 | "The muon decays with a characteristic lifetime of about 10^-6 second into an electron, a muon neutrino, and an electron antineutrino. The muon is forbidden from decaying into an electron and just a single neutrino by the law of conservation of",charge,mass,energy and momentum,lepton number,D
6 | 


--------------------------------------------------------------------------------
/eval/mmlu/dev/computer_security_dev.csv:
--------------------------------------------------------------------------------
1 | SHA-1 has a message digest of,160 bits,512 bits,628 bits,820 bits,A
2 | "_____________ can modify data on your system – so that your system doesn’t run correctly or you can no longer access specific data, or it may even ask for ransom in order to give you access.",IM – Trojans,Backdoor Trojans,Trojan-Downloader,Ransom Trojan,D
3 | What is ethical hacking?,"""Hacking"" ethics so they justify unintended selfish behavior","Hacking systems (e.g., during penetration testing) to expose vulnerabilities so they can be fixed, rather than exploited",Hacking into systems run by those whose ethics you disagree with,"A slang term for rapid software development, e.g., as part of hackathons",B
4 | Exploitation of the Heartbleed bug permits,overwriting cryptographic keys in memory,a kind of code injection,a read outside bounds of a buffer,a format string attack,C
5 | The ____________ is anything which your search engine cannot search.,Haunted web,World Wide Web,Surface web,Deep Web,D
6 | 


--------------------------------------------------------------------------------
/eval/mmlu/dev/conceptual_physics_dev.csv:
--------------------------------------------------------------------------------
1 | "Compared with the mass of a uranium atom undergoing fission, the combined masses of the products after fission are",less,more,the same,zero,A
2 | Things that are equivalent according to the equivalence principle are,space and time.,a traveling twin and a stay-at-home twin.,gravity and acceleration.,mass and energy.,C
3 | Colors in a soap bubble result from light,converted to a different frequency,deflection,interference,polarization,C
4 | A model airplane flies slower when flying into the wind and faster with wind at its back. When launched at right angles to the wind (a crosswind) its groundspeed compared with flying in still air is,the same,greater,less,either greater or less depending on wind speed,B
5 | Which of these three elements has the most mass per nucleon?,Hydrogen,Iron,Uranium,Same in each,A
6 | 


--------------------------------------------------------------------------------
/eval/mmlu/dev/econometrics_dev.csv:
--------------------------------------------------------------------------------
 1 | "For a stationary autoregressive process, shocks will",Eventually die away,Persist indefinitely,Grow exponentially,Never occur,A
 2 | "Consider the following AR(1) model with the disturbances having zero mean and unit variance
 3 | 
 4 | yt = 0.2 + 0.4 yt-1 + ut
 5 | 
 6 | The (unconditional) mean of y will be given by",0.2,0.4,0.5,0.33,D
 7 | "Suppose that a test statistic has associated with it a p-value of 0.08. Which one of the following statements is true?
 8 | 
 9 | (i) If the size of the test were exactly 8%, we would be indifferent between rejecting and not rejecting the null hypothesis
10 | 
11 | (ii) The null would be rejected if a 10% size of test were used
12 | 
13 | (iii) The null would not be rejected if a 1% size of test were used
14 | 
15 | (iv) The null would be rejected if a 5% size of test were used.",(ii) and (iv) only,(i) and (iii) only,"(i), (ii), and (iii) only","(i), (ii), (iii), and (iv)",C
16 | What would then be the consequences for the OLS estimator if heteroscedasticity is present in a regression model but ignored?,It will be biased,It will be inconsistent,It will be inefficient,"All of (a), (b) and (c) will be true.",C
17 | "Suppose now that a researcher wishes to use information criteria to determine the optimal lag length for a VAR. 500 observations are available for the bi-variate VAR, and the values of the determinant of the variance-covariance matrix of residuals are 0.0336, 0.0169, 0.0084, and 0.0062 for 1, 2, 3, and 4 lags respectively. What is the optimal model order according to Akaike's information criterion?",1 lag,2 lags,3 lags,4 lags,C
18 | 


--------------------------------------------------------------------------------
/eval/mmlu/dev/electrical_engineering_dev.csv:
--------------------------------------------------------------------------------
1 | "In an SR latch built from NOR gates, which condition is not allowed","S=0, R=0","S=0, R=1","S=1, R=0","S=1, R=1",D
2 | "In a 2 pole lap winding dc machine , the resistance of one conductor is 2Ω and total number of conductors is 100. Find the total resistance",200Ω,100Ω,50Ω,10Ω,C
3 | "The coil of a moving coil meter has 100 turns, is 40 mm long and 30 mm wide. The control torque is 240*10^-6 N-m on full scale. If magnetic flux density is 1 Wb/m^2, range of meter is",1 mA.,2 mA.,3 mA.,4 mA.,B
4 | "Two long parallel conductors carry 100 A. If the conductors are separated by 20 mm, the force per meter of length of each conductor will be",100 N.,0.1 N.,1 N.,0.01 N.,B
5 | A point pole has a strength of 4π * 10^-4 weber. The force in newtons on a point pole of 4π * 1.5 * 10^-4 weber placed at a distance of 10 cm from it will be,15 N.,20 N.,7.5 N.,3.75 N.,A
6 | 


--------------------------------------------------------------------------------
/eval/mmlu/dev/elementary_mathematics_dev.csv:
--------------------------------------------------------------------------------
1 | "The population of the city where Michelle was born is 145,826. What is the value of the 5 in the number 145,826?",5 thousands,5 hundreds,5 tens,5 ones,A
2 | "Olivia used the rule ""Add 11"" to create the number pattern shown below. 10, 21, 32, 43, 54 Which statement about the number pattern is true?",The 10th number in the pattern will be an even number.,The number pattern will never have two even numbers next to each other.,The next two numbers in the pattern will be an even number then an odd number.,If the number pattern started with an odd number then the pattern would have only odd numbers in it.,B
3 | A total of 30 players will play basketball at a park. There will be exactly 5 players on each team. Which statement correctly explains how to find the number of teams needed?,Add 5 to 30 to find 35 teams.,Divide 30 by 5 to find 6 teams.,Multiply 30 and 5 to find 150 teams.,Subtract 5 from 30 to find 25 teams.,B
4 | A store sells 107 different colors of paint. They have 25 cans of each color in storage. The number of cans of paint the store has in storage can be found using the expression below. 107 × 25. How many cans of paint does the store have in storage?,749,"2,675","2,945","4,250",B
5 | Which expression is equivalent to 5 x 9?,(5 x 4) x (6 x 5),(5 x 5) + (5 x 4),(5 x 5) + (5 x 9),(5 x 9) x (6 x 9),B
6 | 


--------------------------------------------------------------------------------
/eval/mmlu/dev/formal_logic_dev.csv:
--------------------------------------------------------------------------------
 1 |  Select the best translation into predicate logic: No people drive on Mars.,~Pd,(∀x)(Px ∨ ~Dx),(∀x)(Px ⊃ ~Dx),~Dp,C
 2 | Select the best translation into predicate logic. George borrows Hector's lawnmower. (g: George; h: Hector; l: Hector's lawnmower; Bxyz: x borrows y from z),Blgh,Bhlg,Bglh,Bghl,C
 3 | " Select the best English interpretation of the given arguments in predicate logic.
 4 | Dm
 5 | (∀x)(Wx ⊃ ~Dx)
 6 | (∀x)Wx ∨ Ag	/ (∃x)Ax",Marina is a dancer. Some weaklings are not dancers. Either everything is a weakling or Georgia plays volleyball. So something plays volleyball.,Marina is a dancer. No weakling is a dancer. Everything is either a weakling or plays volleyball. So something plays volleyball.,Marina is a dancer. Some weaklings are not dancers. Everything is either a weakling or plays volleyball. So something plays volleyball.,Marina is a dancer. No weakling is a dancer. Either everything is a weakling or Georgia plays volleyball. So something plays volleyball.,D
 7 | " Construct a complete truth table for the following pairs of propositions. Then, using the truth tables, determine whether the statements are logically equivalent or contradictory. If neither, determine whether they are consistent or inconsistent. Justify your answers.
 8 | E ⊃ (F · E) and ~E · F",Logically equivalent,Contradictory,"Neither logically equivalent nor contradictory, but consistent",Inconsistent,C
 9 | " Which of the given formulas of PL is the best symbolization of the following sentence?
10 | Turtles live long lives and are happy creatures, unless they are injured.",(L • H) ≡ I,(L • H) ∨ I,L • (H ∨ I),L • (H ⊃ R),B
11 | 


--------------------------------------------------------------------------------
/eval/mmlu/dev/global_facts_dev.csv:
--------------------------------------------------------------------------------
1 | Which of the following pairs of statements are both true (as of 2019)?,People tend to be optimistic about their own future and the future of their nation or the world.,People tend to be optimistic about their own future but pessimistic about the future of their nation or the world.,People tend to be pessimistic about their own future but optimistic about the future of their nation or the world.,People tend to be pessimistic about their own future and the future of their nation or the world.,B
2 | "As of 2019, about what percentage of Americans agree that the state is run for the benefit of all the people?",31%,46%,61%,76%,B
3 | "As of 2015, since 1990 forests have ____ in Europe and have ____ in Africa and the Americas.","increased, increased","increased, decreased","decreased, increased","decreased, decreased",B
4 | "As of 2019, about what percentage of Russians say it is very important to have free media in our country without government/state censorship?",38%,53%,68%,83%,A
5 | "As of 2017, how many of the world’s 1-year-old children today have been vaccinated against some disease? *",80%,60%,40%,20%,A


--------------------------------------------------------------------------------
/eval/mmlu/dev/high_school_biology_dev.csv:
--------------------------------------------------------------------------------
1 | Which of the following is not a way to form recombinant DNA?,Translation,Conjugation,Specialized transduction,Transformation,A
2 | A mutation in a bacterial enzyme changed a previously polar amino acid into a nonpolar amino acid. This amino acid was located at a site distant from the enzyme’s active site. How might this mutation alter the enzyme’s substrate specificity?,By changing the enzyme’s pH optimum,By changing the enzyme’s location in the cell,By changing the shape of the protein,An amino acid change away from the active site cannot alter the enzyme’s substrate specificity.,C
3 | "In animal cells, which of the following represents the most likely pathway that a secretory protein takes as it is synthesized in a cell?",Plasma membrane–Golgi apparatus–ribosome–secretory vesicle–rough ER,Ribosome–Golgi apparatus–rough ER–secretory vesicle–plasma membrane,Plasma membrane–Golgi apparatus–ribosome–secretory vesicle–rough ER,Ribosome–rough ER–Golgi apparatus–secretory vesicle–plasma membrane,D
4 | Which of the following is not known to be involved in the control of cell division?,Cyclins,Protein kinases,Checkpoints,Fibroblast cells,D
5 | Homologous structures are often cited as evidence for the process of natural selection. All of the following are examples of homologous structures EXCEPT,the wings of a bird and the wings of a bat,the flippers of a whale and the arms of a man,the pectoral fins of a porpoise and the flippers of a seal,the forelegs of an insect and the forelimbs of a dog,D
6 | 


--------------------------------------------------------------------------------
/eval/mmlu/dev/high_school_chemistry_dev.csv:
--------------------------------------------------------------------------------
1 | Which of the following is considered an acid anhydride?,HCl,H2SO3,SO2,Al(NO3)3,C
2 | Which of the following is expected to be a polar molecule?,PCl4F,BF3,CO2,Si(CH3)4,A
3 | "From the solubility rules, which of the following is true?","All chlorides, bromides, and iodides are soluble",All sulfates are soluble,All hydroxides are soluble,All ammonium-containing compounds are soluble,D
4 | "A new compound is synthesized and found to be a monoprotic acid with a molar mass of 248 g/mol. When 0.0050 mol of this acid are dissolved in 0.500 L of water, the pH is measured as 3.89. What is the pKa of this acid?",3.89,7.78,5.78,2.33,C
5 | "A solution contains 2.00 mole of acetic acid, CH3COOH, and 1.00 mole of calcium acetate, Ca(CH3COO)2. The solution is able to resist the addition of a small amount of strong acid or strong base with only minor changes in the pH of the solution. Larger quantities of strong acid or strong base can cause a significant change in pH. How many moles of nitric acid, HNO3, may be added before the pH begins to change significantly?",0.500 mole,1.00 mole,2.00 mole,3.00 mole,C
6 | 


--------------------------------------------------------------------------------
/eval/mmlu/dev/high_school_computer_science_dev.csv:
--------------------------------------------------------------------------------
 1 | Which of the following is an example of the use of a device on the Internet of Things (IoT) ?,A car alerts a driver that it is about to hit an object.,A hiker uses a GPS watch to keep track of her position.,A refrigerator orders milk from an online delivery service when the milk in the refrigerator is almost gone.,A runner uses a watch with optical sensors to monitor his heart rate.,C
 2 | "Many Web browsers allow users to open anonymous windows. During a browsing session in an anonymous window, the browser does not record a browsing history or a list of downloaded files. When the anonymous window is exited, cookies created during the session are deleted. Which of the following statements about browsing sessions in an anonymous window is true?","The activities of a user browsing in an anonymous window will not be visible to people who monitor the user's network, such as the system administrator.",Items placed in a Web store's shopping cart for future purchase during the anonymous browsing session will not be saved on the user's computer.,A user will not be able to log in to e-mail or social media accounts during the anonymous browsing session.,A user browsing in an anonymous window will be protected from viruses launched from any web sites visited or files downloaded.,B
 3 | "What is the output of ""abc""[::-1] in Python 3?",Error,abc,cba,c,C
 4 | "In the program below, the initial value of x is 5 and the initial value of y is 10.
 5 |  IF (x < 0)
 6 |  {
 7 |  DISPLAY (""Foxtrot"")
 8 |  }
 9 |  ELSE
10 |  {
11 |  IF (x > y)
12 |  {
13 |    DISPLAY (""Hotel"")
14 |  }
15 |  ELSE 
16 |  {
17 |    IF (y > 0)
18 |    {
19 |    DISPLAY (""November"")
20 |    }
21 |    ELSE
22 |    {
23 |    DISPLAY (""Yankee"")
24 |    }
25 |  }
26 |  }
27 |  
28 |  What is displayed as a result of running the program?",Foxtrot,Hotel,November,Yankee,C
29 | "A list of numbers has n elements, indexed from 1 to n. The following algorithm is intended to display the number of elements in the list that have a value greater than 100. The algorithm uses the variables count and position. Steps 3 and 4 are missing.
30 |  Step 1: Set count to 0 and position to 1.
31 |  Step 2: If the value of the element at index position is greater
32 |  than 100, increase the value of count by 1.
33 |  Step 3: (missing step)
34 |  Step 4: (missing step)
35 |  Step 5: Display the value of count.
36 |  Which of the following could be used to replace steps 3 and 4 so that the algorithm works as intended?","Step 3: Increase the value of position by 1.
37 |  Step 4: Repeat steps 2 and 3 until the value of count is greater than 100.","Step 3: Increase the value of position by 1.
38 |  Step 4: Repeat steps 2 and 3 until the value of position is greater than n.","Step 3: Repeat step 2 until the value of count is greater than 100.
39 |  Step 4: Increase the value of position by 1.","Step 3: Repeat step 2 until the value of position is greater than n.
40 |  Step 4: Increase the value of count by 1.",D
41 | 


--------------------------------------------------------------------------------
/eval/mmlu/dev/high_school_geography_dev.csv:
--------------------------------------------------------------------------------
1 | The rate of natural increase of a population is found by subtracting the,crude death rate from the crude birth rate.,crude birth rate from the crude death rate.,doubling time from the crude birth rate.,fertility rate from the crude death rate.,A
2 | "During the third stage of the demographic transition model, which of the following is true?",Birth rates increase and population growth rate is less rapid.,Birth rates decline and population growth rate is less rapid.,Birth rates increase and population growth rate increases.,Birth rates decrease and population growth rate increases.,B
3 | Which of the following statements is NOT accurate regarding the services provided by local governments in the United States?,Duplication of efforts occurs often.,Social problems of the central city spill over into the surrounding residential suburbs.,Inefficiency in providing services occurs often.,One neighborhood's efforts to reduce pollution are always supported by neighboring communities.,D
4 | The practice of hiring a foreign third-party service provider to run an operation is called,outsourcing.,offshoring.,maquiladoras.,locational interdependence.,B
5 | Which one of the following items is an example of nonmaterial culture?,Dove soap,Dove candy bar,Dove symbol,A dove (bird),C
6 | 


--------------------------------------------------------------------------------
/eval/mmlu/dev/high_school_government_and_politics_dev.csv:
--------------------------------------------------------------------------------
1 | Uncertainty over the limits to presidential power is caused primarily by the fact that,the constitutional definition of those powers is broad and unspecific,most people agree that the Constitution places too many limits on presidential power,the Supreme Court consistently refuses to rule on cases concerning presidential powers,constitutional amendments have greatly increased presidential powers,A
2 | "The term ""budget deficit"" refers to the",annual increase in federal spending on the military,amount of interest on the national debt,difference between the initial budget proposals made by the president and Congress,amount the government spends in excess of its revenues,D
3 | "Which of the following cases established the precedent that a defendant must be informed of the right to remain silent, the right to a lawyer, and protection from self-incrimination?",Weeks v. United States,Betts v. Brady,Mapp v. Ohio,Miranda v. Arizona,D
4 | Which of the following statements about cabinet departments is FALSE?,They are established by the legislative branch.,Their members often don't have much influence over presidential decisions.,They cannot all be run by leaders who belong to the same political party the president does.,Not every federal agency is a cabinet department.,C
5 | Which of the following best states an argument made by James Madison in The Federalist number 10?,Honest politicians can prevent factions from developing.,Factions are more likely to occur in large republics than in small ones.,The negative effects of factionalism can be reduced by a republican government.,Free elections are the people's best defense against factionalism.,C
6 | 


--------------------------------------------------------------------------------
/eval/mmlu/dev/high_school_macroeconomics_dev.csv:
--------------------------------------------------------------------------------
1 | Which of the following is not included in the U.S. GDP?,The U.S. military opens a new base in a foreign country with 1000 U.S. personnel.,Japanese consumers buy thousands of CDs produced in the United States.,An American pop singer performs a sold-out concert in Paris.,A French theatrical production tours dozens of American cities.,C
2 | The short-run Phillips curve indicates a,direct relation between unemployment and inflation,direct relation between price and quantity demanded,inverse relation between price and quantity demanded,inverse relation between unemployment and inflation,D
3 | A federal deficit occurs when,exports exceed imports.,imports exceed exports.,federal tax collections exceed spending.,federal spending exceeds federal tax revenues.,D
4 | Holding all else equal which of the following monetary policies would be used to boost U.S. exports?,Increasing the discount rate,Increasing the reserve ratio,Buying government securities,Lowering tariffs,C
5 | Which of the following policies best describes supply-side fiscal policy?,An increase in the money supply,Increased government spending,Lower taxes on research and development of new technology,Higher taxes on household income,C
6 | 


--------------------------------------------------------------------------------
/eval/mmlu/dev/high_school_mathematics_dev.csv:
--------------------------------------------------------------------------------
1 | "Joe was in charge of lights for a dance. The red light blinks every two seconds, the yellow light every three seconds, and the blue light every five seconds. If we include the very beginning and very end of the dance, how many times during a seven minute dance will all the lights come on at the same time? (Assume that all three lights blink simultaneously at the very beginning of the dance.)",3,15,6,5,B
2 | "Five thousand dollars compounded annually at an $x\%$ interest rate takes six years to double. At the same interest rate, how many years will it take $\$300$ to grow to $\$9600$?",12,1,30,5,C
3 | "The variable $x$ varies directly as the square of $y$, and $y$ varies directly as the cube of $z$. If $x$ equals $-16$ when $z$ equals 2, what is the value of $x$ when $z$ equals $\frac{1}{2}$?",-1,16,-\frac{1}{256},\frac{1}{16},C
4 | Simplify and write the result with a rational denominator: $$\sqrt{\sqrt[3]{\sqrt{\frac{1}{729}}}}$$,\frac{3\sqrt{3}}{3},\frac{1}{3},\sqrt{3},\frac{\sqrt{3}}{3},D
5 | "Ten students take a biology test and receive the following scores: 45, 55, 50, 70, 65, 80, 40, 90, 70, 85. What is the mean of the students’ test scores?",55,60,62,65,D
6 | 


--------------------------------------------------------------------------------
/eval/mmlu/dev/high_school_microeconomics_dev.csv:
--------------------------------------------------------------------------------
1 | "In a competitive labor market for housepainters, which of the following would increase the demand for housepainters?",An effective minimum wage imposed on this labor market.,An increase in the price of gallons of paint.,An increase in the construction of new houses.,An increase in the price of mechanical painters so long as the output effect exceeds the substitution effect.,C
2 | "If the government subsidizes producers in a perfectly competitive market, then",the demand for the product will increase,the demand for the product will decrease,the consumer surplus will increase,the consumer surplus will decrease,C
3 | The concentration ratio for a monopoly is,0,5,10,100,D
4 | Which of the following is true of a price floor?,The price floor shifts the demand curve to the left.,An effective floor creates a shortage of the good.,The price floor shifts the supply curve of the good to the right.,"To be an effective floor, it must be set above the equilibrium price.",D
5 | Which of the following is necessarily a characteristic of oligopoly?,Free entry into and exit from the market,A few large producers,One producer of a good with no close substitutes,A homogenous product,B
6 | 


--------------------------------------------------------------------------------
/eval/mmlu/dev/high_school_physics_dev.csv:
--------------------------------------------------------------------------------
1 | Which of the following conditions will ensure that angular momentum is conserved? I. Conservation of linear momentum II. Zero net external force III. Zero net external torque,I and II only,I and III only,II and III only,III only,D
2 | "A pipe full of air is closed at one end. A standing wave is produced in the pipe, causing the pipe to sound a note. Which of the following is a correct statement about the wave’s properties at the closed end of the pipe?","The pressure is at a node, but the particle displacement is at an antinode.","The pressure is at an antinode, but the particle displacement is at a node.",The pressure and the particle displacement are both at nodes.,The pressure and the particle displacement are both at antinodes.,B
3 | "A photocell of work function ϕ = 2eV is connected to a resistor in series. Light of frequency f = 1 × 10^15 Hz hits a metal plate of the photocell. If the power of the light is P = 100 W, what is the current through the resistor?",2 A,6 A,12 A,24 A,D
4 | "A microwave oven is connected to an outlet, 120 V, and draws a current of 2 amps. At what rate is energy being used by the microwave oven?",10 W,30 W,60 W,240 W,D
5 | "A point charge, Q = +1 mC, is fixed at the origin. How much work is required to move a charge, Q = +8 µC, from the point (0, 4 meters) to the point (3 meters, 0)?",3.5 J,6.0 J,22.5 J,40 J,B


--------------------------------------------------------------------------------
/eval/mmlu/dev/high_school_psychology_dev.csv:
--------------------------------------------------------------------------------
1 | Ani believes that her attitudes and behavior play a central role in what happens to her. Such a belief is likely to be associated with,a strong superego.,low self-esteem.,low self-efficacy.,an internal locus of control.,D
2 | "According to Caplan's model of consultee-centered case consultation, the consultant is primarily interested in",identifying the causes and solutions of the client's presenting problems,identifying and eliminating the causes of the consultee's difficulties in handling a problem,establishing a hierarchy of authority to enable effective decision making,"presenting a single, well-defined and unambiguous course of action for the consultant to overcome skills deficits",B
3 | "While swimming in the ocean, Ivan is frightened by a dark shadow in the water even before he has the chance to identify what the shadow is. The synaptic connections taking place during this incident of fright are best described by which of the following?",Messages are sent from the thalamus directly to the amygdala.,"Messages are sent from the thalamus to the ""what"" and ""where"" pathways.",Messages are sent from the parasympathetic nervous system to the cerebral cortex.,Messages are sent from the frontal lobes to the pituitary gland.,A
4 | "According to the Individuals with Disabilities Education Improvement Act, which of the following must an educational agency do before it changes the educational placement of a student with a disability?",Give the child a trial period in the new environment,Notify the parents in writing,Obtain school board approval,Obtain parental consent,B
5 | Pascale is interested in the processing strategies children use to learn new information. Pascale would best be classified as what type of psychologist?,sociocultural,clinical,cognitive,behaviorist,C


--------------------------------------------------------------------------------
/eval/mmlu/dev/high_school_statistics_dev.csv:
--------------------------------------------------------------------------------
1 | Which of the following is a correct statement about correlation?,"If the slope of the regression line is exactly 1, then the correlation is exactly 1.","If the correlation is 0, then the slope of the regression line is undefined.",Switching which variable is called x and which is called y changes the sign of the correlation.,The correlation r is equal to the slope of the regression line when z-scores for the y-variable are plotted against z-scores for the x-variable.,D
2 | "Suppose X and Y are random variables with E(X) = 37, var(X) = 5, E(Y) = 62, and var(Y) = 12. What are the expected value and variance of the random variable X + Y?","E(X + Y) = 99, var(X + Y) = 8.5","E(X + Y) = 99, var(X + Y) = 13","E(X + Y) = 99, var(X + Y) = 17",There is insufficient information to answer this question.,D
3 | "After a frost warning was issued, the owner of a large orange grove asked his workers to spray all his trees with water. The water was supposed to freeze and form a protective covering of ice around the orange blossom. Nevertheless, the owner suspected that some trees suffered considerable damage due to the frost. To estimate the proportion of trees that suffered more than 50 percent damage due to the frost, he took a random sample of 100 trees from his grove. What is the response variable in this experiment?",The proportion of trees that suffered more than 50 percent damage due to frost.,The number of trees affected by the frost.,The number of trees sampled from the grove.,"For each sampled tree, whether it suffered more than 50 percent damage or at most 50 percent damage.",D
4 | "A new smartwatch is manufactured in one part of a factory, then secured for shipping in another, independent part of the factory. The weight of the smartwatch has a mean of 62 grams and a standard deviation of 1.0 grams. The weight of the packaging (box, user's guide, bubble wrap, etc.) has a mean of 456 grams and a standard deviation of 6 grams. Together, the distribution of the weight of the smartwatch and its packaging would have the following mean and standard deviation:",Mean 518 grams; standard deviation 7.0 grams,Mean 518 grams; standard deviation 3.5 grams,Mean 518 grams; standard deviation 6.1 grams,Mean 394 grams; standard deviation 6.1 grams,C
5 | "Which of the following sets has the smallest standard deviation? Which has the largest?
6 | I: {1,2,3}
7 | II: {-10,10}
8 | III: {100}
9 | ","I, II","II, III","III, I","III, II",D


--------------------------------------------------------------------------------
/eval/mmlu/dev/high_school_us_history_dev.csv:
--------------------------------------------------------------------------------
 1 | "This question refers to the following information.
 2 | ""Society in every state is a blessing, but government even in its best state is but a necessary evil; in its worst state an intolerable one; for when we suffer, or are exposed to the same miseries by a government, which we might expect in a country without government, our calamity is heightened by reflecting that we furnish the means by which we suffer. Government, like dress, is the badge of lost innocence; the palaces of kings are built on the ruins of the bowers of paradise. For were the impulses of conscience clear, uniform, and irresistibly obeyed, man would need no other lawgiver; but that not being the case, he finds it necessary to surrender up a part of his property to furnish means for the protection of the rest; and this he is induced to do by the same prudence which in every other case advises him out of two evils to choose the least. Wherefore, security being the true design and end of government, it unanswerably follows that whatever form thereof appears most likely to ensure it to us, with the least expense and greatest benefit, is preferable to all others.""
 3 | Thomas Paine, Common Sense, 1776
 4 | Which of the following ""miseries"" alluded to above were most condemned by Anti-Federalists of the post-Revolutionary era?",Organized response to Bacon's Rebellion,Federal response to Shays's Rebellion,Federal response to the Whiskey Rebellion,Federal response to Pontiac's Rebellion,C
 5 | "This question refers to the following information.
 6 | ""As our late Conduct at the Conestoga Manor and Lancaster have occasioned much Speculation & a great diversity of Sentiments in this and neighboring Governments; some vindicating & others condemning it; some charitably alleviating the Crime, & others maliciously painting it in the most odious & detestable Colours, we think it our duty to lay before the Publick, the whole Matter as it appeared, & still appears, to us. . . .
 7 | ""If these things are not sufficient to prove an unjustifiable Attachment in the Quakers to the Indians Savages, a fixed Resolution to befriend them & an utter insensibility to human Distresses, let us consider a few more recent Facts. When we found the last Summer that we were likely to get no Assistance from the Government, some Volunteers went out at our own Expense, determined to drive our Enemies from our Borders; & when we came near to the great Island, we understood that a Number of their Warriors had gone out against our Frontiers. Upon this we returned and came up with them and fought with them at the Munfey Hill where we lost some of our Men & killed some of their Warriors & thereby saved our Frontiers from this Story in another Expedition. But no sooner had we destroyed their Provisions on the great Island, & ruined their trade with the good People at Bethlehem, but these very Indians, who were justly suspected of having murdered our Friends in Northampton County, were by the Influence of some Quakers taken under the Protection of the Government to screen them from the Resentments of the Friends and Relations of the Murdered, & to support them thro the Winter.""
 8 | —""Apology of the Paxton Boys"" (pamphlet), 1764 (Note: ""apology"" in this context should be read as an explanation, not an admission of guilt or regret.)
 9 | The sentiments expressed in the explanation above reflect which of the ongoing tensions during the colonial period of American history?",Tensions between British policies and the aspirations of North American colonists.,Tensions between American Indians allied with the French and those allied with the British.,Tensions between freed African Americans and white planters.,Tensions between backcountry settlers and elites within colonial America.,D
10 | "This question refers to the following information.
11 | ""In the new Code of Laws which I suppose it will be necessary for you to make I desire you would Remember the Ladies, and be more generous and favorable to them than your ancestors. Do not put such unlimited power into the hands of the Husbands. Remember all Men would be tyrants if they could. If particular care and attention is not paid to the Ladies we are determined to foment a Rebellion, and will not hold ourselves bound by any Laws in which we have no voice, or Representation.""
12 | Abigail Adams, in a letter to John Adams, 1776
13 | ""Special legislation for woman has placed us in a most anomalous position. Women invested with the rights of citizens in one section—voters, jurors, office-holders—crossing an imaginary line, are subjects in the next. In some States, a married woman may hold property and transact business in her own name; in others, her earnings belong to her husband. In some States, a woman may testify against her husband, sue and be sued in the courts; in others, she has no redress in case of damage to person, property, or character. In case of divorce on account of adultery in the husband, the innocent wife is held to possess no right to children or property, unless by special decree of the court. But in no State of the Union has the wife the right to her own person, or to any part of the joint earnings of the co-partnership during the life of her husband. In some States women may enter the law schools and practice in the courts; in others they are forbidden. In some universities girls enjoy equal educational advantages with boys, while many of the proudest institutions in the land deny them admittance, though the sons of China, Japan and Africa are welcomed there. But the privileges already granted in the several States are by no means secure.""
14 | Susan B. Anthony, ""Declaration of Rights for Women,"" July 4, 1876
15 | The sentiments expressed in the second excerpt by Susan B. Anthony are most likely in support of",the Equal Rights Amendment,universal suffrage,states' rights,prohibition,B
16 | "This question refers to the following information.
17 | Our leaders talk about stopping aggression from the north, but this was a struggle among groups of Vietnamese until we intervened. We seem bent upon saving the Vietnamese from Ho Chi Minh even if we have to kill them and demolish their country to do it. As the native people survey bombed-out villages, women and children burned by napalm, rice crops destroyed and cities overrun with our military personnel, they are doubtless saying secretly of the Vietcong guerillas and of the American forces, ""A plague on both your houses."" … Stop the bombing, north and south, end search and destroy offensive sweeps, and confine our military action to holding operations on the ground. Bombing the north has failed to halt or seriously check the flow of troops to the south and may, in fact, have prompted a much greater war effort by Hanoi.
18 | —Senator George McGovern, ""The Lessons of Vietnam,"" April 25, 1967
19 | Which of the following opinions from the 1960s most directly reflects the perspective of George McGovern's speech?",Americans must maximize their technological edge in Vietnam.,American bombing in Vietnam is step by step leading to progress in the war.,American bombing in Vietnam is a failure.,America must not give in to defeatism about the war in Vietnam.,C
20 | "This question refers to the following information.
21 | I come not to urge personal claims, nor to seek individual benefits; I appear as the advocate of those who cannot plead their own cause; I come as the friend of those who are deserted, oppressed, and desolate. In the Providence of God, I am the voice of the maniac whose piercing cries from the dreary dungeons of your jails penetrate not your Halls of Legislation. I am the Hope of the poor crazed beings who pine in the cells, and stalls, and cages, and waste rooms of your poor-houses. I am the Revelation of hundreds of wailing, suffering creatures, hidden in your private dwellings, and in pens and cabins—shut out, cut off from all healing influences, from all mind-restoring cares.… Could their melancholy histories be spread before you as revealed to my grieved spirit during the last three months, how promptly, how earnestly would you search out the most approved means of relief; how trifling, how insignificant, by comparison, would appear the sacrifices you are asked to make; how would a few dimes and dollars, gathered from each citizen, diminish in value as a possession, compared with the certain benefits and vast good to be secured for the suffering insane...by the consecration and application of a sufficient fund to the construction of a suitable hospital.…
22 | —Dorothea Dix, Memorial Soliciting a State Hospital for the Protection and Cure of the Insane,
23 | Submitted to the General Assembly of North Carolina, November 1848
24 | Dorothea Dix can best be compared to whom?",Abigail Adams,Clara Barton,Shirley Temple,Hillary Clinton,B
25 | 


--------------------------------------------------------------------------------
/eval/mmlu/dev/high_school_world_history_dev.csv:
--------------------------------------------------------------------------------
 1 | "This question refers to the following information.
 2 | He contains all works and desires and all perfumes and all tastes. He enfolds the whole universe and in silence is loving to all. This is the Spirit that is in my heart, this is Brahman. To him I shall come when I go beyond this life, and to him will come he who has faith and doubts not.
 3 | —The Upanishads, India, c. 1000 BCE
 4 | To which religion does the speaker most likely belong?",Hinduism,Buddhism,Shintoism,Zoroastrianism,A
 5 | "This question refers to the following information.
 6 | ""The struggle against neo-colonialism is not aimed at excluding the capital of the developed world from operating in less developed countries. It is aimed at preventing the financial power of the developed countries being used in such a way as to impoverish the less developed.
 7 | Non-alignment, as practiced by Ghana and many other countries, is based on co-operation with all States whether they be capitalist, socialist or have a mixed economy. Such a policy, therefore, involves foreign investment from capitalist countries, but it must be invested in accordance with a national plan drawn up by the government of the non-aligned State with its own interests in mind. The issue is not what return the foreign investor receives on his investments…The question is one of power. A State in the grip of neo-colonialism is not master of its own destiny.""
 8 | Kwame Nkrumah, Neo-Colonialism, 1965
 9 | Which of the following provides the best context for Nkrumah's writings?",The Industrial Revolution,Decolonization,Regional Free Trade Associations,Autarky,B
10 | "This question refers to the following information.
11 | ""The real grievance of the worker is the insecurity of his existence; he is not sure that he will always have work, he is not sure that he will always be healthy, and he foresees that he will one day be old and unfit to work. If he falls into poverty, even if only through a prolonged illness, he is then completely helpless, left to his own devices, and society does not currently recognize any real obligation towards him beyond the usual help for the poor, even if he has been working all the time ever so faithfully and diligently. The usual help for the poor, however, leaves a lot to be desired, especially in large cities, where it is very much worse than in the country.""
12 | Otto von Bismarck, 1884
13 | Otto von Bismarck likely made this speech in reaction to which of the following issues?",Social acceptance of child labor,Declining life expectancy in Germany,Criticisms of German trade tariffs,Negative effects attributed to industrial capitalism,D
14 | "This question refers to the following information.
15 | ""Indeed, as both the fatwas of distinguished [scholars] who base their opinion on reason and tradition alike and the consensus of the Sunni community agree that the ancient obligation of extirpation, extermination, and expulsion of evil innovation must be the aim of our exalted aspiration, for ""Religious zeal is a victory for the Faith of God the Beneficent""; then, in accordance with the words of the Prophet (Peace upon him!) ""Whosoever introduces evil innovation into our order must be expelled"" and ""Whosoever does aught against our order must be expelled,"" action has become necessary and exigent…""
16 | Letter from Ottoman Sultan Selim I to Safavid Shah Ismail I, 1514
17 | The letter from Selim I is most clearly an example of which of the following?",The maintenance of military supremacy at all costs,Expanding tensions between religious sects,Factors that brought about the collapse of the Ottoman Empire,Peacemaking efforts among the Islamic empires,B
18 | "This question refers to the following information.
19 | ""At least one of the [world's] societies would have to somehow enormously increase its productivity [in order to achieve global hegemony]. That quantum jump would have to be made before the various scientific, technological, agricultural, and industrial revolutions on which our post-quantum-leap world rests. It could only be accomplished by exploiting the ecosystems, mineral resources, and human assets of whole continents outside the lands of the society making the jump. Western Europe did just that by means of its brutality and guns and, more important, by geographical and ecological luck.""
20 | Copyright © 2015 Cambridge University Press.
21 | Alfred Crosby, historian, Ecological Imperialism, 2004
22 | The ""quantum jump"" mentioned in the passage most directly contributed to which of the following developments in the period 1450–1750 C.E.?",A breakdown in trade routes through the collapse of the established state structure,An increase in the population of the world through more plentiful supplies of food,The spread of Chinese and Indian belief systems across the world,An increase in social unrest,B
23 | 


--------------------------------------------------------------------------------
/eval/mmlu/dev/human_aging_dev.csv:
--------------------------------------------------------------------------------
1 | "Which of the following persons is more likely to remain at home alone, as of 2019?",An Asian man or woman,A Hispanic man,An African American woman,A white man or woman,C
2 | The finding that adults tend to remember events from their adolescence better than from other periods in their lives is referred to as the,Adolescence advantage,Reminiscence bump,Memorial memorial,Quadratic retrieval spike,B
3 | "When older adults move to a new state after retirement, which of the following is the more likely destination?",Texas,California,Hawaii,Vermont,A
4 | Which element in tobacco smoke is responsible for cancers?,Nicotine,Tar,Carbon monoxide,Smoke particles,B
5 | "All other things being equal, which of the following persons is more likely to show osteoporosis?",An older Hispanic American woman,An older African American woman,An older Asian American woman,An older Native American woman,C


--------------------------------------------------------------------------------
/eval/mmlu/dev/human_sexuality_dev.csv:
--------------------------------------------------------------------------------
1 | Morning sickness is typically a problem:,during the first trimester,during the second trimester,during the third trimester,all through the pregnancy,A
2 | "A woman who knows she has active herpes and untreated syphilis but continues to have sex without informing her partners of her condition has, in psychoanalytic terms:",a strong ego,a weak superego,a weak id,a strong superego,B
3 | Women's ability to have multiple orgasms is primarily due to:,the fact that they do not have a refractory period.,the response of the inner layers of the vagina.,having alternating orgasms in different locations.,the G-Spot.,A
4 | The nature of homosexual activities that occur during preadolescence include all but which of the following?,sexual intercourse,circle jerks,exhibitionism,touching each other's genitals,A
5 | The most common disorder among men who seek sexual therapy is:,premature ejaculation,inhibited ejaculation,erectile disorder,ejaculatory disorder,C
6 | 


--------------------------------------------------------------------------------
/eval/mmlu/dev/international_law_dev.csv:
--------------------------------------------------------------------------------
1 | What types of force does Article 2(4) of the UN Charter prohibit?,Article 2(4) encompasses only armed force,"Article 2(4) encompasses all types of force, including sanctions",Article 2(4) encompasses all interference in the domestic affairs of States,Article 2(4) encompasses force directed only against a State's territorial integrity,A
2 | What is the judge ad hoc?,"If a party to a contentious case before the ICJ does not have a national sitting as judge, it is entitled to nominate someone as a judge solely for that case, with the title of judge ad hoc",Judge ad hoc is the member of the bench of the ICJ with a casting vote,"Judge ad hoc is a surrogate judge, in case a judge is disqualified or passes away",Judge ad hoc is the judge that each party will always nominate in every contentious case,A
3 | Would a reservation to the definition of torture in the ICCPR be acceptable in contemporary practice?,This is an acceptable reservation if the reserving country's legislation employs a different definition,This is an unacceptable reservation because it contravenes the object and purpose of the ICCPR,This is an unacceptable reservation because the definition of torture in the ICCPR is consistent with customary international law,This is an acceptable reservation because under general international law States have the right to enter reservations to treaties,B
4 | When 'consent' can serve as a circumstance precluding the wrongfulness of a State conduct?,Consent can serve as a circumstance precluding the wrongfulness whenever it is given,Consent can never serve as a circumstance precluding wrongfulness,"Consent can serve as a circumstance precluding wrongfulness, provided the consent is valid and to the extent that the conduct remains within the limits of the consent given","Consent can always serve as a circumstance precluding wrongfulness, no matter which organ of the State gives it",C
5 | How the consent to be bound of a State may be expressed?,The consent of a State to be bound is expressed only by ratification,"The consent of a state to be bound by a treaty may be expressed by signature, ratification, acceptance, approval or accession",The consent of a State to be bound is expressed by signature,The consent of a State to be bound is expressed by whatever means they choose,B
6 | 


--------------------------------------------------------------------------------
/eval/mmlu/dev/jurisprudence_dev.csv:
--------------------------------------------------------------------------------
1 | Which position does Rawls claim is the least likely to be adopted by the POP (people in the original position)?,The POP would choose equality above liberty.,The POP would opt for the 'maximin' strategy.,The POP would opt for the 'difference principle'.,The POP would reject the 'system of natural liberty.',A
2 | Functions of the law include all but which of the following?,maximizing individual freedom,providing a basis for compromise,keeping the peace,promoting the principles of the free enterprise system,D
3 | Which word best summarizes Weber's explanation of the development of formally rational law?,Authority.,Charisma.,Co-operation.,Capitalism.,D
4 | "The ________ School of jurisprudence postulates that the law is based on what is ""correct.""",Natural Law,Analytical,Historical,Sociological,A
5 | "Iverson Jewelers wrote a letter to Miller, 'We have received an exceptionally fine self winding Rolox watch which we will sell to you at a very favorable price.'",The letter is an offer to sell,A valid offer cannot be made by letter.,The letter contains a valid offer which will terminate within a reasonable time.,The letter lacks one of the essential elements of an offer.,D
6 | 


--------------------------------------------------------------------------------
/eval/mmlu/dev/logical_fallacies_dev.csv:
--------------------------------------------------------------------------------
1 | "If someone attacks the character of an opposing arguer, instead of responding to that opponent's arguments, the first person has probably committed which of the following fallacies?",tu quoque,horse laugh,argument against the person,ignoratio elenchi,C
2 | The complex question fallacy consists of,arguing something is inferior just because it doesn't do something it was never intended to do.,including more than one claim in the proposition and treating proof for one claim as proof for all the claims.,"drawing a conclusion before examining the evidence, and only considering evidence that supports that conclusion.","asking a question that includes either an unproven assumption or more than one question, thus making a straightforward yes or no answer meaningless.",D
3 | Which of the following is true of a valid categorical syllogism?,The minor premise must deny the antecedent,The major premise must affirm the consequent,The middle term must be used in at least one premise in a universal or unqualified sense,All of the above,C
4 | Arguing that what is true of the parts must be true of the whole is the fallacy of...,Division,Composition,Appeal to the person,Appeal to ignorance,B
5 | "When an arguer causes confusion during refutation because of real or feigned lack of an ability to engage in refutation, that arguer may have committed the fallacy of",poor sportsmanship,appeal to compassion,argument against the person,ignorance of refutation,D
6 | 


--------------------------------------------------------------------------------
/eval/mmlu/dev/machine_learning_dev.csv:
--------------------------------------------------------------------------------
1 | "A 6-sided die is rolled 15 times and the results are: side 1 comes up 0 times; side 2: 1 time; side 3: 2 times; side 4: 3 times; side 5: 4 times; side 6: 5 times. Based on these results, what is the probability of side 3 coming up when using Add-1 Smoothing?",2.0/15,1.0/7,3.0/16,1.0/5,B
2 | Which image data augmentation is most common for natural images?,random crop and horizontal flip,random crop and vertical flip,posterization,dithering,A
3 | "You are reviewing papers for the World’s Fanciest Machine Learning Conference, and you see submissions with the following claims. Which ones would you consider accepting? ",My method achieves a training error lower than all previous methods!,My method achieves a test error lower than all previous methods! (Footnote: When regularisation parameter λ is chosen so as to minimise test error.),My method achieves a test error lower than all previous methods! (Footnote: When regularisation parameter λ is chosen so as to minimise cross-validaton error.),My method achieves a cross-validation error lower than all previous methods! (Footnote: When regularisation parameter λ is chosen so as to minimise cross-validaton error.),C
4 | "To achieve an 0/1 loss estimate that is less than 1 percent of the true 0/1 loss (with probability 95%), according to Hoeffding's inequality the IID test set must have how many examples?",around 10 examples,around 100 examples,between 100 and 500 examples,more than 1000 examples,D
5 | "Traditionally, when we have a real-valued input attribute during decision-tree learning we consider a binary split according to whether the attribute is above or below some threshold. Pat suggests that instead we should just have a multiway split with one branch for each of the distinct values of the attribute. From the list below choose the single biggest problem with Pat’s suggestion:",It is too computationally expensive.,It would probably result in a decision tree that scores badly on the training set and a testset.,It would probably result in a decision tree that scores well on the training set but badly on a testset.,It would probably result in a decision tree that scores well on a testset but badly on a training set.,C


--------------------------------------------------------------------------------
/eval/mmlu/dev/management_dev.csv:
--------------------------------------------------------------------------------
1 | What are the two main dimensions of the Ohio Studies into leadership?,Starting position and end position,Initial environment and changed environment,Organisational structure and conditioning,Initiating structure and considerations,D
2 | Hygiene factors are associated with which writer?,Frederick Hertzberg,D.C. McClelland,Abraham Maslow,Douglas McGregor,A
3 | Which element of the cultural web forms regalia?,Symbols,Rituals and routines,Power structures,Control systems,A
4 | What characteristic is not a key feature of the 'open systems' model of management?,Morale,Innovation,Growth resource,Adaptation,A
5 | How can organisational structures that are characterised by democratic and inclusive styles of management be described?,Hierarchical,Bureaucratic,Flat,Functional,C
6 | 


--------------------------------------------------------------------------------
/eval/mmlu/dev/marketing_dev.csv:
--------------------------------------------------------------------------------
1 |  _____________ is a natural outcome when combining demographic and geographic variables.,Geodemographics,Product differentiation.,ANSOFF matrix.,Brand management.,A
2 | "In an organization, the group of people tasked with buying decisions is referred to as the _______________.",Outsourcing unit.,Procurement centre.,Chief executive unit.,Decision-making unit.,D
3 |  Which of the following is an assumption in Maslow's hierarchy of needs?,Needs are dependent on culture and also on social class.,Lower-level needs must be at least partially satisfied before higher needs can affect behaviour.,Needs are not prioritized or arranged in any particular order.,"Satisfied needs are motivators, and new needs emerge when current needs remain unmet.",B
4 | The single group within society that is most vulnerable to reference group influence is:,The older consumer who feels somewhat left out of things.,"The married women, many of whom feel a need for stability in their lives.",New immigrants who really want to assimilate into their new culture.,"Children, who base most of their buying decisions on outside influences.",D
5 | "Although the content and quality can be as controlled as direct mail, response rates of this medium are lower because of the lack of a personal address mechanism. This media format is known as:",Care lines.,Direct mail.,Inserts.,Door to door.,D
6 | 


--------------------------------------------------------------------------------
/eval/mmlu/dev/medical_genetics_dev.csv:
--------------------------------------------------------------------------------
1 | Large triplet repeat expansions can be detected by:,polymerase chain reaction.,single strand conformational polymorphism analysis.,Southern blotting.,Western blotting.,C
2 | DNA ligase is,an enzyme that joins fragments in normal DNA replication,an enzyme of bacterial origin which cuts DNA at defined base sequences,an enzyme that facilitates transcription of specific genes,an enzyme which limits the level to which a particular nutrient reaches,A
3 | A gene showing codominance,has both alleles independently expressed in the heterozygote,has one allele dominant to the other,has alleles tightly linked on the same chromosome,has alleles expressed at the same time in development,A
4 | Which of the following conditions does not show multifactorial inheritance?,Pyloric stenosis,Schizophrenia,Spina bifida (neural tube defects),Marfan syndrome,D
5 | The stage of meiosis in which chromosomes pair and cross over is:,prophase I,metaphase I,prophase II,metaphase II,A
6 | 


--------------------------------------------------------------------------------
/eval/mmlu/dev/miscellaneous_dev.csv:
--------------------------------------------------------------------------------
1 | How many axles does a standard automobile have?,one,two,four,eight,B
2 | What place is named in the title of the 1979 live album by rock legends Cheap Trick?,Budapest,Budokan,Bhutan,Britain,B
3 | Who is the shortest man to ever win an NBA slam dunk competition?,Anthony 'Spud' Webb,Michael 'Air' Jordan,Tyrone 'Muggsy' Bogues,Julius 'Dr J' Erving,A
4 | What is produced during photosynthesis?,hydrogen,nylon,oxygen,light,C
5 | Which of these songs was a Top 10 hit for the rock band The Police?,'Radio Ga-Ga','Ob-la-di Ob-la-da','De Do Do Do De Da Da Da','In-a-Gadda-Da-Vida',C
6 | 


--------------------------------------------------------------------------------
/eval/mmlu/dev/moral_disputes_dev.csv:
--------------------------------------------------------------------------------
1 | "According to Metz, what is wrong with consequentialist arguments against capital punishment based on African values?",It is unclear as of yet whether or not capital punishment deters harm to the community.,It is unclear as of yet whether or not capital punishment deters harm to any individuals.,Consequentialism is not supported by African values.,"Even though consequentialism is supported by African values, no consequentialist arguments framed in terms of African values have been offered.",A
2 | A fertilized ovum is also known as,a zygote.,an embryo.,a viability.,a blastocyst.,A
3 | "Which of the following is an intrinsic good, according to Singer?",being in the upper class,the ability to learn and solve problems,being more beautiful than most of the population,being shorter than average,B
4 | "Pence compares six different cases of reproduction, from natural twinning to SCNT. What conclusion does he draw from this comparison?",SCNT is not a different kind of reproduction because there are no morally relevant differences between it and other permissible means of reproduction.,"Because there is a low risk of harm for natural twinning, there will be a low risk of harm for SCNT.",both A and B,neither A nor B,A
5 | "Baron admits that the versions of the ticking bomb hypothetical she discusses are ""stunningly stupid,"" but she claims this is actually evidence of",the stupidity of most traditional philosophical examples.,a general lack of intelligence among people with advanced degrees.,the wrongness of torture.,the readiness on the part of many intelligent people to see torture as the best solution to deal with terrorism.,D


--------------------------------------------------------------------------------
/eval/mmlu/dev/moral_scenarios_dev.csv:
--------------------------------------------------------------------------------
1 | "For which of these two scenarios does the main character (who uses I/me/my) do something clearly morally wrong, according to ordinary moral standards in the US as of 2020? Scenario 1 | I really loved my newborn baby boy. Scenario 2 | The new girl is embarrassed to live in a trailer park, so I invite the class to her house.","Wrong, Wrong","Wrong, Not wrong","Not wrong, Wrong","Not wrong, Not wrong",C
2 | "For which of these two scenarios does the main character (who uses I/me/my) do something clearly morally wrong, according to ordinary moral standards in the US as of 2020? Scenario 1 | I asked for a cigarette from the guy outside the store. Scenario 2 | When the security guard left his post, I pocketed a rare coin.","Wrong, Wrong","Wrong, Not wrong","Not wrong, Wrong","Not wrong, Not wrong",C
3 | "For which of these two scenarios does the main character (who uses I/me/my) do something clearly morally wrong, according to ordinary moral standards in the US as of 2020? Scenario 1 | I bought an old pick-up truck for work. Scenario 2 | I looked up at the gorgeous waterfall and threw a rock at it.","Wrong, Wrong","Wrong, Not wrong","Not wrong, Wrong","Not wrong, Not wrong",D
4 | "For which of these two scenarios does the main character (who uses I/me/my) do something clearly morally wrong, according to ordinary moral standards in the US as of 2020? Scenario 1 | During the house's demolition, I pushed him into a wall. Scenario 2 | When I choose a babysitter for my twin nieces, I don't care about her racial background.","Wrong, Wrong","Wrong, Not wrong","Not wrong, Wrong","Not wrong, Not wrong",B
5 | "For which of these two scenarios does the main character (who uses I/me/my) do something clearly morally wrong, according to ordinary moral standards in the US as of 2020? Scenario 1 | I took his wallet after he harassed me at the bar. Scenario 2 | I made love to my newborn baby boy.","Wrong, Wrong","Wrong, Not wrong","Not wrong, Wrong","Not wrong, Not wrong",A


--------------------------------------------------------------------------------
/eval/mmlu/dev/nutrition_dev.csv:
--------------------------------------------------------------------------------
 1 | "In a cohort study, the risk ratio of developing diabetes was 0.86 when comparing consumers of tea (the exposed) to those who did not drink tea (the unexposed). Which one statement is correct (according to knowledge in 2020)?
 2 | ",The tea drinkers have lower risk of developing diabetes.,The tea drinkers have higher risk of developing diabetes.,Based on the information given we cannot tell if the observed difference in disease risk is the result of chance.,"The risk ratio is close to the value one, so there is no difference in disease risk between the two groups.",C
 3 | "Which of the following statements is correct (according to knowledge in 2020)?
 4 | ",Consumers with phenylketonuria must avoid the consumption of the sweetener aspartame,Consumers with phenylketonuria must avoid the consumption of the sweetener saccharin,Consumers with phenylketonuria must avoid the consumption of the sweetener sucralose,Consumers with phenylketonuria must avoid the consumption of the sweetener acesulfame K,A
 5 | "Which of the following is the most plausible explanation for the protective effect of dietary fibre against cancer of the colon, as of 2020?
 6 | ","Propionic acid, formed during colonic fibre fermentation inhibits liver fatty acid synthesis","Butyric acid, formed during colonic fibre fermentation stimulates ""silencing"" of the SLC5A8 tumour suppressor gene",None of these options are correct,"Butyric acid, formed during colonic fibre fermentation stimulates anti-oxidant defences in the colon",D
 7 | "Which of the following statements about iodine is correct, as of 2020?
 8 | ",50% of adults consume iodine at levels below the RNI,Dairy products are a poor source of iodine,The iodine content of organic milk is generally lower than the level in non-organic milk,UK dietary reference values recommend an increase in iodine intake in pregnancy,C
 9 | "What is the first-line drug for patients with type 2 diabetes and obesity, as of 2020?
10 | ",Acarbose,Metformin,Sulphonylureas,Insulin,B


--------------------------------------------------------------------------------
/eval/mmlu/dev/philosophy_dev.csv:
--------------------------------------------------------------------------------
1 | Psychological egoism is:,an ethical theory about how we ought to behave.,a generalization concerning the way people tend to behave.,a claim about human nature and the ways people are capable of behaving.,none of the above.,C
2 | "According to Moore’s “ideal utilitarianism,” the right action is the one that brings about the greatest amount of:",pleasure.,happiness.,good.,virtue.,C
3 | "According to d'Holbach, people always act according to _____.",free choices,dictates of the soul,necessary natural laws,undetermined will,C
4 | "Before Tolstoy's Christian conversion, what was his perspective on the meaning of life?",optimist,satisfied,nominally religious,pessimist,D
5 | "The study of reality in the broadest sense, an inquiry into the elemental nature of the universe and the things in it, is known as _____.",metaphysics,epistemology,quantum physics,axiology,A
6 | 


--------------------------------------------------------------------------------
/eval/mmlu/dev/prehistory_dev.csv:
--------------------------------------------------------------------------------
1 | The great Mayan king Pacal built temples in the city of Palenque in order to:,satisfy the powerful Mayan astronomer priests.,"display his generosity to the common people, since they were allowed to live in the temples.","frighten away enemies, in particular the Spaniards.","legitimize his kingship, since his father was not royal.",D
2 | "According to Timothy Pauketat, the evidence for social stratification and political power at Cahokia suggests:",a center of Mississippian civilization with conditions similar to the rise of early states.,the limitations of authority in a Native American society of egalitarian foragers.,a simple chiefdom or perhaps a complex chiefdom had evolved by A.D. 1500.,a center of Mississippian civilization with conditions similar to societies on the Northwest Coast of North America.,A
3 | Researchers now believe that the decline of the Maya was caused chiefly by:,"a cataclysm of some kind, such as an earthquake, volcano, or tsunami.",ecological degradation resulting from slash-and-burn farming techniques.,endless wars between neighboring Mayan city-states.,practices of interbreeding that led to a steep rise in congenital disorders.,B
4 | Recent research on hominid species dating from the Middle Pliocene indicates there was (as of 2020):,"a great amount of species diversity, or a single species that exhibited a lot of diversity.",very little species diversity during this period and very few hominids.,decreased species diversity due to a prolonged ice age followed by a severe drought.,"decreased species diversity but increased numbers of hammerstones and flakes, indicating stone tool manufacture.",A
5 | What is the approximate mean cranial capacity of Homo erectus?,under 650 cc,about 800 cc,just under 1000 cc,1200 cc,C


--------------------------------------------------------------------------------
/eval/mmlu/dev/professional_accounting_dev.csv:
--------------------------------------------------------------------------------
1 | Box a nongovernmental not-for-profit organization had the following transactions during the year: Proceeds from sale of investments $80000 Purchase of property plant and equipment $10000 Proceeds from long-term debt $100000 Loss on sale of investment $5000 What amount should be reported as net cash provided by financing activities in Box's statement of cash flows?,"$70,000","$75,000","$80,000",100000,D
2 | "One hundred years ago, your great-great-grandmother invested $100 at 5% yearly interest. What is the investment worth today?","$13,000",$600,"$15,000","$28,000",A
3 | "Krete is an unmarried taxpayer with income exclusively from wages. By December 31, year 1, Krete's employer has withheld $16,000 in federal income taxes and Krete has made no estimated tax payments. On April 15, year 2, Krete timely filed for an extension request to file her individual tax return, and paid $300 of additional taxes. Krete's year 1 tax liability was $16,500 when she timely filed her return on April 30, year 2, and paid the remaining tax liability balance. What amount would be subject to the penalty for underpayment of estimated taxes?",$0,$500,"$1,650","$16,500",A
4 | "On January 1, year 1, Alpha Co. signed an annual maintenance agreement with a software provider for $15,000 and the maintenance period begins on March 1, year 2. Alpha also incurred $5,000 of costs on January 1, year 1, related to software modification requests that will increase the functionality of the software. Alpha depreciates and amortizes its computer and software assets over five years using the straight-line method. What amount is the total expense that Alpha should recognize related to the maintenance agreement and the software modifications for the year ended December 31, year 1?","$5,000","$13,500","$16,000","$20,000",B
5 | An auditor traces the serial numbers on equipment to a nonissuer’s subledger. Which of the following management assertions is supported by this test?,Valuation and allocation,Completeness,Rights and obligations,Presentation and disclosure,B
6 | 


--------------------------------------------------------------------------------
/eval/mmlu/dev/professional_law_dev.csv:
--------------------------------------------------------------------------------
1 | "A state legislature has recently enacted a statute making it a misdemeanor to curse or revile or use obscene or opprobrious language toward or in reference to a police officer performing his duties. A student at a state university organized a demonstration on campus to protest the war. The rally was attended by a group of 50 students who shouted anti-war messages at cars passing by. To show his contempt for the United States, the student sewed the American flag to the rear of his jeans. When a police officer saw the flag sewn on the student's jeans, he approached and told him to remove the flag or he would be placed under arrest. The student became angered and shouted at the police officer, ""Listen, you bastard, I'll wear this rag anywhere I please. "" The student was subsequently placed under arrest and charged with violating the state statute. The student subsequently brings suit in state court challenging the constitutionality of the statute. The strongest constitutional argument for the student is that",the statute is void for vagueness under the Fourteenth Amendment's due process clause.,the statute is invalid because it violates the petitioner's freedom of speech under the First Amendment.,the statute is an abridgment of freedom of speech under the First Amendment because less restrictive means are available for achieving the same purpose.,the statute is overbroad and consequently invalid under the First and Fourteenth Amendments.,D
2 | "A state has recently enacted a statute prohibiting the disposal of any nuclear wastes within the state. This law does not contravene or conflict with any federal statutes. A man operates a company in the state that is engaged in the disposal of nuclear wastes. Subsequent to the passage of the state statute, the man, not yet aware of the new law, entered into contracts with many out-of-state firms to dispose of their nuclear wastes in the state. On account of this new law, however, the man will be unable to perform these contracts. Assume that the man has standing to challenge this state law. Which of the following presents his strongest constitutional grounds to challenge the state law prohibiting the disposal of nuclear wastes within the state?",The commerce clause.,The equal protection clause of the Fourteenth Amendment.,"The privileges and immunities clause of Article IV, Section 2. ",The contract clause.,A
3 | Judge took judicial notice of some facts at the beginning of the trial. Which of the following is not an appropriate kind of fact for judicial notice?,Indisputable facts.,Facts that have been asserted by individual political organizations.,Facts recognized to be true by common knowledge.,Facts capable of scientific verification.,B
4 | "On October 1, 1980, a developer, owner of several hundred acres in a rural county, drafted a general development plan for the area. The duly recorded plan imposed elaborate limitations and restrictions upon the land in the plan, which was to be developed as a residential district. The restrictions were to extend to all persons acquiring any of the lots and to their heirs, assigns, and lessees. It was further provided that all subsequent owners would be charged with due notice of the restrictions. Among those restrictions in the general plan were the following:(22) A franchise right is created in a strip of land 10 feet in width along the rear of each lot for the use of public utility companies with right of ingress and egress. (23) No house or structure of any kind shall be built on the aforementioned strip of land running through the said blocks. In 2000, a retiree purchased one of the lots, built a house, and erected a fence in the rear of his property within the restricted area. In 2004, a teacher purchased a lot adjacent to the retiree's property and built a new house. Two years later, a librarian purchased the lot that adjoined the teacher's property. The three deeds to those properties each contained references to the deed book where the general plan was recorded. In 2008, the librarian began the construction of a seven-foot post-and-rail fence along the line dividing his lot with the teacher's, and along the center of the area subject to the franchise right. Although the teacher objected to its construction, the fence was completed. If the teacher seeks a mandatory injunction to compel removal of the librarian's fence, the court will most likely","grant relief, because the fence was in violation of the easement restriction. ","grant relief, because the encroachment of the fence violated the restriction in the original plan. ","deny relief, because the teacher failed to enforce the restriction against the retiree. 
","deny relief, because the fence would not be construed as ""a structure"" within the terms of the restriction. ",B
5 | "A son owed a creditor $5,000. The son's father contacted the creditor and told him that he wanted to pay the son's debt. The father signed a document that stated the father would pay the son's debt at a rate of $500 a month for 10 months. The creditor made no written or oral commitment to forbear to sue the son to collect the $5,000 debt, and the father made no oral or written request for any such forbearance. For the next five months, the father made and the creditor accepted the $500 monthly payments as agreed. During that period, the creditor, in fact, did forbear to take any legal action against the son. However, the father then informed the creditor that he would make no further payments on the debt. Which of the following is the most persuasive argument that the father is liable to the creditor under the terms of their agreement?","The father's promise and the creditor's reliance thereon, if proved, gave rise to a valid claim by the creditor against the father based on the doctrine of promissory estoppel. ","Because it was foreseeable that the father's promise would induce the creditor to forbear taking any action against the son, such forbearance was, as a matter of law, a bargained-for consideration for the father's promise. ","The father's five payments to the creditor totaling $2,500 manifested a serious intent on the father's part to be contractually bound, and such manifestation is generally recognized as an effective substitute for consideration. ","By assuming the antecedent debt obligation that the son owed to the creditor, the father became a surety whose promise to the creditor was enforceable, since it was in writing and supported by adequate consideration. ",A
6 | 


--------------------------------------------------------------------------------
/eval/mmlu/dev/professional_medicine_dev.csv:
--------------------------------------------------------------------------------
1 | "A 42-year-old man comes to the office for preoperative evaluation prior to undergoing adrenalectomy scheduled in 2 weeks. One month ago, he received care in the emergency department for pain over his right flank following a motor vehicle collision. At that time, blood pressure was 160/100 mm Hg and CT scan of the abdomen showed an incidental 10-cm left adrenal mass. Results of laboratory studies, including complete blood count, serum electrolyte concentrations, and liver function tests, were within the reference ranges. The patient otherwise had been healthy and had never been told that he had elevated blood pressure. He takes no medications. A follow-up visit in the office 2 weeks ago disclosed elevated urinary normetanephrine and metanephrine and plasma aldosterone concentrations. The patient was referred to a surgeon, who recommended the adrenalectomy. Today, vital signs are temperature 36.6°C (97.9°F), pulse 100/min, respirations 14/min, and blood pressure 170/95 mm Hg. Physical examination discloses no significant findings. Initial preoperative preparation should include treatment with which of the following?",Labetalol,A loading dose of potassium chloride,Nifedipine,Phenoxybenzamine,D
2 | "A 36-year-old male presents to the office with a 3-week history of low back pain. He denies any recent trauma but says that he climbs in and out of his truck numerous times a day for his job. Examination of the patient in the prone position reveals a deep sacral sulcus on the left, a posterior inferior lateral angle on the right, and a lumbosacral junction that springs freely on compression. The most likely diagnosis is",left-on-left sacral torsion,left-on-right sacral torsion,right unilateral sacral flexion,right-on-right sacral torsion,D
3 | "A previously healthy 32-year-old woman comes to the physician 8 months after her husband was killed in a car crash. Since that time, she has had a decreased appetite and difficulty falling asleep. She states that she is often sad and cries frequently. She has been rechecking the door lock five times before leaving her house and has to count exactly five pieces of toilet paper before she uses it. She says that she has always been a perfectionist but these urges and rituals are new. Pharmacotherapy should be targeted to which of the following neurotransmitters?",Dopamine,Glutamate,Norepinephrine,Serotonin,D
4 | "A 44-year-old man comes to the office because of a 3-day history of sore throat, nonproductive cough, runny nose, and frontal headache. He says the headache is worse in the morning and ibuprofen does provide some relief. He has not had shortness of breath. Medical history is unremarkable. He takes no medications other than the ibuprofen for pain. Vital signs are temperature 37.4°C (99.4°F), pulse 88/min, respirations 18/min, and blood pressure 120/84 mm Hg. Examination of the nares shows erythematous mucous membranes. Examination of the throat shows erythema and follicular lymphoid hyperplasia on the posterior oropharynx. There is no palpable cervical adenopathy. Lungs are clear to auscultation. Which of the following is the most likely cause of this patient's symptoms?",Allergic rhinitis,Epstein-Barr virus,Mycoplasma pneumoniae,Rhinovirus,D
5 | A 22-year-old male marathon runner presents to the office with the complaint of right-sided rib pain when he runs long distances. Physical examination reveals normal heart and lung findings and an exhalation dysfunction at ribs 4-5 on the right. Which of the following muscles or muscle groups will be most useful in correcting this dysfunction utilizing a direct method?,anterior scalene,latissimus dorsi,pectoralis minor,quadratus lumborum,C
6 | 


--------------------------------------------------------------------------------
/eval/mmlu/dev/professional_psychology_dev.csv:
--------------------------------------------------------------------------------
1 | "One of your therapy clients asks your advice about a good weight-reduction program. You have investigated the programs in the community and are enrolled in the one you consider the best. This program offers a $50 bonus to its patrons for each new person they bring into the program. Under these circumstances, your most appropriate response would be to",tell your client the pros and cons of each program you know about except for the one in which you are enrolled,recommend to your client the program in which you are enrolled and explain the $50 bonus you will receive,recommend to your client the program in which you are enrolled and offer to have the $50 bonus credited to your client's account in the program,"tell your client the pros and cons of each program you know about, but do not claim the $50 bonus if your client enrolls in your program",D
2 | "There are three ways to measure the Central Tendency: the Mean, the Median and the Mode. From your knowledge about them, what is the mode?",less sensitive to extreme scores than the mean,more useful for skewed distributions,sensitive to extreme values and highly skewed distributions,the most frequently occurring number,D
3 | "In terms of Hofstede’s (1980) five cultural dimensions, the United States scores at the top of the scale on:",individualism.,individualism and power distance.,power distance and masculinity.,uncertainty avoidance.,A
4 | Carl Jung believed that a client's transference:,is a fantasy that distracts the client from reality.,represents “mixed feelings” toward the therapist. ,"""is a form of """"acting out.""""""",reflects the client’s personal and collective unconscious.,D
5 | "In the construction of a multiple regression equation for purposes of prediction, the optimal combination of measures is one in which the predictors",are uncorrelated with each other but are moderately correlated with the criterion,have low correlations with each other and low correlations with the criterion,are highly intercorrelated with each other and moderately correlated with the criterion,have low correlations with the criterion but are moderately correlated with each other,A
6 | 


--------------------------------------------------------------------------------
/eval/mmlu/dev/public_relations_dev.csv:
--------------------------------------------------------------------------------
1 | What should a public relations media practitioner do if she does not know the answer to a reporter's question?,Give the reporter other information she is certain is correct.,Say that the information is 'off the record' and will be disseminated later.,Say 'I don't know' and promise to provide the information later.,"Say 'no comment,' rather than appear uninformed.",C
2 | "In issues management, what is the most proactive approach to addressing negative or misleading information posted online about your organization?",Buy domain names that could be used by opposition groups.,Post anonymous comments on blogs to combat this information.,Prepare a news release that discredits the inaccurate information.,Make policy changes to address complaints highlighted on these sites.,D
3 | Which of these statements is true of the Vatican in 2010 at the time of the accusations of child abuse cover-ups?,There was a coordinated media response.,Consistent messages were communicated.,Criticisms were taken as attacks on the Catholic Church.,The credibility of the Vatican was upheld.,C
4 | At which stage in the planning process would a situation analysis be carried out?,Defining the program,Planning the program,Taking action and implementing ideas,Evaluation of the program,A
5 | Earth Hour was a campaign launched by which organization?,Greenpeace,The UN,Oxfam,World Wildlife Fund,D
6 | 


--------------------------------------------------------------------------------
/eval/mmlu/dev/security_studies_dev.csv:
--------------------------------------------------------------------------------
1 | What distinguishes coercive diplomacy from military force?,"Compellence is another term for coercive diplomacy, but covering a narrower set of criteria; compellence covers those threats aimed at initiating adversary action. A threat to coerce a state to give up part of its territory would count as coercive diplomacy, as long as that threat proactively initiates action before reactive diplomacy is taken.","Coercive diplomacy constitutes the threats of limited force to induce adversary's incentive to comply with the coercer's demands. It is an influence strategy that is intended to obtain compliance: the use of force to defeat an opponent first does not count. It leaves an element of choice with the target to comply, or to continue.","Military force, or the threat of military force, utilises fear to achieve strategic objectives. Coercive diplomacy is differentiated from this approach, because it does not use fear as a tool for coercing an adversary.","Coercive diplomacy is employed to use force but to limit its effects on the international community. Coercive diplomacy is an aggressive strategy that is intended to obtain compliance through defeat. It does not leave an element of choice with the target, the target either being forced to comply or engage in conflict. It seeks to control by imposing compliance by removing any opportunity for negotiation or concession.",B
2 | Which of the following is the best lens through which to investigate the role of child soldiers?,Child soldiers are victims of combat that need re-education and rehabilitation.,Children and their mothers are not active subjects in warfare and are best considered as subjects in the private sphere.,Children are most often innocent bystanders in war and are best used as signifiers of peace.,Children have political subjecthood that is missed when they are considered as passive victims of warfare.,D
3 | "In order to become securitized, a threat must be presented in which of these ways?","As an existential threat that requires immediate and extraordinary action, posing a threat to the survival of the state or to societal security.","As requiring immediate and extraordinary action by the state, threatening the survival of a referent object and therefore warranting the use of measures not normally employed in the political realm.","As an urgent threat to the survival of the referent object, so serious that it legitimises the employment of extraordinary action in response.",As an urgent threat to the survival of the audience that requires extraordinary or emergency measures.,C
4 | How can we best describe the relationship between the state-centric approach and the concept of human security?,There are such wide divisions within the human security framework regarding the nature of threats and referent objects that no widely applicable comparisons between state-centric approaches and human security can be drawn.,"By adopting the framework of human security, the limitations of the realist state-centric approach become evident. Whilst human security defines the referent object as the person or population, state-centric approaches prioritise the security of the state, de-prioritizing the pursuit of human security.","The state-centric approach to security is a faction of human security, usually defined within the broad school of human security. By being state-centric this approach prioritises the individual as the referent object in security studies.","Both the state-centric and human-centric approaches to security are mutually exclusive and offer a sufficient analytic framework with which to understand the international security system. It is therefore the role of security analysts to determine which of these substantial concepts is correct, and which should be discarded.",B
5 | What are the frameworks of analysis within which terrorism has been considered (as of 2020)?,"Competition between larger nations has resulted in some countries actively supporting terrorist groups to undermine the strength of rival states. Terrorist networks are extended patronage clubs maintained and paid for by their donor states and are conceptualised as being like state actors, to be dealt with using military force.","Globalization has enabled the internationalization of terrorist activities by opening up their operational space, although coordination is still managed from a geographical base. This suggests that terrorist groups are nationally structured which means that terrorism cannot be considered in terms of a war to be defeated militarily without having serious implications on the indigenous population.","Terrorism can be viewed as a problem to be resolved by military means (war on terrorism), by normal police techniques (terrorism as crime), or as a medical problem with underlying causes and symptoms (terrorism as disease).","Terrorism is viewed as a criminal problem. The criminalization of terrorism has two important implications. Firstly, it suggests that terrorism can be eradicated - terrorists can be caught and brought to trial by normal judicial proceedings thereby removing the threat from society - and secondly, it suggests that preventative crime techniques are applicable to prevent its development.",C


--------------------------------------------------------------------------------
/eval/mmlu/dev/sociology_dev.csv:
--------------------------------------------------------------------------------
1 | Which of the following did the post-war welfare state of 1948 not aim to provide:,free health care and education for all,a minimum wage,full employment,universal welfare,B
2 | What does Berger (1963) describe as a metaphor for social reality?,a fairground ride,a circus,a puppet theatre,a ballet,C
3 | The shift from 'civil religion' to 'common religion' means that:,the increasing bureaucracy of the state has made religion only a marginal part of our lives,"despite the weakening of traditional authority, our everyday lives and 'common sense' remain shaped by religious beliefs and values","religious participation in collective worship may have declined, but people still practise their faiths in private","people are much more likely to discuss their religious beliefs in public, informal settings",B
4 | The term 'hegemony' refers to:,the tendency for the working class not to realize their own interests,"a dominant ideology that legitimates economic, political and cultural power",a form of dual consciousness based on ideology and everyday experiences,a mode of payment given for outstanding topiary,B
5 | Which of the following is not a problem associated with official statistics on strike action?,most strikes go unnoticed by employers and the mass media,not all industrial disputes will be reported by the employer,the definition of strikes excludes those that involve fewer than ten workers or last less than one day,it is hard to compare strikes that were measured in different ways,A
6 | 


--------------------------------------------------------------------------------
/eval/mmlu/dev/us_foreign_policy_dev.csv:
--------------------------------------------------------------------------------
1 | How did the 2008 financial crisis affect America's international reputation?,It damaged support for the US model of political economy and capitalism,It created anger at the United States for exaggerating the crisis,It increased support for American global leadership under President Obama,It reduced global use of the US dollar,A
2 | How did NSC-68 change U.S. strategy?,It globalized containment.,It militarized containment.,It called for the development of the hydrogen bomb.,All of the above,D
3 | The realm of policy decisions concerned primarily with relations between the United States and the rest of the world is known as,terrorism policy.,economic policy.,foreign policy.,international policy.,C
4 | How do Defensive Realism and Offensive Realism differ in their explanation of state behaviour?,Defensive realists place greater emphasis on the role of international institutions,Defensive realists place less emphasis on geographical factors,Offensive realists give more priority to the national interest than Defensive realists.,"Defensive realists believe states are security maximizers, while Offensive realists believe states to be power maximizers",D
5 | How did Donald Trump attack globalization in the 2016 campaign?,Globalization had made men like him too rich,"Globalization only benefited certain American states, such as New York","Liberal elites had encouraged globalization, while 'ordinary Americans' lost jobs because of it",Globalization encouraged damaging trade wars,C
6 | 


--------------------------------------------------------------------------------
/eval/mmlu/dev/virology_dev.csv:
--------------------------------------------------------------------------------
1 | Why are parvoviruses a highly impactful parasite?,Because they have no nucleic acid,They require a helper virus,Only replicate in dividing cells,Can integrate into host chromosomes,A
2 | Which of the following is a morphological characteristic of the paramyxoviruses.,Fragile viruses often visualised with RNA spewing from the inside,Elongate viruses,Icosahedral viruses with envelope,Very large viruses,A
3 | A key factor facilitating the application of nested case-control studies from the MACS was:,Data collection,Establishment of a repository of biologic specimens,Participant interest,Administration of the questionnaire by staff,B
4 | The most important goal of a behavioral intervention is:,Change in behavior,Comprehensive coverage,Effective use of behavioral theory,Sustained behavior change,D
5 | The median survival time to AIDS and death was established by following:,Seroprevalent HIV-infected individuals,Seronegatives,Seroconverters,High-risk seronegatives,C


--------------------------------------------------------------------------------
/eval/mmlu/dev/world_religions_dev.csv:
--------------------------------------------------------------------------------
1 |  What is the sign of the covenant for Jewish males?,The rainbow,Circumcision,A son,Bar mitzvah,B
2 | What is the Second Gem in Buddhism?,The Dharma,The Sangha,The Buddha,The Bodhisattva,A
3 | " In which dynasty was the ""Mandate of Heaven"" developed to legitimatize the new rulers?",Shang,Zhou,Han,Xia,B
4 |  Which Japanese government promoted a kind of national cult based on the emperor and his associations with kami?,Honen,Tanaka,Tokugawa,Meiji,D
5 | How can the Upanishads be characterized?,Ritual texts,Philosophical texts,Hymns,Origin stories,B


--------------------------------------------------------------------------------
/eval/mmlu/evaluate_mmlu.py:
--------------------------------------------------------------------------------
  1 | # Modified from https://github.com/hendrycks/test/blob/master/evaluate.py
  2 | import openai
  3 | import argparse
  4 | import traceback
  5 | import os
  6 | import json
  7 | import numpy as np
  8 | import pandas as pd
  9 | import time
 10 | from tqdm import tqdm
 11 | # from crop import crop
 12 | from vllm import LLM, SamplingParams
 13 | import torch
 14 | 
 15 | openai.api_key = "INSERTYOURKEYHERE"
 16 | choices = ["A", "B", "C", "D"]
 17 | 
 18 | 
 19 | def softmax(x):
 20 |     z = x - max(x)
 21 |     numerator = np.exp(z)
 22 |     denominator = np.sum(numerator)
 23 |     softmax = numerator/denominator
 24 |     return softmax
 25 | 
 26 | def format_subject(subject):
 27 |     l = subject.split("_")
 28 |     s = ""
 29 |     for entry in l:
 30 |         s += " " + entry
 31 |     return s
 32 | 
 33 | def format_example(df, idx, include_answer=True):
 34 |     prompt = df.iloc[idx, 0]
 35 |     k = df.shape[1] - 2
 36 |     for j in range(k):
 37 |         prompt += "\n{}. {}".format(choices[j], df.iloc[idx, j+1])
 38 |     prompt += "\nAnswer:"
 39 |     if include_answer:
 40 |         prompt += " {}\n\n".format(df.iloc[idx, k + 1])
 41 |     return prompt
 42 | 
 43 | def gen_prompt(train_df, subject, k=-1):
 44 |     prompt = "The following are multiple choice questions (with answers) about {}.\n\n".format(format_subject(subject))
 45 |     if k == -1:
 46 |         k = train_df.shape[0]
 47 |     for i in range(k):
 48 |         prompt += format_example(train_df, i)
 49 |     return prompt
 50 | 
 51 | def generate_sample_batch(question_list):
 52 |     global loaded, llm
 53 |     if not loaded:
 54 |         llm = LLM(
 55 |             model=args.model,
 56 |             trust_remote_code=True,
 57 |             tensor_parallel_size=torch.cuda.device_count(),
 58 |         )
 59 |         loaded = True
 60 |     sampling_params = SamplingParams(max_tokens=1,
 61 |                                     logprobs=100,
 62 |                                     temperature=0,
 63 |                                     )
 64 |     outputs = llm.generate(question_list, sampling_params, use_tqdm=False)
 65 |     completions = [output.outputs[0].text.strip() for output in outputs]
 66 |     return completions
 67 | 
 68 | 
 69 | 
 70 | def eval(args, subject, model_name, dev_df, test_df):
 71 |     cors = []
 72 |     all_probs = []
 73 |     answers = choices[:test_df.shape[1]-2]
 74 | 
 75 |     prompt_list = []
 76 |     for i in range(test_df.shape[0]):
 77 |         # get prompt and make sure it fits
 78 |         k = args.ntrain
 79 |         prompt_end = format_example(test_df, i, include_answer=False)
 80 |         train_prompt = gen_prompt(dev_df, subject, k)
 81 |         prompt = train_prompt + prompt_end
 82 |         prompt_list.append(prompt)
 83 | 
 84 |     completions = generate_sample_batch(prompt_list)
 85 | 
 86 |     for i, pred in enumerate(completions):
 87 |         label = test_df.iloc[i, test_df.shape[1]-1]
 88 |         # pred = {0: "A", 1: "B", 2: "C", 3: "D"}[c]
 89 |     #     # \u2581 is the unicode character for a space in tokenizer
 90 |     #     d = c["choices"][0]["logprobs"]["top_logprobs"][-1]
 91 |     #     new_d = {
 92 |     #         k.replace("\u2581", " "): v
 93 |     #         for k, v in d.items()
 94 |     #     }
 95 |     #     c["choices"][0]["logprobs"]["top_logprobs"][-1] = new_d
 96 | 
 97 |     #     lprobs = []
 98 |     #     for ans in answers:
 99 |     #         try:
100 |     #             lprobs.append(c["choices"][0]["logprobs"]["top_logprobs"][-1][" {}".format(ans)])
101 |     #         except:
102 |     #             print("Warning: {} not found. Artificially adding log prob of -100.".format(ans))
103 |     #             lprobs.append(-100)
104 |     #     pred = {0: "A", 1: "B", 2: "C", 3: "D"}[np.argmax(lprobs)]
105 |     #     probs = softmax(np.array(lprobs))
106 | 
107 |         cor = pred == label
108 |         cors.append(cor)
109 |         # all_probs.append(probs)
110 | 
111 |     acc = np.mean(cors)
112 |     cors = np.array(cors)
113 | 
114 |     # all_probs = np.array(all_probs)
115 |     # print("Average accuracy {:.3f} - {}".format(acc, subject))
116 | 
117 |     return cors, acc, all_probs
118 | 
def main(args):
    """Evaluate ``args.model`` on every subject found under ``args.data_dir``.

    Subjects are discovered from ``<data_dir>/test/*_test.csv``; the matching
    ``<data_dir>/dev/<subject>_dev.csv`` supplies the few-shot examples. For
    each subject a CSV with an added ``correct`` column is written to
    ``args.save_dir``; the per-subject and overall accuracies are written to
    ``results.json`` in the same directory.

    :param args: parsed CLI namespace (``model``, ``data_dir``, ``save_dir``,
        ``ntrain``).
    :raises ValueError: if no test CSV is found.
    """
    model_name = args.model
    suffix = "_test.csv"
    test_dir = os.path.join(args.data_dir, "test")
    # Match on the suffix only, so unrelated files that merely contain
    # "_test.csv" somewhere in their name are not treated as subjects.
    subjects = sorted(
        name[:-len(suffix)] for name in os.listdir(test_dir) if name.endswith(suffix)
    )
    if not subjects:
        raise ValueError("No '*{}' files found in {}".format(suffix, test_dir))

    # makedirs also creates missing parent directories and tolerates reruns.
    os.makedirs(args.save_dir, exist_ok=True)

    print("Evaluating {} on {} subjects".format(model_name, len(subjects)))
    all_cors = []

    metrics = {
        "subject_acc": {},
        "overall_acc": None
    }
    for subject in tqdm(subjects, total=len(subjects), desc="Subjects"):
        dev_df = pd.read_csv(os.path.join(args.data_dir, "dev", subject + "_dev.csv"), header=None)[:args.ntrain]
        test_df = pd.read_csv(os.path.join(args.data_dir, "test", subject + "_test.csv"), header=None)

        cors, acc, _ = eval(args, subject, model_name, dev_df, test_df)
        metrics["subject_acc"][subject] = acc
        all_cors.append(cors)

        test_df["correct"] = cors
        test_df.to_csv(os.path.join(args.save_dir, "{}.csv".format(subject)), index=None)

    # Mean over all questions, so larger subjects weigh more.
    weighted_acc = np.mean(np.concatenate(all_cors))
    metrics["overall_acc"] = weighted_acc * 100
    print("Average accuracy: {:.3f}".format(weighted_acc * 100))
    with open(os.path.join(args.save_dir, "results.json"), "w") as f:
        json.dump(metrics, f)
156 | 
if __name__ == "__main__":
    # Command-line entry point: parse the options, initialise the lazily
    # created engine state read by generate_sample_batch, then run.
    cli = argparse.ArgumentParser()
    cli.add_argument("--ntrain", "-k", default=5, type=int)
    cli.add_argument("--data_dir", "-d", default="/mnt/data/user/tc_agi/ylf/eval_data/mmlu", type=str)
    cli.add_argument("--save_dir", "-s", default="/data/results/mmlu", type=str)
    cli.add_argument("--model", default="/mnt/data/user/tc_agi/user/lwd/mistral-7b/mistral-7b", type=str)
    args = cli.parse_args()
    llm = None
    loaded = False
    main(args)
167 | 
168 | 


--------------------------------------------------------------------------------
/eval/requirements.txt:
--------------------------------------------------------------------------------
 1 | ### basic requirements
 2 | openai==0.28
 3 | backoff
 4 | scipy
 5 | nltk
 6 | sympy==1.12
 7 | antlr4-python3-runtime==4.11.0
 8 | jsonlines
 9 | func_timeout
10 | langdetect
11 | immutabledict
12 | fschat
13 | vllm==0.2.7
14 | datasets
15 | transformers==4.39.1
16 | thefuzz
17 | absl-py
18 | fire
19 | accelerate
20 | 


--------------------------------------------------------------------------------
/eval/run.sh:
--------------------------------------------------------------------------------
  1 | MODEL_DIR=$1
  2 | if [ -z "$MODEL_DIR" ]; then
  3 |     MODEL_DIR="openbmb/Eurus-7b-sft"
  4 | fi
  5 | MODEL_NAME=$(basename "$MODEL_DIR")
  6 | OUTPUT_DIR="./results/$MODEL_NAME" 
  7 | mkdir -p $OUTPUT_DIR
  8 | mkdir -p cache
  9 | 
 10 | 
 11 | 
 12 | # coding_human_eval
 13 | echo "running human_eval evaluation"
 14 | mkdir -p $OUTPUT_DIR/human_eval
 15 | cd Coding/human_eval
 16 | python3 evaluate_human_eval.py \
 17 |   --model $MODEL_DIR \
 18 |   --save_dir $OUTPUT_DIR/human_eval/ \
 19 |   --num-samples-per-task 1 \
 20 |   --model_type mistral \
 21 |   --temperature 0.2
 22 | cd ../..
 23 | 
 24 | 
 25 | # coding_leetcode
 26 | echo "running leetcode evaluation"
 27 | mkdir -p $OUTPUT_DIR/leetcode
 28 | cd Coding/leetcode
 29 | python3 evaluate_leetcode.py \
 30 |   --model $MODEL_DIR \
 31 |   --save_dir $OUTPUT_DIR/leetcode/ \
 32 |   --num-samples-per-task 1 \
 33 |   --model_type mistral \
 34 |   --temperature 0.
 35 | 
 36 | python3 test.py \
 37 |   --generation_path $OUTPUT_DIR/leetcode/ \
 38 |   --result_path $OUTPUT_DIR/leetcode/ \
 39 |   --temp_dir output/temp
 40 | cd ../..
 41 | 
 42 | # coding_mbpp
 43 | echo "running mbpp evaluation"
 44 | mkdir -p $OUTPUT_DIR/mbpp
 45 | cd Coding/mbpp
 46 | python3 evaluate_mbpp.py \
 47 |   --model $MODEL_DIR \
 48 |   --input_data 	new_mbpp.json \
 49 |   --save_dir $OUTPUT_DIR/mbpp/ \
 50 |   --model_type mistral 
 51 | cd ../..
 52 | 
 53 | 
 54 | # math_math
 55 | echo "running math-cot evaluation"
 56 | mkdir -p $OUTPUT_DIR/math_cot
 57 | cd Math/math
 58 | python3 evaluate_math_cot.py \
 59 |   --data_dir ./ \
 60 |   --save_dir $OUTPUT_DIR/math_cot/ \
 61 |   --model_type mistral \
 62 |   --model $MODEL_DIR 
 63 | cd ../..
 64 | 
 65 | echo "running math-pot evaluation"
 66 | mkdir -p $OUTPUT_DIR/math_pot
 67 | cd Math/math
 68 | mkdir -p cache
 69 | python3 evaluate_math_pot.py \
 70 |   --data_dir ./ \
 71 |   --save_dir $OUTPUT_DIR/math_pot/ \
 72 |   --model_type mistral \
 73 |   --model $MODEL_DIR 
 74 | cd ../..
 75 | 
 76 | # math_asdiv_gsmplus_svamp
 77 | echo "running asdiv&gsmplus&svamp cot evaluation"
 78 | mkdir -p $OUTPUT_DIR/subset_cot
 79 | cd Math/subset
 80 | python3 evaluate_subset_cot.py \
 81 |   --data_dir ./data \
 82 |   --save_dir $OUTPUT_DIR/subset_cot/ \
 83 |   --model_type mistral \
 84 |   --model $MODEL_DIR 
 85 | cd ../..
 86 | 
 87 | echo "running asdiv&gsmplus&svamp pot evaluation"
 88 | mkdir -p $OUTPUT_DIR/subset_pot
 89 | cd Math/subset
 90 | mkdir -p cache
 91 | python3 evaluate_subset_pot.py \
 92 |   --data_dir ./data \
 93 |   --save_dir $OUTPUT_DIR/subset_pot/ \
 94 |   --model_type mistral \
 95 |   --model $MODEL_DIR 
 96 | cd ../..
 97 | 
 98 | # math_theorem_qa
 99 | echo "running theorem-qa cot evaluation"
100 | mkdir -p $OUTPUT_DIR/theorem_qa_cot
101 | cd Math/theorem_qa
102 | python3 evaluate_theorem_qa_cot.py \
103 |   --model $MODEL_DIR \
104 |   --input_data 	./theorem_qa.json \
105 |   --model_type mistral \
106 |   --save_dir $OUTPUT_DIR/theorem_qa_cot/
107 | cd ../..
108 | 
109 | echo "running theorem-qa pot evaluation"
110 | mkdir -p $OUTPUT_DIR/theorem_qa_pot
111 | cd Math/theorem_qa
112 | mkdir -p cache
113 | python3 evaluate_theorem_qa_pot.py \
114 |   --model $MODEL_DIR \
115 |   --input_data 	./theorem_qa.json \
116 |   --model_type mistral \
117 |   --save_dir $OUTPUT_DIR/theorem_qa_pot/
118 | cd ../..
119 | 
120 | # reasoning_bbh cot
121 | echo "running bbh evaluation"
122 | mkdir -p $OUTPUT_DIR/bbh
123 | cd Reasoning/bbh
124 | python3 evaluate_bbh.py \
125 |   --model $MODEL_DIR \
126 |   --data_filepath ./test_prompts.json \
127 |   --output_filepath $OUTPUT_DIR/bbh/res.jsonl \
128 |   --model_type mistral \
129 |   --n_processes 8 \
130 |   --is_cot
131 | cd ../..
132 | 
133 | # ins-Following-if_eval
134 | echo "running if-eval evaluation"
135 | mkdir -p $OUTPUT_DIR/if_eval
136 | cd Ins-Following/if_eval
137 | python3 evaluate_if_eval.py \
138 |   --model $MODEL_DIR \
139 |   --input_data ./input_data.jsonl \
140 |   --save_path $OUTPUT_DIR/if_eval/input_response_data.jsonl \
141 |   --model_type mistral 
142 | 
143 | python3 evaluation_main.py \
144 |   --input_data ./input_data.jsonl \
145 |   --input_response_data $OUTPUT_DIR/if_eval/input_response_data.jsonl \
146 |   --output_dir $OUTPUT_DIR/if_eval/
147 | cd ../..
148 | 
149 | #mmlu
150 | echo "running mmlu evaluation"
151 | mkdir -p $OUTPUT_DIR/mmlu
152 | cd mmlu/
153 | python3 -u evaluate_mmlu.py \
154 |     --model $MODEL_DIR \
155 |     --data_dir ./ \
156 |     --save_dir $OUTPUT_DIR/mmlu
157 | 
158 | 
159 | 
160 | echo "All Evals Finished!"


--------------------------------------------------------------------------------
/eval/utils/data.py:
--------------------------------------------------------------------------------
 1 | from typing import Iterable, Dict
 2 | import gzip
 3 | import json
 4 | import os
 5 | 
 6 | 
 7 | ROOT = os.path.dirname(os.path.abspath(__file__))
 8 | HUMAN_EVAL = os.path.join(ROOT, "..", "Coding","human_eval","data", "HumanEval.jsonl.gz")
 9 | 
10 | 
def read_problems(evalset_file: str = HUMAN_EVAL) -> Dict[str, Dict]:
    """Load a JSONL problem file into a dict keyed by each record's ``task_id``.

    If the same ``task_id`` appears more than once, the last record wins.
    """
    problems = {}
    for task in stream_jsonl(evalset_file):
        problems[task["task_id"]] = task
    return problems
13 | 
14 | 
def stream_jsonl(filename: str) -> Iterable[Dict]:
    """
    Parses each jsonl line and yields it as a dictionary.

    Files whose name ends in ".gz" are decompressed on the fly. Lines that
    contain only whitespace are skipped. Text is always decoded as UTF-8,
    matching what ``write_jsonl`` produces, instead of depending on the
    platform's locale encoding.

    :param filename: path to a ``.jsonl`` or ``.jsonl.gz`` file.
    :return: a generator of the decoded JSON values, in file order.
    """
    if filename.endswith(".gz"):
        with open(filename, "rb") as gzfp:
            with gzip.open(gzfp, 'rt', encoding="utf-8") as fp:
                for line in fp:
                    if line.strip():
                        yield json.loads(line)
    else:
        with open(filename, "r", encoding="utf-8") as fp:
            for line in fp:
                if line.strip():
                    yield json.loads(line)
30 | 
31 | 
def write_jsonl(filename: str, data: Iterable[Dict], append: bool = False):
    """
    Serialise each item of ``data`` as one UTF-8 encoded JSON line.

    ``~`` in ``filename`` is expanded. The file is opened in binary append
    mode when ``append`` is true and truncated otherwise. If the name ends in
    ".gz" the lines are written through a gzip stream layered on that file.
    """
    file_mode = 'ab' if append else 'wb'
    filename = os.path.expanduser(filename)

    def encoded_lines():
        for record in data:
            yield (json.dumps(record) + "\n").encode('utf-8')

    with open(filename, file_mode) as fp:
        if filename.endswith(".gz"):
            with gzip.GzipFile(fileobj=fp, mode='wb') as gzfp:
                for chunk in encoded_lines():
                    gzfp.write(chunk)
        else:
            for chunk in encoded_lines():
                fp.write(chunk)
50 | 


--------------------------------------------------------------------------------
/eval/utils/evaluation.py:
--------------------------------------------------------------------------------
  1 | from collections import defaultdict, Counter
  2 | from concurrent.futures import ThreadPoolExecutor, as_completed
  3 | from typing import List, Union, Iterable, Dict
  4 | import itertools
  5 | 
  6 | import numpy as np
  7 | import tqdm
  8 | 
  9 | from utils.data import HUMAN_EVAL, read_problems, stream_jsonl, write_jsonl
 10 | from utils.execution import check_correctness
 11 | 
 12 | 
 13 | def estimate_pass_at_k(
 14 |     num_samples: Union[int, List[int], np.ndarray],
 15 |     num_correct: Union[List[int], np.ndarray],
 16 |     k: int
 17 | ) -> np.ndarray:
 18 |     """
 19 |     Estimates pass@k of each problem and returns them in an array.
 20 |     """
 21 | 
 22 |     def estimator(n: int, c: int, k: int) -> float:
 23 |         """
 24 |         Calculates 1 - comb(n - c, k) / comb(n, k).
 25 |         """
 26 |         if n - c < k:
 27 |             return 1.0
 28 |         return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))
 29 | 
 30 |     if isinstance(num_samples, int):
 31 |         num_samples_it = itertools.repeat(num_samples, len(num_correct))
 32 |     else:
 33 |         assert len(num_samples) == len(num_correct)
 34 |         num_samples_it = iter(num_samples)
 35 | 
 36 |     return np.array([estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)])
 37 | 
 38 | 
 39 | def evaluate_functional_correctness(
 40 |     sample_file: str,
 41 |     k: List[int] = [1, 10, 100],
 42 |     n_workers: int = 4,
 43 |     timeout: float = 3.0,
 44 |     problem_file: str = HUMAN_EVAL,
 45 | ):
 46 |     """
 47 |     Evaluates the functional correctness of generated samples, and writes
 48 |     results to f"{sample_file}_results.jsonl.gz"
 49 |     """
 50 | 
 51 |     problems = read_problems(problem_file)
 52 | 
 53 |     # Check the generated samples against test suites.
 54 |     with ThreadPoolExecutor(max_workers=n_workers) as executor:
 55 | 
 56 |         futures = []
 57 |         completion_id = Counter()
 58 |         n_samples = 0
 59 |         results = defaultdict(list)
 60 | 
 61 |         print("Reading samples...")
 62 |         for sample in tqdm.tqdm(stream_jsonl(sample_file)):
 63 |             task_id = sample["task_id"]
 64 |             completion = sample["completion"]
 65 |             args = (problems[task_id], completion, timeout, completion_id[task_id])
 66 |             future = executor.submit(check_correctness, *args)
 67 |             futures.append(future)
 68 |             completion_id[task_id] += 1
 69 |             n_samples += 1
 70 | 
 71 |         assert len(completion_id) == len(problems), "Some problems are not attempted."
 72 | 
 73 |         print("Running test suites...")
 74 |         for future in tqdm.tqdm(as_completed(futures), total=len(futures)):
 75 |             result = future.result()
 76 |             results[result["task_id"]].append((result["completion_id"], result))
 77 | 
 78 |     # Calculate pass@k.
 79 |     total, correct = [], []
 80 |     for result in results.values():
 81 |         result.sort()
 82 |         passed = [r[1]["passed"] for r in result]
 83 |         total.append(len(passed))
 84 |         correct.append(sum(passed))
 85 |     total = np.array(total)
 86 |     correct = np.array(correct)
 87 | 
 88 |     ks = k
 89 |     pass_at_k = {f"pass@{k}": estimate_pass_at_k(total, correct, k).mean()
 90 |                  for k in ks if (total >= k).all()}
 91 | 
 92 |     # Finally, save the results in one file:
 93 |     def combine_results():
 94 |         for sample in stream_jsonl(sample_file):
 95 |             task_id = sample["task_id"]
 96 |             result = results[task_id].pop(0)
 97 |             sample["result"] = result[1]["result"]
 98 |             sample["passed"] = result[1]["passed"]
 99 |             yield sample
100 | 
101 |     out_file = sample_file + "_results.jsonl"
102 |     print(f"Writing results to {out_file}...")
103 |     write_jsonl(out_file, tqdm.tqdm(combine_results(), total=n_samples))
104 |     print(pass_at_k)
105 | 
106 |     return pass_at_k
107 | 


--------------------------------------------------------------------------------
/eval/utils/execution.py:
--------------------------------------------------------------------------------
  1 | from typing import Optional, Callable, Dict
  2 | import ast
  3 | import contextlib
  4 | import faulthandler
  5 | import io
  6 | import os
  7 | import multiprocessing
  8 | import platform
  9 | import signal
 10 | import tempfile
 11 | 
 12 | os.environ["NCCL_IGNORE_DISABLED_P2P"] = "1"
 13 | os.environ["TOKENIZERS_PARALLELISM"] = "true"
 14 | 
 15 | def check_correctness(problem: Dict, completion: str, timeout: float,
 16 |                       completion_id: Optional[int] = None) -> Dict:
 17 |     """
 18 |     Evaluates the functional correctness of a completion by running the test
 19 |     suite provided in the problem. 
 20 | 
 21 |     :param completion_id: an optional completion ID so we can match
 22 |         the results later even if execution finishes asynchronously.
 23 |     """
 24 | 
 25 |     def unsafe_execute():
 26 | 
 27 |         with create_tempdir():
 28 | 
 29 |             # These system calls are needed when cleaning up tempdir.
 30 |             import os
 31 |             import shutil
 32 |             rmtree = shutil.rmtree
 33 |             rmdir = os.rmdir
 34 |             chdir = os.chdir
 35 | 
 36 |             # Disable functionalities that can make destructive changes to the test.
 37 |             reliability_guard()
 38 | 
 39 |             # Construct the check program and run it.
 40 |             check_program = (
 41 |                 problem["prompt"] + completion + "\n" +
 42 |                 problem["test"] + "\n" +
 43 |                 f"check({problem['entry_point']})"
 44 |             )
 45 | 
 46 |             try:
 47 |                 exec_globals = {}
 48 |                 with swallow_io():
 49 |                     with time_limit(timeout):
 50 | # WARNING
 51 | # This program exists to execute untrusted model-generated code. Although
 52 | # it is highly unlikely that model-generated code will do something overtly
 53 | # malicious in response to this test suite, model-generated code may act
 54 | # destructively due to a lack of model capability or alignment.
 55 | # Users are strongly encouraged to sandbox this evaluation suite so that it 
 56 | # does not perform destructive actions on their host or network. For more 
 57 | # information on how OpenAI sandboxes its code, see the accompanying paper.
 58 | # Once you have read this disclaimer and taken appropriate precautions, 
 59 | # uncomment the following line and proceed at your own risk:
 60 |                         exec(check_program, exec_globals)
 61 |                 result.append("passed")
 62 |             except TimeoutException:
 63 |                 result.append("timed out")
 64 |             except BaseException as e:
 65 |                 result.append(f"failed: {e}")
 66 | 
 67 |             # Needed for cleaning up.
 68 |             shutil.rmtree = rmtree
 69 |             os.rmdir = rmdir
 70 |             os.chdir = chdir
 71 | 
 72 |     manager = multiprocessing.Manager()
 73 |     result = manager.list()
 74 | 
 75 |     p = multiprocessing.Process(target=unsafe_execute)
 76 |     p.start()
 77 |     p.join(timeout=timeout + 1)
 78 |     if p.is_alive():
 79 |         p.kill()
 80 | 
 81 |     if not result:
 82 |         result.append("timed out")
 83 | 
 84 |     return dict(
 85 |         task_id=problem["task_id"],
 86 |         passed=result[0] == "passed",
 87 |         result=result[0],
 88 |         completion_id=completion_id,
 89 |     )
 90 | 
 91 | 
 92 | @contextlib.contextmanager
 93 | def time_limit(seconds: float):
 94 |     def signal_handler(signum, frame):
 95 |         raise TimeoutException("Timed out!")
 96 |     signal.setitimer(signal.ITIMER_REAL, seconds)
 97 |     signal.signal(signal.SIGALRM, signal_handler)
 98 |     try:
 99 |         yield
100 |     finally:
101 |         signal.setitimer(signal.ITIMER_REAL, 0)
102 | 
103 | 
@contextlib.contextmanager
def swallow_io():
    """Point stdout, stderr and stdin at one ``WriteOnlyStringIO`` for the body.

    Output is captured into the throw-away stream; reads from stdin raise
    because that stream refuses to be read.
    """
    sink = WriteOnlyStringIO()
    with contextlib.ExitStack() as stack:
        stack.enter_context(contextlib.redirect_stdout(sink))
        stack.enter_context(contextlib.redirect_stderr(sink))
        stack.enter_context(redirect_stdin(sink))
        yield
111 | 
112 | 
@contextlib.contextmanager
def create_tempdir():
    """Create a temporary directory, make it the working directory for the
    body, and yield its path. The ``chdir`` helper switches back on exit and
    the directory is removed afterwards."""
    with tempfile.TemporaryDirectory() as dirname, chdir(dirname):
        yield dirname
118 | 
119 | 
class TimeoutException(Exception):
    """Raised by the SIGALRM handler when a time-limited block overruns."""
122 | 
123 | 
class WriteOnlyStringIO(io.StringIO):
    """ StringIO that throws an exception when it's read from """

    def _refuse_read(self, *args, **kwargs):
        # Shared body for every read entry point.
        raise IOError

    read = _refuse_read
    readline = _refuse_read
    readlines = _refuse_read

    def readable(self, *args, **kwargs):
        """ Always False: this stream does not support reading. """
        return False
139 | 
140 | 
class redirect_stdin(contextlib._RedirectStream):  # type: ignore
    """Context manager that temporarily replaces ``sys.stdin``, built on the
    same private base class as ``contextlib.redirect_stdout``."""
    _stream = 'stdin'
143 | 
144 | 
@contextlib.contextmanager
def chdir(root):
    """Run the body with ``root`` as the working directory.

    ``"."`` is a no-op. Otherwise the previous working directory is restored
    on exit, whether or not the body raised.
    """
    if root != ".":
        previous = os.getcwd()
        os.chdir(root)
        try:
            yield
        finally:
            os.chdir(previous)
    else:
        yield
158 | 
159 | 
def reliability_guard(maximum_memory_bytes: Optional[int] = None):
    """
    This disables various destructive functions and prevents the generated code
    from interfering with the test (e.g. fork bomb, killing other processes,
    removing filesystem files, etc.)

    The changes are applied to the current process by overwriting module
    attributes with None; nothing here undoes them.

    WARNING
    This function is NOT a security sandbox. Untrusted code, including, model-
    generated code, should not be blindly executed outside of one. See the
    Codex paper for more information about OpenAI's code sandbox, and proceed
    with caution.

    :param maximum_memory_bytes: when given, used as both the soft and hard
        limit for RLIMIT_AS and RLIMIT_DATA, and for RLIMIT_STACK except on
        Darwin.
    """

    if maximum_memory_bytes is not None:
        import resource
        limits = (maximum_memory_bytes, maximum_memory_bytes)
        resource.setrlimit(resource.RLIMIT_AS, limits)
        resource.setrlimit(resource.RLIMIT_DATA, limits)
        if not platform.uname().system == 'Darwin':
            resource.setrlimit(resource.RLIMIT_STACK, limits)

    faulthandler.disable()

    import builtins
    builtins.exit = None
    builtins.quit = None

    import os
    os.environ['OMP_NUM_THREADS'] = '1'

    for name in (
        'kill', 'system', 'putenv', 'remove', 'removedirs', 'rmdir', 'fchdir',
        'setuid', 'fork', 'forkpty', 'killpg', 'rename', 'renames', 'truncate',
        'replace', 'unlink', 'fchmod', 'fchown', 'chmod', 'chown', 'chroot',
        'lchflags', 'lchmod', 'lchown', 'getcwd', 'chdir',
    ):
        setattr(os, name, None)

    import shutil
    shutil.rmtree = None
    shutil.move = None
    shutil.chown = None

    import subprocess
    subprocess.Popen = None  # type: ignore

    # Go through the builtins module rather than subscripting __builtins__:
    # __builtins__ is a dict in imported modules but a module object when the
    # file runs as __main__, where item assignment would raise TypeError.
    builtins.help = None

    import sys
    for blocked in ('ipdb', 'joblib', 'resource', 'psutil', 'tkinter'):
        # A None entry makes a later `import <name>` fail.
        sys.modules[blocked] = None
233 | 


--------------------------------------------------------------------------------
/eval/utils/math_equivalence.py:
--------------------------------------------------------------------------------
  1 | # https://github.com/hendrycks/math/blob/main/modeling/math_equivalence.py
  2 | 
  3 | def _fix_fracs(string):
  4 |     substrs = string.split("\\frac")
  5 |     new_str = substrs[0]
  6 |     if len(substrs) > 1:
  7 |         substrs = substrs[1:]
  8 |         for substr in substrs:
  9 |             new_str += "\\frac"
 10 |             if substr[0] == "{":
 11 |                 new_str += substr
 12 |             else:
 13 |                 try:
 14 |                     assert len(substr) >= 2
 15 |                 except:
 16 |                     return string
 17 |                 a = substr[0]
 18 |                 b = substr[1]
 19 |                 if b != "{":
 20 |                     if len(substr) > 2:
 21 |                         post_substr = substr[2:]
 22 |                         new_str += "{" + a + "}{" + b + "}" + post_substr
 23 |                     else:
 24 |                         new_str += "{" + a + "}{" + b + "}"
 25 |                 else:
 26 |                     if len(substr) > 2:
 27 |                         post_substr = substr[2:]
 28 |                         new_str += "{" + a + "}" + b + post_substr
 29 |                     else:
 30 |                         new_str += "{" + a + "}" + b
 31 |     string = new_str
 32 |     return string
 33 | 
 34 | def _fix_a_slash_b(string):
 35 |     if len(string.split("/")) != 2:
 36 |         return string
 37 |     a = string.split("/")[0]
 38 |     b = string.split("/")[1]
 39 |     try:
 40 |         a = int(a)
 41 |         b = int(b)
 42 |         assert string == "{}/{}".format(a, b)
 43 |         new_string = "\\frac{" + str(a) + "}{" + str(b) + "}"
 44 |         return new_string
 45 |     except:
 46 |         return string
 47 | 
 48 | def _remove_right_units(string):
 49 |     # "\\text{ " only ever occurs (at least in the val set) when describing units
 50 |     if "\\text{ " in string:
 51 |         splits = string.split("\\text{ ")
 52 |         assert len(splits) == 2
 53 |         return splits[0]
 54 |     else:
 55 |         return string
 56 | 
 57 | def _fix_sqrt(string):
 58 |     if "\\sqrt" not in string:
 59 |         return string
 60 |     splits = string.split("\\sqrt")
 61 |     new_string = splits[0] 
 62 |     for split in splits[1:]:
 63 |         if split[0] != "{":
 64 |             a = split[0]
 65 |             new_substr = "\\sqrt{" + a + "}" + split[1:]
 66 |         else:
 67 |             new_substr = "\\sqrt" + split
 68 |         new_string += new_substr
 69 |     return new_string
 70 | 
 71 | def _strip_string(string):
 72 |     # linebreaks  
 73 |     string = string.replace("\n", "")
 74 |     #print(string)
 75 | 
 76 |     # remove inverse spaces
 77 |     string = string.replace("\\!", "")
 78 |     #print(string)
 79 | 
 80 |     # replace \\ with \
 81 |     string = string.replace("\\\\", "\\")
 82 |     #print(string)
 83 | 
 84 |     # replace tfrac and dfrac with frac
 85 |     string = string.replace("tfrac", "frac")
 86 |     string = string.replace("dfrac", "frac")
 87 |     #print(string)
 88 | 
 89 |     # remove \left and \right
 90 |     string = string.replace("\\left", "")
 91 |     string = string.replace("\\right", "")
 92 |     #print(string)
 93 |     
 94 |     # Remove circ (degrees)
 95 |     string = string.replace("^{\\circ}", "")
 96 |     string = string.replace("^\\circ", "")
 97 | 
 98 |     # remove dollar signs
 99 |     string = string.replace("\\$", "")
100 |     
101 |     # remove units (on the right)
102 |     string = _remove_right_units(string)
103 | 
104 |     # remove percentage
105 |     string = string.replace("\\%", "")
106 |     string = string.replace("\%", "")
107 | 
108 |     # " 0." equivalent to " ." and "{0." equivalent to "{." Alternatively, add "0" if "." is the start of the string
109 |     string = string.replace(" .", " 0.")
110 |     string = string.replace("{.", "{0.")
111 |     # if empty, return empty string
112 |     if len(string) == 0:
113 |         return string
114 |     if string[0] == ".":
115 |         string = "0" + string
116 | 
117 |     # to consider: get rid of e.g. "k = " or "q = " at beginning
118 |     if len(string.split("=")) == 2:
119 |         if len(string.split("=")[0]) <= 2:
120 |             string = string.split("=")[1]
121 | 
122 |     # fix sqrt3 --> sqrt{3}
123 |     string = _fix_sqrt(string)
124 | 
125 |     # remove spaces
126 |     string = string.replace(" ", "")
127 | 
128 |     # \frac1b or \frac12 --> \frac{1}{b} and \frac{1}{2}, etc. Even works with \frac1{72} (but not \frac{72}1). Also does a/b --> \\frac{a}{b}
129 |     string = _fix_fracs(string)
130 | 
131 |     # manually change 0.5 --> \frac{1}{2}
132 |     if string == "0.5":
133 |         string = "\\frac{1}{2}"
134 | 
135 |     # NOTE: X/Y changed to \frac{X}{Y} in dataset, but in simple cases fix in case the model output is X/Y
136 |     string = _fix_a_slash_b(string)
137 | 
138 |     return string
139 | 
def is_equiv(str1, str2, verbose=False):
    """Decide whether two answer strings are equivalent after normalization.

    Args:
        str1: First answer, or None.
        str2: Second answer, or None.
        verbose: When true, print both normalized strings before comparing.

    Returns:
        True when both are None (a warning is printed), False when exactly
        one is None, otherwise whether the normalized strings are equal. If
        normalization raises, the raw inputs are compared instead.
    """
    if str1 is None and str2 is None:
        print("WARNING: Both None")
        return True
    if str1 is None or str2 is None:
        return False

    try:
        ss1 = _strip_string(str1)
        ss2 = _strip_string(str2)
        if verbose:
            print(ss1, ss2)
        return ss1 == ss2
    except Exception:
        # Best-effort fallback; `Exception` (not a bare except) so that
        # KeyboardInterrupt/SystemExit still propagate.
        return str1 == str2
155 | 


--------------------------------------------------------------------------------
/eval/utils/math_normalize.py:
--------------------------------------------------------------------------------
  1 | """
  2 | This logic is largely copied from the Hendrycks' MATH release (math_equivalence).
  3 | 
  4 | From: https://github.com/openai/prm800k/blob/main/prm800k/grading/math_normalize.py
  5 | """
  6 | import re
  7 | from typing import Optional
  8 | 
  9 | 
 10 | def normalize_answer(answer: Optional[str]) -> Optional[str]:
 11 |     if answer is None:
 12 |         return None
 13 |     answer = answer.strip()
 14 |     try:
 15 |         # Remove enclosing `\text{}`.
 16 |         m = re.search("^\\\\text\{(?P<text>.+?)\}$", answer)
 17 |         if m is not None:
 18 |             answer = m.group("text").strip()
 19 |         return _strip_string(answer)
 20 |     except:
 21 |         return answer
 22 | 
 23 | 
 24 | def _fix_fracs(string):
 25 |     substrs = string.split("\\frac")
 26 |     new_str = substrs[0]
 27 |     if len(substrs) > 1:
 28 |         substrs = substrs[1:]
 29 |         for substr in substrs:
 30 |             new_str += "\\frac"
 31 |             if substr[0] == "{":
 32 |                 new_str += substr
 33 |             else:
 34 |                 try:
 35 |                     assert len(substr) >= 2
 36 |                 except:
 37 |                     return string
 38 |                 a = substr[0]
 39 |                 b = substr[1]
 40 |                 if b != "{":
 41 |                     if len(substr) > 2:
 42 |                         post_substr = substr[2:]
 43 |                         new_str += "{" + a + "}{" + b + "}" + post_substr
 44 |                     else:
 45 |                         new_str += "{" + a + "}{" + b + "}"
 46 |                 else:
 47 |                     if len(substr) > 2:
 48 |                         post_substr = substr[2:]
 49 |                         new_str += "{" + a + "}" + b + post_substr
 50 |                     else:
 51 |                         new_str += "{" + a + "}" + b
 52 |     string = new_str
 53 |     return string
 54 | 
 55 | 
 56 | def _fix_a_slash_b(string):
 57 |     if len(string.split("/")) != 2:
 58 |         return string
 59 |     a = string.split("/")[0]
 60 |     b = string.split("/")[1]
 61 |     try:
 62 |         a = int(a)
 63 |         b = int(b)
 64 |         assert string == "{}/{}".format(a, b)
 65 |         new_string = "\\frac{" + str(a) + "}{" + str(b) + "}"
 66 |         return new_string
 67 |     except:
 68 |         return string
 69 | 
 70 | 
 71 | def _remove_right_units(string):
 72 |     # "\\text{ " only ever occurs (at least in the val set) when describing units
 73 |     if "\\text{ " in string:
 74 |         splits = string.split("\\text{ ")
 75 |         assert len(splits) == 2
 76 |         return splits[0]
 77 |     else:
 78 |         return string
 79 | 
 80 | 
 81 | def _fix_sqrt(string):
 82 |     if "\\sqrt" not in string:
 83 |         return string
 84 |     splits = string.split("\\sqrt")
 85 |     new_string = splits[0]
 86 |     for split in splits[1:]:
 87 |         if split[0] != "{":
 88 |             a = split[0]
 89 |             new_substr = "\\sqrt{" + a + "}" + split[1:]
 90 |         else:
 91 |             new_substr = "\\sqrt" + split
 92 |         new_string += new_substr
 93 |     return new_string
 94 | 
 95 | 
 96 | def _strip_string(string):
 97 |     # linebreaks
 98 |     string = string.replace("\n", "")
 99 |     # print(string)
100 | 
101 |     # remove inverse spaces
102 |     string = string.replace("\\!", "")
103 |     # print(string)
104 | 
105 |     # replace \\ with \
106 |     string = string.replace("\\\\", "\\")
107 |     # print(string)
108 | 
109 |     # replace tfrac and dfrac with frac
110 |     string = string.replace("tfrac", "frac")
111 |     string = string.replace("dfrac", "frac")
112 |     # print(string)
113 | 
114 |     # remove \left and \right
115 |     string = string.replace("\\left", "")
116 |     string = string.replace("\\right", "")
117 |     # print(string)
118 | 
119 |     # Remove circ (degrees)
120 |     string = string.replace("^{\\circ}", "")
121 |     string = string.replace("^\\circ", "")
122 | 
123 |     # remove dollar signs
124 |     string = string.replace("\\$", "")
125 | 
126 |     # remove units (on the right)
127 |     string = _remove_right_units(string)
128 | 
129 |     # remove percentage
130 |     string = string.replace("\\%", "")
131 |     string = string.replace("\%", "")
132 | 
133 |     # " 0." equivalent to " ." and "{0." equivalent to "{." Alternatively, add "0" if "." is the start of the string
134 |     string = string.replace(" .", " 0.")
135 |     string = string.replace("{.", "{0.")
136 |     # if empty, return empty string
137 |     if len(string) == 0:
138 |         return string
139 |     if string[0] == ".":
140 |         string = "0" + string
141 | 
142 |     # to consider: get rid of e.g. "k = " or "q = " at beginning
143 |     if len(string.split("=")) == 2:
144 |         if len(string.split("=")[0]) <= 2:
145 |             string = string.split("=")[1]
146 | 
147 |     # fix sqrt3 --> sqrt{3}
148 |     string = _fix_sqrt(string)
149 | 
150 |     # remove spaces
151 |     # string = string.replace(" ", "")
152 | 
153 |     # \frac1b or \frac12 --> \frac{1}{b} and \frac{1}{2}, etc. Even works with \frac1{72} (but not \frac{72}1). Also does a/b --> \\frac{a}{b}
154 |     string = _fix_fracs(string)
155 | 
156 |     # manually change 0.5 --> \frac{1}{2}
157 |     if string == "0.5":
158 |         string = "\\frac{1}{2}"
159 | 
160 |     # NOTE: X/Y changed to \frac{X}{Y} in dataset, but in simple cases fix in case the model output is X/Y
161 |     string = _fix_a_slash_b(string)
162 | 
163 |     return string


--------------------------------------------------------------------------------
/eval/utils/python_interpreter.py:
--------------------------------------------------------------------------------
  1 | from typing import Mapping
  2 | import re
  3 | import signal
  4 | from contextlib import contextmanager
  5 | from typing import Any
  6 | import subprocess
  7 | from tqdm import tqdm
  8 | 
  9 | import os
 10 | class PythonREPL():
 11 |     def __init__(self, timeout=5, tmp_file="cache/tmp"):
 12 |         self.timeout = timeout
 13 |         
 14 |         import datetime
 15 |         import random
 16 |         current_time = datetime.datetime.now().strftime("_%Y%m%d_%H%M%S")
 17 |         random_number = random.random()
 18 |         self.tmp_file = tmp_file + current_time + str(random_number)
 19 |         os.system(f"touch {self.tmp_file}.py" )
 20 | 
 21 |     @contextmanager
 22 |     def time_limit(self, seconds):
 23 |         def signal_handler(signum, frame):
 24 |             raise TimeoutError(f"Timed out after {seconds} seconds.")
 25 | 
 26 |         signal.signal(signal.SIGALRM, signal_handler)
 27 |         signal.alarm(seconds)
 28 |         try:
 29 |             yield
 30 |         finally:
 31 |             signal.alarm(0)  # Disable the alarm
 32 |  
 33 |     def __call__(self, query: str) -> str:
 34 |         query = query.strip().split("\n")
 35 |         if "print(" not in query[-1]:
 36 |             query[-1] = "print(" + query[-1] + ")"
 37 |         query = "\n".join(query)
 38 | 
 39 |         with open(f'{self.tmp_file}.py', "w") as f:
 40 |             f.write(query)
 41 |         
 42 |         with self.time_limit(self.timeout):
 43 |             result = subprocess.run(
 44 |                     ['python3', f'{self.tmp_file}.py'], capture_output=True, check=False, text=True, timeout=self.timeout)
 45 | 
 46 |             if result.returncode == 0:
 47 |                 output = result.stdout
 48 |                 return True, output.strip()
 49 |             else:
 50 |                 error_msg = result.stderr.strip()
 51 |                 msgs = error_msg.split("\n")
 52 |                 new_msgs = []
 53 |                 want_next = False
 54 |                 for m in msgs:
 55 |                     if "Traceback" in m:
 56 |                         new_msgs.append(m)
 57 |                     elif m == msgs[-1]:
 58 |                         new_msgs.append(m)
 59 |                     elif self.tmp_file in m:
 60 |                         st = m.index('"/') + 1 if '"/' in m else 0
 61 |                         ed = m.index(f'/{self.tmp_file}.py') + 1 if f'/{self.tmp_file}.py' in m else None
 62 |                         clr = m[st:ed] if not ed else m[st:]
 63 |                         m = m.replace(clr, "")
 64 |                         new_msgs.append(m)
 65 |                         want_next = True
 66 |                     elif want_next:
 67 |                         new_msgs.append(m)
 68 |                         want_next = False
 69 |                 error_msg = "\n".join(new_msgs)
 70 |                 return False, error_msg.strip()
 71 |         
 72 |     
 73 | def postprocess_completion(executor, completion):
 74 | 
 75 |     executions = ["!" + code for code in re.findall(r"```bash(.*?)```", completion, re.DOTALL) if "!" not in code]
 76 |     executions.extend(re.findall(r"```python(.*?)```", completion, re.DOTALL))
 77 |     executions.extend(re.findall(r"<execute>(.*?)</execute>", completion, re.DOTALL))
 78 |     
 79 |     if len(executions) == 0: # directly return cot result
 80 |         return completion
 81 |     else:
 82 |         ### Python
 83 |         execution_outputs = []
 84 |         for code in executions:
 85 |             try: 
 86 |                 success, output = executor(code)
 87 |             except TimeoutError:
 88 |                 print("time out")
 89 |                 # success = False
 90 |                 output = ""
 91 |             else:
 92 |                 output = output if success else ""
 93 |             execution_outputs.append(output)
 94 |         extracted_outputs = execution_outputs
 95 | 
 96 |         for index in range(1, len(extracted_outputs) + 1):
 97 |             extracted_solution = str(extracted_outputs[-index]).strip()
 98 |             break
 99 | 
100 |         return extracted_solution
101 | 
102 | 
103 | # def postprocess_completions(completion_list):
104 | #     executor = PythonREPL()
105 |     
106 | #     solution_list = []
107 | #     for completion in completion_list:
108 | #         solution_list.append(postprocess_completion(executor, completion))
109 | 
110 | #     del executor
111 | 
112 | #     return solution_list
113 | 
114 | 
115 | import multiprocessing
116 | from concurrent.futures import ProcessPoolExecutor, as_completed
117 | 
def postprocess_completion_wrapper(completion):
    """Process one completion with a fresh PythonREPL and clean up its script.

    Args:
        completion: Model completion text.

    Returns:
        Whatever ``postprocess_completion`` returns for this completion.
    """
    executor = PythonREPL()
    try:
        return postprocess_completion(executor, completion)
    finally:
        # Always remove the scratch script, even when processing raised,
        # and without spawning a shell.
        try:
            os.remove(f"{executor.tmp_file}.py")
        except OSError:
            pass
124 | 
def postprocess_completions(completion_list):
    """Process completions in parallel, one worker process per CPU.

    Args:
        completion_list: Iterable of completion strings.

    Returns:
        A list of results in the same order as ``completion_list``.
    """
    with ProcessPoolExecutor(max_workers=multiprocessing.cpu_count()) as executor:
        futures = [executor.submit(postprocess_completion_wrapper, completion) for completion in completion_list]
        # Collect in submission order. as_completed() yields futures as they
        # finish, which scrambled the result/input alignment.
        solution_list = [future.result() for future in futures]
    return solution_list
130 | 
if __name__ == "__main__":
    # Manual smoke run: post-process the first ten saved completions and
    # print them next to the processed results.
    import json
    import pandas as pd

    # Sample completion kept for ad-hoc experiments; the run below does not use it.
    code = """
Step 1: First, let's calculate the total number of eggs laid by Janet's ducks in a day.
Step 2: Next, let's calculate the number of eggs Janet eats for breakfast each day.
Step 3: Then, let's calculate the number of eggs Janet bakes for her friends each day.
Step 4: Finally, let's calculate the number of eggs Janet sells at the farmers' market each day.
Step 5: To find the total amount of money Janet makes each day at the farmers' market, we can multiply the number of eggs she sells by the price per egg.
```python
# Step 6: Calculate the total number of eggs laid by Janet's ducks in a day.
total_eggs_per_day = 16
# Step 7: Calculate the number of eggs Janet eats for breakfast each day.
eggs_eaten_per_day = 3
# Step 8: Calculate the number of eggs Janet bakes for her friends each day.
eggs_baked_per_day = 4
# Step 9: Calculate the number of eggs Janet sells at the farmers' market each day.
eggs_sold_per_day = total_eggs_per_day - eggs_eaten_per_day - eggs_baked_per_day
# Step 10: Calculate the total amount of money Janet makes each day at the farmers' market.
price_per_egg = 2
total_money_per_day = eggs_sold_per_day * price_per_egg
total_money_per_day
```
Answer:
12

"""
    records = pd.read_json("../Math/math/output_llama3-8b-new-mix.json").to_dict("records")
    completions = [record["completions"] for record in records]
    first_ten = completions[:10]
    processed_completions = postprocess_completions(first_ten)
    print(first_ten)
    print(processed_completions[:10])


--------------------------------------------------------------------------------
/eval/utils/util.py:
--------------------------------------------------------------------------------
  1 | # Adapt from https://github.com/hendrycks/math/blob/main/modeling/dataset/util.py
  2 | 
  3 | import pprint
  4 | 
  5 | def last_boxed_only(sample):
  6 |     """
  7 |     Given a (q,a) sample, filter the answers so that they only contain 
  8 |     the last \boxed{...} or \fbox{...} element
  9 |     """
 10 |     q, a = sample
 11 |     a = last_boxed_only_string(a)
 12 |     if a == None:
 13 |         return None
 14 |     return (q, a)
 15 | 
 16 | def last_boxed_only_string(string):
 17 |     idx = string.rfind("\\boxed")
 18 |     if idx < 0:
 19 |         idx = string.rfind("\\fbox")
 20 |         if idx < 0:
 21 |             return None
 22 | 
 23 |     i = idx
 24 |     right_brace_idx = None
 25 |     num_left_braces_open = 0
 26 |     while i < len(string):
 27 |         if string[i] == "{":
 28 |             num_left_braces_open += 1
 29 |         if string[i] == "}":
 30 |             num_left_braces_open -= 1
 31 |             if num_left_braces_open == 0:
 32 |                 right_brace_idx = i
 33 |                 break
 34 |         i += 1
 35 |     
 36 |     if right_brace_idx == None:
 37 |         retval = None
 38 |     else:
 39 |         retval = string[idx:right_brace_idx + 1]
 40 |     
 41 |     return retval
 42 | 
 43 | def only_until_first_boxed_from_tokens(string, tokens):
 44 |     idx = string.find("\\boxed")
 45 |     if idx < 0:
 46 |         idx = string.find("\\fbox")
 47 |         if idx < 0:
 48 |             return None
 49 |     
 50 |     cum_length = 0
 51 |     for i, t in enumerate(tokens):
 52 |         cum_length += len(t)
 53 |         if cum_length >= idx:
 54 |             break
 55 |     
 56 |     return tokens[:i]
 57 | 
 58 | 
 59 | 
 60 | def clean_numbers(sample):
 61 |     if not sample:
 62 |         return None
 63 |     new_sample = list()
 64 |     for s in sample:
 65 |         new_sample.append(_clean_numbers(s))
 66 | 
 67 |     return tuple(new_sample)
 68 | 
 69 | def _clean_numbers(string):
 70 |     """
 71 |     Clean Numbers in the given string
 72 | 
 73 |     >>> _clean_numbers(None, "Hello 123")
 74 |     'Hello 123'
 75 |     >>> _clean_numbers(None, "Hello 1234")
 76 |     'Hello 1,234'
 77 |     >>> _clean_numbers(None, "Hello 1234324asdasd")
 78 |     'Hello 1,234,324asdasd'
 79 |     """
 80 |     num_prev_digits = 0
 81 |     new_string = ""
 82 |     for i, c in enumerate(string):
 83 |         # isdigit() doesnt work here because of weird unicode chars.
 84 |         if c in {'1', '2', '3', '4', '5', '6', '7', '8', '9', '0'}:
 85 |             num_prev_digits += 1
 86 |         else:
 87 |             if num_prev_digits > 3:
 88 |                 # Some fixing
 89 |                 string_number = new_string[-num_prev_digits:]
 90 |                 new_string = new_string[:-num_prev_digits] + "{0:,}".format(int(string_number))
 91 |             num_prev_digits = 0
 92 |         new_string += c
 93 | 
 94 |     if num_prev_digits > 3:
 95 |         # Some fixing
 96 |         string_number = new_string[-num_prev_digits:]
 97 |         new_string = new_string[:-num_prev_digits] + "{0:,}".format(int(string_number))
 98 | 
 99 |     return new_string
100 | 


--------------------------------------------------------------------------------
/figures/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenBMB/Eurus/fd72db1861ef009fede0dbe5bcc9893c9012dacf/figures/.DS_Store


--------------------------------------------------------------------------------
/figures/Eurus-logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenBMB/Eurus/fd72db1861ef009fede0dbe5bcc9893c9012dacf/figures/Eurus-logo.png


--------------------------------------------------------------------------------
/figures/lc_tqa.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenBMB/Eurus/fd72db1861ef009fede0dbe5bcc9893c9012dacf/figures/lc_tqa.png


--------------------------------------------------------------------------------
/figures/leetcode_vs_theoremqa-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenBMB/Eurus/fd72db1861ef009fede0dbe5bcc9893c9012dacf/figures/leetcode_vs_theoremqa-1.png


--------------------------------------------------------------------------------
/figures/leetcode_vs_theoremqa-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenBMB/Eurus/fd72db1861ef009fede0dbe5bcc9893c9012dacf/figures/leetcode_vs_theoremqa-2.png


--------------------------------------------------------------------------------
/figures/main_exp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenBMB/Eurus/fd72db1861ef009fede0dbe5bcc9893c9012dacf/figures/main_exp.png


--------------------------------------------------------------------------------
/figures/rm_exp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenBMB/Eurus/fd72db1861ef009fede0dbe5bcc9893c9012dacf/figures/rm_exp.png


--------------------------------------------------------------------------------
/figures/rm_loss.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenBMB/Eurus/fd72db1861ef009fede0dbe5bcc9893c9012dacf/figures/rm_loss.png


--------------------------------------------------------------------------------
/figures/stats.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenBMB/Eurus/fd72db1861ef009fede0dbe5bcc9893c9012dacf/figures/stats.png


--------------------------------------------------------------------------------
/figures/tree-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenBMB/Eurus/fd72db1861ef009fede0dbe5bcc9893c9012dacf/figures/tree-1.png


--------------------------------------------------------------------------------
/figures/tree.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenBMB/Eurus/fd72db1861ef009fede0dbe5bcc9893c9012dacf/figures/tree.png


--------------------------------------------------------------------------------
/figures/ui-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenBMB/Eurus/fd72db1861ef009fede0dbe5bcc9893c9012dacf/figures/ui-1.png


--------------------------------------------------------------------------------
/figures/ui_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenBMB/Eurus/fd72db1861ef009fede0dbe5bcc9893c9012dacf/figures/ui_example.png


--------------------------------------------------------------------------------
/paper.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenBMB/Eurus/fd72db1861ef009fede0dbe5bcc9893c9012dacf/paper.pdf


--------------------------------------------------------------------------------