├── tutoreval
│   ├── templates
│   │   ├── closedbook_generation_template.txt
│   │   ├── generation_template.txt
│   │   ├── closedbook_grading_template.txt
│   │   └── grading_template.txt
│   ├── merge_generations.py
│   ├── grade.sh
│   ├── get_results.py
│   ├── generate.sh
│   ├── grade.py
│   ├── README.md
│   └── generate.py
├── assets
│   ├── main_radar_fig.pdf
│   └── main_radar_fig.png
├── tokenization
│   ├── mathmix_combine.py
│   ├── tokenize_metamath.py
│   └── tokenize_tutorchat.py
├── utils
│   ├── generation_utils.py
│   └── openai_utils.py
├── .gitignore
└── README.md
/tutoreval/templates/closedbook_generation_template.txt:
--------------------------------------------------------------------------------
1 | {{QUESTION}}
--------------------------------------------------------------------------------
/assets/main_radar_fig.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/princeton-nlp/LM-Science-Tutor/HEAD/assets/main_radar_fig.pdf
--------------------------------------------------------------------------------
/assets/main_radar_fig.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/princeton-nlp/LM-Science-Tutor/HEAD/assets/main_radar_fig.png
--------------------------------------------------------------------------------
/tutoreval/templates/generation_template.txt:
--------------------------------------------------------------------------------
1 | Here is a passage from a textbook I am trying to understand:
2 |
3 | """
4 | {{CHAPTER}}
5 | """
6 |
7 | {{QUESTION}}
8 |
--------------------------------------------------------------------------------
/tokenization/mathmix_combine.py:
--------------------------------------------------------------------------------
1 | from datasets import Dataset, load_from_disk, concatenate_datasets
2 | import argparse
3 |
4 |
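# Combine the tokenized TutorChat and MetaMath datasets into the MathMix training mix.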
5 | parser = argparse.ArgumentParser()
6 | parser.add_argument("--tutorchat", default="data/tokenized_tutorchat_stem_llama", type=str)
7 | parser.add_argument("--metamath", default="data/tokenized_metamath_concat10_llama", type=str)
8 | parser.add_argument("--save_dir", default="data/mathmix_llama")
9 | args = parser.parse_args()
10 |
11 | tutorchat = load_from_disk(args.tutorchat)["train"]
12 | metamath = load_from_disk(args.metamath)
13 |
14 | to_remove = [k for k in tutorchat.features.keys() if k not in ["input_ids", "attention_mask", "labels", "processed_conversation"]]
15 | tutorchat = tutorchat.remove_columns(to_remove)
16 | tutorchat = tutorchat.rename_column("processed_conversation", "text")
17 |
18 | to_remove = [k for k in metamath.features.keys() if k not in ["input_ids", "attention_mask", "labels", "text"]]
19 | metamath = metamath.remove_columns(to_remove)
20 |
21 | mathmix = concatenate_datasets([tutorchat, metamath])
22 | mathmix = mathmix.shuffle(seed=42)
23 | mathmix.save_to_disk(args.save_dir)
--------------------------------------------------------------------------------
/tutoreval/merge_generations.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import json
3 |
4 |
5 | if __name__ == "__main__":
6 | parser = argparse.ArgumentParser()
7 | parser.add_argument("--model", default="princeton-nlp/Llemma-7B-32K-MathMix", type=str, help="Generator model")
8 | parser.add_argument("--dir", default="tutoreval/generations", type=str, help="output simulations")
9 | parser.add_argument("--ddp_worldsize", default=1, type=int, help="For data parallel. Sets the number of parallel instances")
10 | parser.add_argument("--closedbook", action="store_true", help="output simulations")
11 |
12 | args = parser.parse_args()
13 |
14 | if args.ddp_worldsize == 1:
15 | print("Generations merged.")
16 | exit()
17 |
18 | if args.closedbook:
19 | files = [f"{args.dir}/closedbook/{args.model}_{rank}_of_{args.ddp_worldsize}.json" for rank in range(args.ddp_worldsize)]
20 | save_file = f"{args.dir}/closedbook/{args.model}.json"
21 | else:
22 | files = [f"{args.dir}/openbook/{args.model}_{rank}_of_{args.ddp_worldsize}.json" for rank in range(args.ddp_worldsize)]
23 | save_file = f"{args.dir}/openbook/{args.model}.json"
24 |
25 | all_generations = []
26 | for file in files:
27 | with open(file) as f:
28 | all_generations += json.load(f)
29 |
30 |
31 |
32 | with open(save_file, "w") as f:
33 | json.dump(all_generations, f, indent=4)
34 | print("Generations merged.")
--------------------------------------------------------------------------------
/utils/generation_utils.py:
--------------------------------------------------------------------------------
1 | from transformers import StoppingCriteria
2 | import torch
3 |
4 | class EosListStoppingCriteria(StoppingCriteria):
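    """Stopping criterion that halts generation once the most recently generated tokens match eos_sequence."""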
5 | def __init__(self, eos_sequence = [835, 2799, 4080, 29901]):
6 | self.eos_sequence = eos_sequence
7 |
8 | def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
9 | last_ids = input_ids[:,-len(self.eos_sequence):].tolist()
10 | return self.eos_sequence in last_ids
11 |
12 |
13 |
14 |
15 | def generation_utils(query, args, tokenizer):
16 | """Format the queries for dialogue generation and set a stopping criterion. Edit this function to run other models."""
17 |
18 | if args.hf_chat_template:
19 | processed = [tokenizer.apply_chat_template([{"role": "user", "content": q.strip("\n")}], tokenize=False, add_generation_prompt=True) for q in query]
20 | else:
21 | # default formatting
22 | processed = [f"{tokenizer.bos_token}\nuser: {q}{tokenizer.eos_token}\nassistant:" for q in query]
23 |
24 | # custom formatting
25 | if "microsoft/phi" in args.model.lower():
26 | processed = [f"user: {q}\nassistant:" for q in query]
27 |
28 |
29 | # default stopping
30 | stop = [EosListStoppingCriteria(tokenizer.encode(tokenizer.eos_token))]
31 |
32 | # custom stopping
33 | if "wizardmath-7b-v1.0" in args.model.lower():
34 | stop = [EosListStoppingCriteria([13, 829, 29879, 29958])]
35 | elif "microsoft/phi" in args.model.lower():
36 | stop = [EosListStoppingCriteria(tokenizer.encode("\nuser:"))]
37 |
38 | return processed, stop
--------------------------------------------------------------------------------
/tutoreval/grade.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 |
4 | export OPENAI_API_KEY="" # your api key goes here
5 |
6 | model=${MOD:-"princeton-nlp/Llemma-7B-32K-MathMix"}
7 | closedbook=${CLOSEDBOOK:-false}
8 | grader=${GRADER:-"gpt-4-1106-preview"}
9 | dir=${DIR:-"tutoreval/generations"}
10 | ddp_worldsize=${DDP:-1} #data parallel uses the splits created during generation. Split the generations files if you want to grade faster.
11 |
12 |
13 | header="python -m tutoreval.grade"
14 | args=(
15 | --model ${model}
16 | --grader ${grader}
17 | --dir ${dir}
18 | --ddp_worldsize ${ddp_worldsize}
19 | $@
20 | )
21 |
22 | if [ $closedbook == true ]; then
23 | args+=(--closedbook)
24 | fi
25 |
26 | if [ ${ddp_worldsize} == 1 ]; then
27 | echo "${header} "${args[@]}""
28 | ${header} "${args[@]}"
29 | else
30 | for ((rank=0; rank<=ddp_worldsize-1; rank++)) ; do
31 | ranked_args=(${args[@]} --ddp_rank $rank)
32 | echo "${header} "${ranked_args[@]}""
33 | ${header} "${ranked_args[@]}" &
34 | done
35 | wait
36 | # merge graded files
37 | header="python -m tutoreval.merge_generations"
38 | merge_args=(
39 | --model ${model}
40 | --dir ${dir}
41 | --ddp_worldsize ${ddp_worldsize}
42 | )
43 |
44 | if [ $closedbook == true ]; then
45 | merge_args+=(--closedbook)
46 | fi
47 |
48 | echo "${header} "${merge_args[@]}""
49 | ${header} "${merge_args[@]}"
50 | fi
51 |
52 |
53 | header="python -m tutoreval.get_results"
54 | args=(
55 | --output_dir ${dir}
56 | --results_dir tutoreval/results
57 | --model ${model}
58 | $@
59 | )
60 |
61 | if [ ${closedbook} == true ]; then
62 | args+=(--closedbook)
63 | fi
64 |
65 | ${header} "${args[@]}"
66 |
--------------------------------------------------------------------------------
/tutoreval/templates/closedbook_grading_template.txt:
--------------------------------------------------------------------------------
1 | Your task is to evaluate the teaching abilities of a new AI system which is interacting with a student about a science topic. The student is trying to understand a science topic and has asked the AI a question, and I would like you to rate how well the AI system addressed the student's question.
2 |
3 | You should give scores from 0 to 3 for PRESENTATION and CORRECTNESS. Half points are allowed. Please refer to the following descriptions:
4 |
5 | PRESENTATION: the AI provides an engaging response which will make the student want to learn more. Examples of good presentation skills include: giving the response a clear and helpful structure, picking up on positive aspects of the student's contributions, using examples to clarify complicated ideas, explaining complicated arguments in detail, adding follow-up and broadening remarks, etc.
6 |
7 | CORRECTNESS: the AI correctly understands the question and the answer is true and accurate. The answer does not contain any false or misleading statements. The AI does not include any irrelevant information and does not omit any essential reasoning steps. Pay particular attention to reasoning and calculation mistakes.
8 |
9 | Here is the student's question:
10 |
11 | """
12 | {question}
13 | """
14 |
15 | To help you in your evaluation, we've compiled some ground-truth key points which a good AI system should cover in its answer. You MUST check that the AI's answer agrees with these key points. These key points have been checked by experts and are 100% correct. These key points are particularly useful for spotting CORRECTNESS errors.
16 |
17 | """
18 | Key points to cover:
19 | {key_points}
20 | """
21 |
22 | Here is the AI's answer:
23 |
24 | """
25 | {output}
26 | """
27 |
28 | Please present your scores as follows:
29 | PRESENTATION: [explanation]. GRADE: x/3
30 | CORRECTNESS: [explanation]. GRADE: x/3
31 |
--------------------------------------------------------------------------------
/tutoreval/get_results.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import json
3 | import os
4 | import pandas as pd
5 | from rich import print
6 |
7 | def get_all_results(df):
8 | pass
9 |
10 |
11 |
12 | if __name__ == "__main__":
13 | parser = argparse.ArgumentParser()
14 | parser.add_argument("--output_dir", default="tutoreval/generations")
15 | parser.add_argument("--results_dir", default="tutoreval/results")
16 | parser.add_argument("--closedbook", action="store_true")
17 | parser.add_argument("--model", default="princeton-nlp/Llemma-7B-32K-MathMix")
18 | args = parser.parse_args()
19 |
20 | if args.closedbook:
21 | file = f"{args.output_dir}/closedbook/{args.model}.json"
22 | results_file = f"{args.results_dir}/closedbook/{args.model}.json"
23 | else:
24 | file = f"{args.output_dir}/openbook/{args.model}.json"
25 | results_file = f"{args.results_dir}/openbook/{args.model}.json"
26 |
27 | with open(file) as f:
28 | results = json.load(f)
29 |
30 | df = pd.DataFrame(results)
31 |
32 | #scale
33 | df["presentation"] = 100*df["presentation"]/3
34 | df["correctness"] = 100*df["correctness"]/3
35 |
36 |
37 | results = {
38 | "total": df["correctness"].mean(),
39 | "presentation_score": df["presentation"].mean(),
40 | }
41 | # scientific domain
42 | results = results | df.groupby(["domain"])["correctness"].mean().to_dict()
43 |
44 | # difficulty
45 | results = results | df.groupby(["difficulty"])["correctness"].mean().to_dict()
46 |
47 | # misleading
48 | results["misleading_questions"] = df[df["misleading_question"]]["correctness"].mean()
49 |
50 | # answer_in_chapter
51 | results["answer_in_chapter"] = df[df["answer_in_chapter"]]["correctness"].mean()
52 |
53 |
54 | print(results)
55 | os.makedirs(os.path.dirname(results_file), exist_ok=True)
56 | with open(results_file, "w") as f:
57 | json.dump(results, f, indent=4)
58 |
--------------------------------------------------------------------------------
/tutoreval/templates/grading_template.txt:
--------------------------------------------------------------------------------
1 | Your task is to evaluate the teaching abilities of a new AI system which is interacting with a student about a science topic. The student and AI system are working together on a textbook chapter, and I would like you to rate how well the AI system addressed the student's question.
2 |
3 | You should give scores from 0 to 3 for PRESENTATION and CORRECTNESS. Half points are allowed. Please refer to the following descriptions:
4 |
5 | PRESENTATION: the AI provides an engaging response which will make the student want to learn more. Examples of good presentation skills include: giving the response a clear and helpful structure, picking up on positive aspects of the student's contributions, using examples to clarify complicated ideas, explaining complicated arguments in detail, adding follow-up and broadening remarks, etc.
6 |
7 | CORRECTNESS: the AI correctly understands the question and the answer is true and accurate. The answer does not contain any false or misleading statements. The AI does not include any irrelevant information and does not omit any essential reasoning steps. The AI also correctly relates the question to the chapter's content. Pay particular attention to reasoning and calculation mistakes.
8 |
9 | Here is the textbook chapter used for this interaction:
10 |
11 | """
12 | {chapter}
13 | """
14 |
15 | Here is the student's question:
16 |
17 | """
18 | {question}
19 | """
20 |
21 | To help you in your evaluation, we've compiled some ground-truth key points which a good AI system should cover in its answer. You MUST check that the AI's answer agrees with these key points. These key points have been checked by experts and are 100% correct. These key points are particularly useful for spotting CORRECTNESS errors.
22 |
23 | """
24 | Key points to cover:
25 | {key_points}
26 | """
27 |
28 | Here is the AI's answer:
29 |
30 | """
31 | {output}
32 | """
33 |
34 | Please present your scores as follows:
35 | PRESENTATION: [explanation]. GRADE: x/3
36 | CORRECTNESS: [explanation]. GRADE: x/3
37 |
--------------------------------------------------------------------------------
/tutoreval/generate.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | export OPENAI_API_KEY="" #your api keys go here
4 |
5 | model=${MOD:-"princeton-nlp/Llemma-7B-32K-MathMix"} #model to evaluate
6 | hf_chat_template=${CHATTEMPLATE:-true}
7 | output_dir=${OUT:-"tutoreval/generations"} #directory to save outputs
8 | batch_size=${BATCH:-1} #batch size during generation
9 | ddp_worldsize=${DDP:-1} #data parallel
10 | closedbook=${CLOSEDBOOK:-false} #TutorEval-ClosedBook evaluation
11 | bnb4bit=${QUANT:-false} #4bit quantization
12 |
13 |
14 |
15 |
16 | ##############
17 | # generate
18 | header="python -m tutoreval.generate"
19 | args=(
20 | --model ${model}
21 | --output_dir ${output_dir}
22 | --batch_size ${batch_size}
23 | --ddp_worldsize ${ddp_worldsize}
24 | $@
25 | )
26 |
27 | if [ $closedbook == true ]; then
28 | args+=(--closedbook)
29 | fi
30 |
31 | if [ $hf_chat_template == true ]; then
32 | args+=(--hf_chat_template)
33 | fi
34 |
35 | if [ $bnb4bit == true ]; then
36 | args+=(--bnb4bit)
37 | fi
38 |
39 | if [ ${ddp_worldsize} == 1 ]; then
40 | echo "${header} "${args[@]}""
41 | ${header} "${args[@]}"
42 | else
43 | for ((rank=0; rank<=ddp_worldsize-1; rank++)) ; do
44 | ranked_args=(${args[@]} --ddp_rank $rank)
45 | echo "${header} "${ranked_args[@]}""
46 | export CUDA_VISIBLE_DEVICES=$rank ; ${header} "${ranked_args[@]}" &
47 | done
48 | wait
49 | fi
50 |
51 | # The current script handles data-parallel and model-sharding separately: setting ddp_worldsize=1 with multiple GPUs will shard the model using device_map="auto".
52 | # When ddp_worldsize is greater than 1, this script automatically assigns a single GPU to each data fragment.
53 | # If you want to use both data-parallel and model sharding, edit CUDA_VISIBLE_DEVICES to fit your situation
54 |
55 |
56 | # merge files
57 | header="python -m tutoreval.merge_generations"
58 | merge_args=(
59 | --model ${model}
60 | --dir ${output_dir}
61 | --ddp_worldsize ${ddp_worldsize}
62 | )
63 |
64 | if [ $closedbook == true ]; then
65 | merge_args+=(--closedbook)
66 | fi
67 |
68 |
69 | echo "${header} "${merge_args[@]}""
70 | ${header} "${merge_args[@]}"
--------------------------------------------------------------------------------
/tokenization/tokenize_metamath.py:
--------------------------------------------------------------------------------
1 | from tqdm import tqdm
2 | import random
3 | from transformers import AutoTokenizer
4 | from datasets import Dataset, load_dataset
5 | import json
6 | import argparse
7 |
8 |
9 |
10 | def concat_conversations(dataset, num_concat, tokenizer):
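    """Concatenate groups of num_concat MetaMath query/response pairs into single training samples, using randomly chosen turn markers, and tokenize the result."""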
11 | l = len(dataset["query"])
12 |
13 | new_dataset = {k: [] for k in dataset.keys()}
14 | new_dataset["text"] = []
15 | new_dataset["input_ids"] = []
16 | new_dataset["attention_mask"] = []
17 | new_dataset["labels"] = []
18 |
19 | for k in tqdm(range(0, l, num_concat), desc="Concatenating and tokenizing"):
20 | for key in dataset.keys():
21 | new_dataset[key].append([dataset[key][i] for i in range(k, k+num_concat)])
22 |
23 | options = [
24 | ("\nquestion: ", "\nanswer: "),
25 | ("\nQuestion: ", "\nAnswer: "),
26 | ("\nProblem: ", "\nSolution: "),
27 | ("\nproblem: ", "\nsolution: "),
28 | ("\nuser: ", "\nassistant: "),
29 | ("\nassistant: ", "\nuser: ")
30 | ]
31 |
32 | turn0, turn1 = rng.sample(options, 1)[0]
33 | conversation = tokenizer.bos_token
34 | for i in range(k, k + num_concat):
35 | conversation+= "".join([turn0, dataset["query"][i], f"{tokenizer.eos_token}", turn1, dataset["response"][i], f"{tokenizer.eos_token}"])
36 | new_dataset["text"].append(conversation)
37 | new_dataset["input_ids"].append(tokenizer.encode(conversation, add_special_tokens=False))
38 | new_dataset["attention_mask"].append([1]*len(new_dataset["input_ids"][-1]))
39 | new_dataset["labels"].append(new_dataset["input_ids"][-1])
40 | return new_dataset
41 |
42 | if __name__ == "__main__":
43 | parser = argparse.ArgumentParser()
44 | parser.add_argument("--tokenizer", type=str, default="meta-llama/Llama-2-7b-hf", help="Choose the HF tokenizer")
45 | parser.add_argument("--num_concat", type=int, default=10, help="Number of MetaMath samples to concatenate")
46 | parser.add_argument("--save_dir", type=str, default="data/metamath_concat10_llama", help="Directory for saving the HF dataset")
47 | args = parser.parse_args()
48 | rng = random.Random(4)
49 |
50 |
51 | tokenizer = AutoTokenizer.from_pretrained(args.tokenizer)
52 |
53 | data = load_dataset("meta-math/MetaMathQA")
54 | data = data.shuffle(seed=42)
55 | data = data["train"].to_dict()
56 |
57 | tokenized = concat_conversations(data, args.num_concat, tokenizer)
58 | tokenized = Dataset.from_dict(tokenized)
59 | tokenized.save_to_disk(args.save_dir)
60 |
--------------------------------------------------------------------------------
/tutoreval/grade.py:
--------------------------------------------------------------------------------
1 | from tqdm import tqdm
2 | import argparse
3 | import json
4 | import re
5 | from utils.openai_utils import OpenAI
6 |
7 |
8 |
9 | def grade(grader_model, generations, args):
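    """Send each generation to the grader model and parse the PRESENTATION and CORRECTNESS scores (out of 3) from its reply; failures are recorded as "ERROR" with scores of 0."""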
10 | for sample in tqdm(generations):
11 | prompt = args.template.format(**sample)
12 | grading_prompt=[prompt]
13 | try:
14 | sample['grading_out'] = grader_model.complete(grading_prompt)
15 | grades = [float(d) for d in re.findall(pattern=r':\s?(\d.*)/3', string=sample["grading_out"])]
16 | sample["presentation"] = grades[0]
17 | sample["correctness"] = grades[1]
18 |
19 | except:
20 | sample['grading_out'] = "ERROR"
21 | sample["presentation"] = 0
22 | sample["correctness"] = 0
23 | return generations
24 |
25 |
26 | if __name__ == '__main__':
27 | parser = argparse.ArgumentParser()
28 | parser.add_argument("--model", default="princeton-nlp/Llemma-7B-32K-MathMix", help="Model whose outputs are evaluated")
29 | parser.add_argument("--dir", default="tutoreval/generations", help="Main directory where model outputs are stored")
30 | parser.add_argument("--closedbook", action="store_true", help="Selects the closedbook folder in main directory")
31 | parser.add_argument("--grader", default="gpt-4-1106-preview", help="OpenAI model used for grading")
32 | parser.add_argument("--ddp_worldsize", default=1, type=int, help="For data parallel. Sets the number of parallel instances")
33 | parser.add_argument("--ddp_rank", default=0, type=int, help="For data parallel. Set this to the data fragment to use for generation. Value should be in range(args.ddp_worldsize)")
34 | args = parser.parse_args()
35 |
36 |
37 | if args.closedbook:
38 | with open("tutoreval/templates/closedbook_grading_template.txt") as f:
39 | args.template = f.read()
40 | if args.ddp_worldsize > 1:
41 | generations_file = f"{args.dir}/closedbook/{args.model}_{args.ddp_rank}_of_{args.ddp_worldsize}.json"
42 | else:
43 | generations_file = f"{args.dir}/closedbook/{args.model}.json"
44 | else:
45 | with open("tutoreval/templates/grading_template.txt") as f:
46 | args.template = f.read()
47 | if args.ddp_worldsize > 1:
48 | generations_file = f"{args.dir}/openbook/{args.model}_{args.ddp_rank}_of_{args.ddp_worldsize}.json"
49 | else:
50 | generations_file = f"{args.dir}/openbook/{args.model}.json"
51 | with open(generations_file) as f:
52 | generations = json.load(f)
53 |
54 | grader_model = OpenAI(model=args.grader)
55 | print(grader_model.complete(["Hello! Introduce yourself please!"]))
56 |
57 | print("Grading")
58 | graded = grade(grader_model, generations, args)
59 |
60 | with open(generations_file, 'w') as file:
61 | json.dump(graded, file, indent=4)
--------------------------------------------------------------------------------
/tutoreval/README.md:
--------------------------------------------------------------------------------
1 | ## 🧑💻 Evaluating with TutorEval
2 |
3 | ### Requirements
4 |
5 | Please install the following packages:
6 |
7 | ```bash
8 | pip install torch flash_attn transformers accelerate bitsandbytes datasets pandas openai rich
9 | ```
10 |
11 | ### ✍️ Generating LM tutor outputs
12 |
13 | #### Basic usage
14 | `generate.py` generates the LM tutor's output for each question and saves the results under `./openbook`, or under `./closedbook` for TutorEval-ClosedBook. Pass the HuggingFace model name, or the path where the model is stored, with the `--model` flag.
15 |
16 | For example, to evaluate Llemma-7B-32K-MathMix on TutorEval:
17 | ```bash
18 | python -m tutoreval.generate --model princeton-nlp/Llemma-7B-32K-MathMix
19 | ```
20 |
21 | Use the `--closedbook` flag for TutorEval-ClosedBook:
22 | ```bash
23 | python -m tutoreval.generate --model princeton-nlp/Llemma-7B-32K-MathMix --closedbook
24 | ```
25 |
26 | #### Chat templates
27 | By default, TutorEval formats the LM tutor's prompt as a `user/assistant` dialogue. Some HuggingFace models recommend their own chat templates. To apply the tokenizer's chat template instead, pass the `--hf_chat_template` flag. For example, to evaluate Mistral-7B-Instruct-v0.2:
28 | ```bash
29 | python -m tutoreval.generate --model mistralai/Mistral-7B-Instruct-v0.2 --hf_chat_template
30 | ```
31 |
32 | To use custom dialogue formatting, we recommend editing `./utils/generation_utils.py`.
33 |
34 | #### Model sharding and data parallel
35 |
36 | To run larger models (e.g. [princeton-nlp/Llemma-34B-MathMix](https://huggingface.co/princeton-nlp/Llemma-34B-MathMix)), `generate.py` shards the model across the available GPUs with `device_map="auto"`, so no modifications are required.
37 |
38 | Evaluating a 7B model on TutorEval takes approximately 4 hours on a single A100 GPU, so we also provide a basic data-parallel implementation. The number of data parallel instances is specified with the `--ddp_worldsize` flag, and the specific instance to be run is specified with `--ddp_rank`.
39 |
40 | `generate.sh` provides an easy interface for running several instances of `generate.py` on multiple GPUs. For example, if 4 GPUs are available, you can evaluate Mistral-7B-Instruct-v0.2 on TutorEval-ClosedBook with
41 | ```bash
42 | MOD=mistralai/Mistral-7B-Instruct-v0.2 CLOSEDBOOK=true DDP=4 CHATTEMPLATE=true bash tutoreval/generate.sh
43 | ```
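
Under the hood, `generate.sh` launches one `generate.py` process per GPU and then merges the per-rank output files. As a minimal sketch (open-book setting, two GPUs, default output directory), this corresponds roughly to:
```bash
# one generate.py instance per GPU, each handling one data fragment
CUDA_VISIBLE_DEVICES=0 python -m tutoreval.generate --model princeton-nlp/Llemma-7B-32K-MathMix --output_dir tutoreval/generations --ddp_worldsize 2 --ddp_rank 0 &
CUDA_VISIBLE_DEVICES=1 python -m tutoreval.generate --model princeton-nlp/Llemma-7B-32K-MathMix --output_dir tutoreval/generations --ddp_worldsize 2 --ddp_rank 1 &
wait
# merge the per-rank generation files into a single JSON file
python -m tutoreval.merge_generations --model princeton-nlp/Llemma-7B-32K-MathMix --dir tutoreval/generations --ddp_worldsize 2
```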
44 |
45 | Note that `generate.sh` does not implement model parallel and data parallel simultaneously. Either the model will be sharded across all GPUs, or each GPU runs a separate instance of `generate.py`. If you have lots of GPUs available and you wish to use both methods at the same time, you can modify `generate.sh` to fit your needs by editing `CUDA_VISIBLE_DEVICES`.
46 |
47 | ### ☑️ Grading outputs with GPT-4
48 | `grade.py` grades the LM tutor outputs and updates `./openbook` and `./closedbook` with the GPT-4 grades.
49 | The script `./tutoreval/grade.sh` wraps `grade.py`, `merge_generations.py`, and `get_results.py` into a single grading workflow.
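
For example, to grade the openbook generations of Llemma-7B-32K-MathMix with GPT-4 (after adding your OpenAI API key to `grade.sh`), you can use:
```bash
MOD=princeton-nlp/Llemma-7B-32K-MathMix GRADER=gpt-4-1106-preview bash tutoreval/grade.sh
```
The aggregated TutorEval scores are printed and saved under `tutoreval/results`.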
50 |
--------------------------------------------------------------------------------
/utils/openai_utils.py:
--------------------------------------------------------------------------------
1 | from typing import List, Literal, Optional
2 | from dataclasses import dataclass
3 | import openai
4 | import os
5 | import time
6 | import json
7 |
8 | from filelock import FileLock
9 |
10 | MODEL_CONFIGS = {
11 | "gpt-3.5-turbo-1106": {
12 | "prompt_cost_per_token": 0.001 / 1000,
13 | "response_cost_per_token": 0.002 / 1000,
14 | },
15 | "gpt-3.5-turbo-0125": {
16 | "prompt_cost_per_token": 0.0005 / 1000,
17 | "response_cost_per_token": 0.0015 / 1000,
18 | },
19 | "gpt-4-1106-preview": {
20 | "prompt_cost_per_token": 0.01 / 1000,
21 | "response_cost_per_token": 0.03 / 1000,
22 | },
23 | "gpt-4-0125-preview": {
24 | "prompt_cost_per_token": 0.01 / 1000,
25 | "response_cost_per_token": 0.03 / 1000,
26 | },
27 | }
28 |
29 | @dataclass(frozen=True)
30 | class OpenAI:
31 | model: Literal["gpt-3.5-turbo-16k", "gpt-3.5-turbo", "gpt-4", "gpt-4-1106-preview"] = "gpt-3.5-turbo-16k"
32 |
33 | temperature: float = 0.7
34 |
35 | system_prompt: Optional[str] = None
36 |
37 | max_retries = 1
38 |
39 | log_file_path = "openai_usage.jsonl"
40 |
41 | def complete(self, conversation: List[str]) -> str:
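        """Send the conversation (alternating user/assistant messages, plus an optional system prompt) to the OpenAI chat API, retrying on errors, and return the completion text, or None after repeated failures. Token usage and cost are appended to log_file_path."""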
42 | config = MODEL_CONFIGS[self.model]
43 | openai.api_key = os.environ["OPENAI_API_KEY"]
44 | deployment_name = self.model
45 | retry_count = 0
46 |
47 |
48 | messages = []
49 | if self.system_prompt is not None:
50 | messages.append({"role": "system", "content": self.system_prompt})
51 | for i, prompt in enumerate(conversation):
52 | messages.append({"role": ("user" if i % 2 == 0 else "assistant"), "content": prompt})
53 |
54 | while True:
55 | try:
56 | response = openai.chat.completions.create(
57 | model=deployment_name,
58 | messages=messages,
59 | temperature=self.temperature,
60 | )
61 |
62 | break
63 | except Exception as error:
64 | if "Please retry after" in str(error):
65 | timeout = int(str(error).split("Please retry after ")[1].split(" second")[0]) + 2
66 | print(f"Wait {timeout}s before OpenAI API retry ({error})")
67 | time.sleep(timeout)
68 | elif retry_count < self.max_retries:
69 | print(f"OpenAI API retry for {retry_count} times ({error})")
70 | time.sleep(2)
71 | retry_count += 1
72 | else:
73 | print(f"OpenAI API failed for {retry_count} times ({error})")
74 | return None
75 |
76 | self.log_usage(config, response.usage)
77 |
78 | generation = response.choices[0].message.content
79 | return generation
80 |
81 | def log_usage(self, config, usage):
82 | usage_log = {"prompt_tokens": usage.prompt_tokens, "completion_tokens": usage.completion_tokens}
83 | usage_log["prompt_cost"] = config["prompt_cost_per_token"] * usage.prompt_tokens
84 | usage_log["completion_cost"] = config["response_cost_per_token"] * usage.completion_tokens
85 | usage_log["cost"] = usage_log["prompt_cost"] + usage_log["completion_cost"]
86 | usage_log["model"] = self.model
87 | usage_log["user"] = os.getlogin()
88 |
89 | with FileLock(self.log_file_path + ".lock"):
90 | with open(self.log_file_path, "a") as f:
91 | f.write(json.dumps(usage_log) + "\n")
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 |
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 |
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 |
119 | # SageMath parsed files
120 | *.sage.py
121 |
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 |
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 |
135 | # Rope project settings
136 | .ropeproject
137 |
138 | # mkdocs documentation
139 | /site
140 |
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 |
146 | # Pyre type checker
147 | .pyre/
148 |
149 | # pytype static type analyzer
150 | .pytype/
151 |
152 | # Cython debug symbols
153 | cython_debug/
154 |
155 | # PyCharm
156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | # and can be added to the global gitignore or merged into this file. For a more nuclear
159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 |
--------------------------------------------------------------------------------
/tokenization/tokenize_tutorchat.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | from transformers import AutoTokenizer
4 | from datasets import Dataset, DatasetDict, load_dataset
5 | import random
6 | import argparse
7 |
8 |
9 |
10 | def clean_and_assign(name, all_text):
11 | """takes a simulated conversation, applies basic cleaning, and assigns student/teacher roles to help split the conversation into a dialogue"""
12 | # truncate between first and last occurrence of ###
13 | first = all_text.find("###")
14 | if first <= len(all_text)//2:
15 | all_text = all_text[first+3:]
16 | last = all_text.rfind("###")
17 | if last >= len(all_text)//2:
18 | all_text = all_text[:last]
19 | all_text = all_text.replace("###", "").strip("\n ")
20 |
21 | # assign roles
22 | if "generateexam" in name:
23 | key0, key1 = "QUESTION", "ANSWER"
24 | options = [
25 | ("\nquestion: ", "\nanswer: "),
26 | ("\nuser: ", "\nassistant: "),
27 | ("\nassistant: ", "\nuser: ")
28 | ]
29 | turn0, turn1 = options[rng.sample([0,1,2],1)[0]]
30 | elif "studentstart" in name:
31 | key0, key1 = "STUDENT", "TEACHER"
32 | turn0, turn1 = "\nuser: ", "\nassistant: "
33 | elif "teacherstart" in name:
34 | key0, key1 = "TEACHER", "STUDENT"
35 | turn0, turn1 = "\nassistant: ", "\nuser: "
36 |
37 | # ignore badly formatted texts
38 | if key0 not in all_text:
39 | return
40 | return key0, key1, turn0, turn1, all_text
41 |
42 |
43 | def tokenize(dialogue, tokenizer, args):
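    """Tokenize a TutorChat dialogue turn by turn into input_ids, attention_mask and labels. The chapter prompt and (in most modes) the student/user turns are masked with -100 in the labels, so the loss focuses on the tutor's turns."""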
44 | input_ids = []
45 | labels = []
46 | processed_conversation = ""
47 | if dialogue["mode"] == "openbook":
48 | for m, turn in enumerate(dialogue["conversation"]):
49 | if m == 0:
50 | turn_text = turn + f"{tokenizer.eos_token}\n{tokenizer.bos_token}"
51 | tokenized_turn = tokenizer.encode(turn_text, add_special_tokens=False)
52 | labels += [-100]*len(tokenized_turn)
53 | elif m % 2 == 0:
54 | turn_text = "\nassistant: " + turn + f"{tokenizer.eos_token}"
55 | tokenized_turn = tokenizer.encode(turn_text, add_special_tokens=False)
56 | labels += tokenized_turn
57 | elif m % 2 == 1:
58 | turn_text = "\nuser: " + turn + f"{tokenizer.eos_token}"
59 | tokenized_turn = tokenizer.encode(turn_text, add_special_tokens=False)
60 | labels += [-100]*len(tokenized_turn)
61 | input_ids += tokenized_turn
62 | processed_conversation += turn_text
63 |
64 | elif dialogue["mode"] == "closedbook":
65 | for m, turn in enumerate(dialogue["conversation"]):
66 | if m % 2 == 0:
67 | turn_text = "\nassistant: " + turn + f"{tokenizer.eos_token}"
68 | if m == 0:
69 | turn_text = f"{tokenizer.bos_token}"+turn_text
70 | tokenized_turn = tokenizer.encode(turn_text, add_special_tokens=False)
71 | labels += tokenized_turn
72 | elif m % 2 == 1:
73 | turn_text = "\nuser: " + turn + f"{tokenizer.eos_token}"
74 | tokenized_turn = tokenizer.encode(turn_text, add_special_tokens=False)
75 | labels += [-100]*len(tokenized_turn)
76 | input_ids += tokenized_turn
77 | processed_conversation += turn_text
78 |
79 | elif dialogue["mode"] == "singleturn":
80 | name = dialogue["name"]
81 | # get chapter text and make labels
82 | if "studentstart" in name:
83 | turn_text = dialogue["conversation"][0] + f"{tokenizer.eos_token}\n{tokenizer.bos_token}"
84 | tokenized_turn = tokenizer.encode(turn_text, add_special_tokens=False)
85 | input_ids += tokenized_turn
86 | labels += [-100]*len(tokenized_turn)
87 | processed_conversation += turn_text
88 | else:
89 | processed_conversation += f"{tokenizer.bos_token}"
90 |
91 | all_text = dialogue["conversation"][-1]
92 | key0, key1, turn0, turn1, all_text = clean_and_assign(name, all_text)
93 |
94 | # split by keys
95 | qa_pairs = all_text.split(key0)
96 | qa_lists = [s.split(key1) for s in qa_pairs]
97 | qa_lists = [s for s in qa_lists if len(s) == 2]
98 | qa_flat = [t.strip(": \n") for s in qa_lists for t in s]
99 | qa_flat = [t for t in qa_flat if t != ""]
100 |
101 | # add turns and make roles
102 | for m, turn in enumerate(qa_flat):
103 | if m % 2 == 0:
104 | turn_text = turn0 + turn + f"{tokenizer.eos_token}"
105 | else:
106 | turn_text = turn1 + turn + f"{tokenizer.eos_token}"
107 | processed_conversation+= turn_text
108 | tokenized_turn = tokenizer.encode(turn_text, add_special_tokens=False)
109 | input_ids += tokenized_turn
110 | if "studentstart" in name and m % 2 == 0:
111 | labels += [-100]*len(tokenized_turn)
112 | else:
113 | labels += tokenized_turn
114 |
115 | dialogue["input_ids"] = input_ids
116 | dialogue["attention_mask"] = [1]*len(input_ids)
117 | dialogue["labels"] = labels
118 | dialogue["processed_conversation"] = processed_conversation
119 | return dialogue
120 |
121 |
122 |
123 |
124 | if __name__ == "__main__":
125 | parser = argparse.ArgumentParser()
126 | parser.add_argument("--tokenizer", type=str, default="meta-llama/Llama-2-7b-hf", help="Choose the HF tokenizer")
127 | parser.add_argument("--stem_only", action="store_true", help="Tokenize only STEM domains")
128 | parser.add_argument("--save_dir", type=str, default="data/tokenized_tutorchat_llama", help="Directory for saving the HF dataset")
129 | args = parser.parse_args()
130 |
131 | if args.stem_only:
132 | domains = ["bio", "chem", "eng", "geo", "math", "med", "phys", "stats"]
133 | else:
134 | domains = []
135 |
136 | tokenizer = AutoTokenizer.from_pretrained(args.tokenizer)
137 | rng = random.Random(4)
138 |
139 |
140 | all_dialogues = load_dataset("princeton-nlp/TutorChat")
141 | dialogues = all_dialogues.filter(lambda x : x["textbook_folder"].split("/")[1] in domains, num_proc=8) if domains != [] else all_dialogues
142 | validation = dialogues["validation"]
143 | validation = validation.map(lambda x: tokenize(x, tokenizer, args), num_proc=4)
144 | train = dialogues["train"]
145 | train = train.map(lambda x: tokenize(x, tokenizer, args), num_proc=4)
146 |
147 |
148 | tokenized = DatasetDict({
149 | "train": train,
150 | "validation": validation
151 | })
152 |
153 | tokenized.save_to_disk(args.save_dir)
154 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # *Language Models as Science Tutors*
2 |
3 | This is the official repository for [*Language Models as Science Tutors*](https://arxiv.org/abs/2402.11111).
4 |
5 |
6 | ## TutorEval
7 |
8 |
9 |
10 |
11 |
12 |
13 |