├── tutoreval ├── templates │ ├── closedbook_generation_template.txt │ ├── generation_template.txt │ ├── closedbook_grading_template.txt │ └── grading_template.txt ├── merge_generations.py ├── grade.sh ├── get_results.py ├── generate.sh ├── grade.py ├── README.md └── generate.py ├── assets ├── main_radar_fig.pdf └── main_radar_fig.png ├── tokenization ├── mathmix_combine.py ├── tokenize_metamath.py └── tokenize_tutorchat.py ├── utils ├── generation_utils.py └── openai_utils.py ├── .gitignore └── README.md /tutoreval/templates/closedbook_generation_template.txt: -------------------------------------------------------------------------------- 1 | {{QUESTION}} -------------------------------------------------------------------------------- /assets/main_radar_fig.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/princeton-nlp/LM-Science-Tutor/HEAD/assets/main_radar_fig.pdf -------------------------------------------------------------------------------- /assets/main_radar_fig.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/princeton-nlp/LM-Science-Tutor/HEAD/assets/main_radar_fig.png -------------------------------------------------------------------------------- /tutoreval/templates/generation_template.txt: -------------------------------------------------------------------------------- 1 | Here is a passage from a textbook I am trying to understand: 2 | 3 | """ 4 | {{CHAPTER}} 5 | """ 6 | 7 | {{QUESTION}} 8 | -------------------------------------------------------------------------------- /tokenization/mathmix_combine.py: -------------------------------------------------------------------------------- 1 | from datasets import Dataset, load_from_disk, concatenate_datasets 2 | import argparse 3 | 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--tutorchat", default="data/tokenized_tutorchat_stem_llama", type=str) 7 | parser.add_argument("--metamath", default="data/tokenized_metamath_concat10_llama", type=str) 8 | parser.add_argument("--save_dir", default="data/mathmix_llama") 9 | args = parser.parse_args() 10 | 11 | tutorchat = load_from_disk(args.tutorchat)["train"] 12 | metamath = load_from_disk(args.metamath) 13 | 14 | to_remove = [k for k in tutorchat.features.keys() if k not in ["input_ids", "attention_mask", "labels", "processed_conversation"]] 15 | tutorchat = tutorchat.remove_columns(to_remove) 16 | tutorchat.rename_column("processed_conversation", "text") 17 | 18 | to_remove = [k for k in metamath.features.keys() if k not in ["input_ids", "attention_mask", "labels", "text"]] 19 | metamath = metamath.remove_columns(to_remove) 20 | 21 | mathmix = concatenate_datasets([tutorchat, metamath]) 22 | mathmix = mathmix.shuffle(seed=42) 23 | mathmix.save_to_disk(args.save_dir) -------------------------------------------------------------------------------- /tutoreval/merge_generations.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | 5 | if __name__ == "__main__": 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument("--model", default="princeton-nlp/Llemma-7B-32K-MathMix", type=str, help="Generator model") 8 | parser.add_argument("--dir", default="tutoreval/generations", type=str, help="output simulations") 9 | parser.add_argument("--ddp_worldsize", default=1, type=int, help="For data parallel. 
Sets the number of parallel instances") 10 | parser.add_argument("--closedbook", action="store_true", help="output simulations") 11 | 12 | args = parser.parse_args() 13 | 14 | if args.ddp_worldsize == 1: 15 | print("Generations merged.") 16 | exit() 17 | 18 | if args.closedbook: 19 | files = [f"{args.dir}/closedbook/{args.model}_{rank}_of_{args.ddp_worldsize}.json" for rank in range(args.ddp_worldsize)] 20 | save_file = f"{args.dir}/closedbook/{args.model}.json" 21 | else: 22 | files = [f"{args.dir}/openbook/{args.model}_{rank}_of_{args.ddp_worldsize}.json" for rank in range(args.ddp_worldsize)] 23 | save_file = f"{args.dir}/openbook/{args.model}.json" 24 | 25 | all_generations = [] 26 | for file in files: 27 | with open(file) as f: 28 | all_generations += json.load(f) 29 | 30 | 31 | 32 | with open(save_file, "w") as f: 33 | json.dump(all_generations, f, indent=4) 34 | print("Generations merged.") -------------------------------------------------------------------------------- /utils/generation_utils.py: -------------------------------------------------------------------------------- 1 | from transformers import StoppingCriteria 2 | import torch 3 | 4 | class EosListStoppingCriteria(StoppingCriteria): 5 | def __init__(self, eos_sequence = [835, 2799, 4080, 29901]): 6 | self.eos_sequence = eos_sequence 7 | 8 | def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool: 9 | last_ids = input_ids[:,-len(self.eos_sequence):].tolist() 10 | return self.eos_sequence in last_ids 11 | 12 | 13 | 14 | 15 | def generation_utils(query, args, tokenizer): 16 | """Format the queries for dialogue generation and set a stopping criterion. Edit this function to run other models.""" 17 | 18 | if args.hf_chat_template: 19 | processed = [tokenizer.apply_chat_template([{"role": "user", "content": q.strip("\n")}], tokenize=False, add_generation_prompt=True) for q in query] 20 | else: 21 | # default formatting 22 | processed = [f"{tokenizer.bos_token}\nuser: {q}{tokenizer.eos_token}\nassistant:" for q in query] 23 | 24 | # custom formatting 25 | if "microsoft/phi" in args.model.lower(): 26 | processed = [f"user: {q}\nassistant:" for q in query] 27 | 28 | 29 | # default stopping 30 | stop = [EosListStoppingCriteria(tokenizer.encode(tokenizer.eos_token))] 31 | 32 | # custom stopping 33 | if "wizardmath-7b-v1.0" in args.model.lower(): 34 | stop = [EosListStoppingCriteria([13, 829, 29879, 29958])] 35 | elif "microsoft/phi" in args.model.lower(): 36 | stop = [EosListStoppingCriteria(tokenizer.encode("\nuser:"))] 37 | 38 | return processed, stop -------------------------------------------------------------------------------- /tutoreval/grade.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | export OPENAI_API_KEY="" # your api key goes here 5 | 6 | model=${MOD:-"princeton-nlp/Llemma-7B-32K-MathMix"} 7 | closedbook=${CLOSEDBOOK:-false} 8 | grader=${GRADER:-"gpt-4-1106-preview"} 9 | dir=${DIR:-"tutoreval/generations"} 10 | ddp_worldsize=${DDP:-1} #data parallel uses the splits created during generation. Split the generations files if you want to grade faster. 
11 | 12 | 13 | header="python -m tutoreval.grade" 14 | args=( 15 | --model ${model} 16 | --grader ${grader} 17 | --dir ${dir} 18 | --ddp_worldsize ${ddp_worldsize} 19 | $@ 20 | ) 21 | 22 | if [ $closedbook == true ]; then 23 | args+=(--closedbook) 24 | fi 25 | 26 | if [ ${ddp_worldsize} == 1 ]; then 27 | echo "${header} "${args[@]}"" 28 | ${header} "${args[@]}" 29 | else 30 | for ((rank=0; rank<=ddp_worldsize-1; rank++)) ; do 31 | ranked_args=(${args[@]} --ddp_rank $rank) 32 | echo "${header} "${ranked_args[@]}"" 33 | ${header} "${ranked_args[@]}" & 34 | done 35 | wait 36 | # merge graded files 37 | header="python -m tutoreval.merge_generations" 38 | merge_args=( 39 | --model ${model} 40 | --output_dir ${output_dir} 41 | --ddp_worldsize ${ddp_worldsize} 42 | ) 43 | 44 | if [ $closedbook == true ]; then 45 | merge_args+=(--closedbook) 46 | fi 47 | 48 | echo "${header} "${merge_args[@]}"" 49 | ${header} "${merge_args[@]}" 50 | fi 51 | 52 | 53 | header="python -m tutoreval.get_results" 54 | args=( 55 | --output_dir ${dir} 56 | --results_dir tutoreval/results 57 | --model ${model} 58 | $@ 59 | ) 60 | 61 | if [ ${closedbook} == true ]; then 62 | args+=(--closedbook) 63 | fi 64 | 65 | ${header} "${args[@]}" 66 | -------------------------------------------------------------------------------- /tutoreval/templates/closedbook_grading_template.txt: -------------------------------------------------------------------------------- 1 | You task is to evaluate the teaching abilities of a new AI system which is interacting with a student about a science topic. The student is trying to understand a science topic and has asked the AI a question, and I would like you to rate how well the AI system addressed the student's question. 2 | 3 | You should give scores from 0 to 3 for PRESENTATION and CORRECTNESS. Half points are allowed. Please refer to the following descriptions: 4 | 5 | PRESENTATION: the AI provides an engaging response which will make the student want to learn more. Examples of good presentation skills include: giving the response a clear and helpful structure, picking up on positive aspects of the student's contributions, using examples to clarify complicated ideas, explaining complicated arguments in detail, adding follow-up and broadening remarks, etc. 6 | 7 | CORRECTNESS: the AI correctly understands the question and the answer is true and accurate. The answer does not contain any false or misleading statements. The AI does not include any irrelevant information and does not omit any essential reasoning steps. Pay particular attention to reasoning and calculation mistakes. 8 | 9 | Here is the student's question: 10 | 11 | """ 12 | {question} 13 | """ 14 | 15 | To help you in your evaluation, we've compiled some ground-truth key points which a good AI system should cover in its answer. You MUST check that the AI's answer agrees with these key points. These key points have been checked by experts and are 100% correct. These key points are particularly useful for spotting CORRECTNESS errors. 16 | 17 | """ 18 | Key points to cover: 19 | {key_points} 20 | """ 21 | 22 | Here is the AI's answer: 23 | 24 | """ 25 | {output} 26 | """ 27 | 28 | Please present your scores as follows: 29 | PRESENTATION: [explanation]. GRADE: x/3 30 | CORRECTNESS: [explanation]. 
GRADE: x/3 31 | -------------------------------------------------------------------------------- /tutoreval/get_results.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import pandas as pd 5 | from rich import print 6 | 7 | def get_all_results(df): 8 | pass 9 | 10 | 11 | 12 | if __name__ == "__main__": 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("--output_dir", default="tutoreval/generations") 15 | parser.add_argument("--results_dir", default="tutoreval/results") 16 | parser.add_argument("--closedbook", action="store_true") 17 | parser.add_argument("--model", default="princeton-nlp/Llemma-7B-32K-MathMix") 18 | args = parser.parse_args() 19 | 20 | if args.closedbook: 21 | file = f"{args.output_dir}/closedbook/{args.model}.json" 22 | results_file = f"{args.results_dir}/closedbook/{args.model}.json" 23 | else: 24 | file = f"{args.output_dir}/openbook/{args.model}.json" 25 | results_file = f"{args.results_dir}/openbook/{args.model}.json" 26 | 27 | with open(file) as f: 28 | results = json.load(f) 29 | 30 | df = pd.DataFrame(results) 31 | 32 | #scale 33 | df["presentation"] = 100*df["presentation"]/3 34 | df["correctness"] = 100*df["correctness"]/3 35 | 36 | 37 | results = { 38 | "total": df["correctness"].mean(), 39 | "presentation_score": df["presentation"].mean(), 40 | } 41 | # scientific domain 42 | results = results | df.groupby(["domain"])["correctness"].mean().to_dict() 43 | 44 | # difficulty 45 | results = results | df.groupby(["difficulty"])["correctness"].mean().to_dict() 46 | 47 | # misleading 48 | results["misleading_questions"] = df[df["misleading_question"]]["correctness"].mean() 49 | 50 | # answer_in_chapter 51 | results["answer_in_chapter"] = df[df["answer_in_chapter"]]["correctness"].mean() 52 | 53 | 54 | print(results) 55 | os.makedirs(os.path.dirname(results_file), exist_ok=True) 56 | with open(results_file, "w") as f: 57 | json.dump(results, f, indent=4) 58 | -------------------------------------------------------------------------------- /tutoreval/templates/grading_template.txt: -------------------------------------------------------------------------------- 1 | Your task is to evaluate the teaching abilities of a new AI system which is interacting with a student about a science topic. The student and AI system are working together on a textbook chapter, and I would like you to rate how well the AI system addressed the student's question. 2 | 3 | You should give scores from 0 to 3 for PRESENTATION and CORRECTNESS. Half points are allowed. Please refer to the following descriptions: 4 | 5 | PRESENTATION: the AI provides an engaging response which will make the student want to learn more. Examples of good presentation skills include: giving the response a clear and helpful structure, picking up on positive aspects of the student's contributions, using examples to clarify complicated ideas, explaining complicated arguments in detail, adding follow-up and broadening remarks, etc. 6 | 7 | CORRECTNESS: the AI correctly understands the question and the answer is true and accurate. The answer does not contain any false or misleading statements. The AI does not include any irrelevant information and does not omit any essential reasoning steps. The AI also correctly relates the question to the chapter's content. Pay particular attention to reasoning and calculation mistakes. 
8 | 9 | Here is the textbook chapter used for this interaction: 10 | 11 | """ 12 | {chapter} 13 | """ 14 | 15 | Here is the student's question: 16 | 17 | """ 18 | {question} 19 | """ 20 | 21 | To help you in your evaluation, we've compiled some ground-truth key points which a good AI system should cover in its answer. You MUST check that the AI's answer agrees with these key points. These key points have been checked by experts and are 100% correct. These key points are particularly useful for spotting CORRECTNESS errors. 22 | 23 | """ 24 | Key points to cover: 25 | {key_points} 26 | """ 27 | 28 | Here is the AI's answer: 29 | 30 | """ 31 | {output} 32 | """ 33 | 34 | Please present your scores as follows: 35 | PRESENTATION: [explanation]. GRADE: x/3 36 | CORRECTNESS: [explanation]. GRADE: x/3 37 | -------------------------------------------------------------------------------- /tutoreval/generate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export OPENAI_API_KEY="" #your api keys go here 4 | 5 | model=${MOD:-"princeton-nlp/Llemma-7B-32K-MathMix"} #model to evaluate 6 | hf_chat_template=${CHATTEMPLATE:-true} 7 | output_dir=${OUT:-"tutoreval/generations"} #directory to save outputs 8 | batch_size=${BATCH:-1} #batch size during generation 9 | ddp_worldsize=${DDP:-1} #data parallel 10 | closedbook=${CLOSEDBOOK:-false} #TutorEval-ClosedBook evaluation 11 | bnb4bit=${QUANT:-false} #4bit quantization 12 | 13 | 14 | 15 | 16 | ############## 17 | # generate 18 | header="python -m tutoreval.generate" 19 | args=( 20 | --model ${model} 21 | --output_dir ${output_dir} 22 | --batch_size ${batch_size} 23 | --ddp_worldsize ${ddp_worldsize} 24 | $@ 25 | ) 26 | 27 | if [ $closedbook == true ]; then 28 | args+=(--closedbook) 29 | fi 30 | 31 | if [ $hf_chat_template == true ]; then 32 | args+=(--hf_chat_template) 33 | fi 34 | 35 | if [ $bnb4bit == true ]; then 36 | args+=(--bnb4bit) 37 | fi 38 | 39 | if [ ${ddp_worldsize} == 1 ]; then 40 | echo "${header} "${args[@]}"" 41 | ${header} "${args[@]}" 42 | else 43 | for ((rank=0; rank<=ddp_worldsize-1; rank++)) ; do 44 | ranked_args=(${args[@]} --ddp_rank $rank) 45 | echo "${header} "${ranked_args[@]}"" 46 | export CUDA_VISIBLE_DEVICES=$rank ; ${header} "${ranked_args[@]}" & 47 | done 48 | wait 49 | fi 50 | 51 | # The current script handles data-parallel and model-sharding separately: setting ddp_worldsize=1 with multiple GPUs will shard the model using device_map="auto". 52 | # When ddp_worldsize is greater than 1, this script automatically assigns a single GPU to each data fragment. 
53 | # If you want to use both data-parallel and model sharding, edit CUDA_VISIBLE_DEVICES to fit your situation 54 | 55 | 56 | # merge files 57 | header="python -m tutoreval.merge_generations" 58 | merge_args=( 59 | --model ${model} 60 | --dir ${output_dir} 61 | --ddp_worldsize ${ddp_worldsize} 62 | ) 63 | 64 | if [ $closedbook == true ]; then 65 | merge_args+=(--closedbook) 66 | fi 67 | 68 | 69 | echo "${header} "${merge_args[@]}"" 70 | ${header} "${merge_args[@]}" -------------------------------------------------------------------------------- /tokenization/tokenize_metamath.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | import random 3 | from transformers import AutoTokenizer 4 | from datasets import Dataset, load_dataset 5 | import json 6 | import argparse 7 | 8 | 9 | 10 | def concat_conversations(dataset, num_concat, tokenizer): 11 | l = len(dataset["query"]) 12 | 13 | new_dataset = {k: [] for k in dataset.keys()} 14 | new_dataset["text"] = [] 15 | new_dataset["input_ids"] = [] 16 | new_dataset["attention_mask"] = [] 17 | new_dataset["labels"] = [] 18 | 19 | for k in tqdm(range(0, l, num_concat), desc="Concatenating and tokenizing"): 20 | for key in dataset.keys(): 21 | new_dataset[key].append([dataset[key][i] for i in range(k, k+num_concat)]) 22 | 23 | options = [ 24 | ("\nquestion: ", "\nanswer: "), 25 | ("\nQuestion: ", "\nAnswer: "), 26 | ("\nProblem: ", "\nSolution: "), 27 | ("\nproblem: ", "\nsolution: "), 28 | ("\nuser: ", "\nassistant: "), 29 | ("\nassistant: ", "\nuser: ") 30 | ] 31 | 32 | turn0, turn1 = rng.sample(options, 1)[0] 33 | conversation = tokenizer.bos_token 34 | for i in range(k, k + num_concat): 35 | conversation+= "".join([turn0, dataset["query"][i], f"{tokenizer.eos_token}", turn1, dataset["response"][i], f"{tokenizer.eos_token}"]) 36 | new_dataset["text"].append(conversation) 37 | new_dataset["input_ids"].append(tokenizer.encode(conversation, add_special_tokens=False)) 38 | new_dataset["attention_mask"].append([1]*len(new_dataset["input_ids"][-1])) 39 | new_dataset["labels"].append(new_dataset["input_ids"][-1]) 40 | return new_dataset 41 | 42 | if __name__ == "__main__": 43 | parser = argparse.ArgumentParser() 44 | parser.add_argument("--tokenizer", type=str, default="meta-llama/Llama-2-7b-hf", help="Choose the HF tokenizer") 45 | parser.add_argument("--num_concat", type=int, default=10, help="Number of MetaMath samples to concatenate") 46 | parser.add_argument("--save_dir", type=str, default="data/metamath_concat10_llama", help="Directory for saving the HF dataset") 47 | args = parser.parse_args() 48 | rng = random.Random(4) 49 | 50 | 51 | tokenizer = AutoTokenizer.from_pretrained(args.tokenizer) 52 | 53 | data = load_dataset("meta-math/MetaMathQA") 54 | data = data.shuffle(seed=42) 55 | data = data["train"].to_dict() 56 | 57 | tokenized = concat_conversations(data, args.num_concat, tokenizer) 58 | tokenized = Dataset.from_dict(tokenized) 59 | tokenized.save_to_disk(args.save_dir) 60 | -------------------------------------------------------------------------------- /tutoreval/grade.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | import argparse 3 | import json 4 | import re 5 | from utils.openai_utils import OpenAI 6 | 7 | 8 | 9 | def grade(grader_model, generations, args): 10 | for sample in tqdm(generations): 11 | prompt = args.template.format(**sample) 12 | grading_prompt=[prompt] 13 | try: 14 | sample['grading_out'] = 
grader_model.complete(grading_prompt) 15 | grades = [float(d) for d in re.findall(pattern=r':\s?(\d.*)/3', string=sample["grading_out"])] 16 | sample["presentation"] = grades[0] 17 | sample["correctness"] = grades[1] 18 | 19 | except: 20 | sample['grading_out'] = "ERROR" 21 | sample["presentation"] = 0 22 | sample["correctness"] = 0 23 | return generations 24 | 25 | 26 | if __name__ == '__main__': 27 | parser = argparse.ArgumentParser() 28 | parser.add_argument("--model", default="princeton-nlp/Llemma-7B-32K-MathMix", help="Model whose outputs are evaluated") 29 | parser.add_argument("--dir", default="tutoreval/generations", help="Main directory where model outputs are stored") 30 | parser.add_argument("--closedbook", action="store_true", help="Selects the closedbook folder in main directory") 31 | parser.add_argument("--grader", default="gpt-4-1106-preview", help="OpenAI model used for grading") 32 | parser.add_argument("--ddp_worldsize", default=1, type=int, help="For data parallel. Sets the number of parallel instances") 33 | parser.add_argument("--ddp_rank", default=0, type=int, help="For data parallel. Set this to the data fragment to use for generation. Value should be in range(args.ddp_worldsize)") 34 | args = parser.parse_args() 35 | 36 | 37 | if args.closedbook: 38 | with open("tutoreval/templates/closedbook_grading_template.txt") as f: 39 | args.template = f.read() 40 | if args.ddp_worldsize > 1: 41 | generations_file = f"{args.dir}/closedbook/{args.model}_{args.ddp_rank}_of_{args.ddp_worldsize}.json" 42 | else: 43 | generations_file = f"{args.dir}/closedbook/{args.model}.json" 44 | else: 45 | with open("tutoreval/templates/grading_template.txt") as f: 46 | args.template = f.read() 47 | if args.ddp_worldsize > 1: 48 | generations_file = f"{args.dir}/openbook/{args.model}_{args.ddp_rank}_of_{args.ddp_worldsize}.json" 49 | else: 50 | generations_file = f"{args.dir}/openbook/{args.model}.json" 51 | with open(generations_file) as f: 52 | generations = json.load(f) 53 | 54 | grader_model = OpenAI(model=args.grader) 55 | print(grader_model.complete(["Hello! Introduce yourself please!"])) 56 | 57 | print("Grading") 58 | graded = grade(grader_model, generations, args) 59 | 60 | with open(generations_file, 'w') as file: 61 | json.dump(graded, file, indent=4) -------------------------------------------------------------------------------- /tutoreval/README.md: -------------------------------------------------------------------------------- 1 | ## 🧑‍💻 Evaluating with TutorEval 2 | 3 | ### Requirements 4 | 5 | Please install the following packages: 6 | 7 | ```python 8 | pip install torch flash_attn transformers accelerate bitsandbytes datasets pandas openai rich 9 | ``` 10 | 11 | ### ✍️ Generating LM tutor outputs 12 | 13 | #### Basic usage 14 | `generate.py` constructs the LM tutor outputs for each question and saves them under `./openbook`, or `./closedbook` for TutorEval-ClosedBook. Use the HuggingFace model name or the path where the model is stored with the `--model` flag. 15 | 16 | For example, to evaluate Llemma-7B-32K-MathMix on TutorEval: 17 | ```python 18 | python -m tutoreval.generate --model princeton-nlp/Llemma-7B-32K-MathMix 19 | ``` 20 | 21 | Use the `--closedbook` flag for TutorEval-ClosedBook: 22 | ```python 23 | python -m tutoreval.generate --model princeton-nlp/Llemma-7B-32K-MathMix --closedbook 24 | ``` 25 | 26 | #### Chat templates 27 | By default, TutorEval formats the LM tutor's prompt as a `user/assistant` dialogue. 
Some HuggingFace models recommend using other chat templates. To use default tokenizer chat templates, use the `--hf_chat_template` flag. For example, to evaluate Mistral-7B-Instruct-v0.2:
 28 | ```python
 29 | python -m tutoreval.generate --model mistralai/Mistral-7B-Instruct-v0.2 --hf_chat_template
 30 | ```
 31 | 
 32 | To use custom dialogue formatting, we recommend editing `./utils/generation_utils.py`.
 33 | 
 34 | #### Model sharding and data parallel
 35 | 
 36 | To run larger models (e.g. [princeton-nlp/Llemma-34B-MathMix](https://huggingface.co/princeton-nlp/Llemma-34B-MathMix)), `generate.py` uses model parallel with `device_map="auto"`, so no modifications are required.
 37 | 
 38 | Evaluating a 7B model on TutorEval takes approximately 4 hours on a single A100 GPU, so we also provide a basic data-parallel implementation. The number of data parallel instances is specified with the `--ddp_worldsize` flag, and the specific instance to be run is specified with `--ddp_rank`.
 39 | 
 40 | `generate.sh` provides an easy interface for running several instances of `generate.py` on multiple GPUs. For example, if 4 GPUs are available, to evaluate Mistral-7B-Instruct-v0.2 on TutorEval-ClosedBook, you can use
 41 | ```bash
 42 | MOD=mistralai/Mistral-7B-Instruct-v0.2 CLOSEDBOOK=true DDP=4 CHATTEMPLATE=true bash tutoreval/generate.sh
 43 | ```
 44 | 
 45 | Note that `generate.sh` does not implement model parallel and data parallel simultaneously. Either the model will be sharded across all GPUs, or each GPU runs a separate instance of `generate.py`. If you have lots of GPUs available and you wish to use both methods at the same time, you can modify `generate.sh` to fit your needs by editing `CUDA_VISIBLE_DEVICES`.
 46 | 
 47 | ### ☑️ Grading outputs with GPT-4
 48 | `grade.py` grades the LM tutor outputs and updates `./openbook` and `./closedbook` with the GPT-4 grades.
 49 | The script `./tutoreval/grade.sh` also provides some utilities for grading.
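For reference, here is a minimal sketch of how a GPT-4 grading reply is turned into numerical scores. The regex is the one used in `tutoreval/grade.py` and the 0-100 rescaling mirrors `tutoreval/get_results.py`; the reply text below is a made-up example.

```python
import re

# Made-up GPT-4 reply in the format requested by the grading templates.
reply = (
    "PRESENTATION: Clear structure and a helpful example. GRADE: 2.5/3\n"
    "CORRECTNESS: One minor slip in the final step. GRADE: 2/3"
)

# Same pattern as tutoreval/grade.py: pull the x/3 grades out of the reply.
presentation, correctness = [float(g) for g in re.findall(r':\s?(\d.*)/3', reply)]

# tutoreval/get_results.py rescales the 0-3 grades to 0-100 before averaging over questions.
print(100 * presentation / 3, 100 * correctness / 3)
```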
50 | -------------------------------------------------------------------------------- /utils/openai_utils.py: -------------------------------------------------------------------------------- 1 | from typing import List, Literal, Optional 2 | from dataclasses import dataclass 3 | import openai 4 | import os 5 | import time 6 | import json 7 | 8 | from filelock import FileLock 9 | 10 | MODEL_CONFIGS = { 11 | "gpt-3.5-turbo-1106": { 12 | "prompt_cost_per_token": 0.001 / 1000, 13 | "response_cost_per_token": 0.002 / 1000, 14 | }, 15 | "gpt-3.5-turbo-0125": { 16 | "prompt_cost_per_token": 0.0005 / 1000, 17 | "response_cost_per_token": 0.0015 / 1000, 18 | }, 19 | "gpt-4-1106-preview": { 20 | "prompt_cost_per_token": 0.01 / 1000, 21 | "response_cost_per_token": 0.03 / 1000, 22 | }, 23 | "gpt-4-0125-preview": { 24 | "prompt_cost_per_token": 0.01 / 1000, 25 | "response_cost_per_token": 0.03 / 1000, 26 | }, 27 | } 28 | 29 | @dataclass(frozen=True) 30 | class OpenAI: 31 | model: Literal["gpt-3.5-turbo-16k", "gpt-3.5-turbo", "gpt-4", "gpt-4-1106-preview"] = "gpt-3.5-turbo-16k" 32 | 33 | temperature: float = 0.7 34 | 35 | system_prompt: Optional[str] = None 36 | 37 | max_retries = 1 38 | 39 | log_file_path = "openai_usage.jsonl" 40 | 41 | def complete(self, conversation: List[str]) -> str: 42 | config = MODEL_CONFIGS[self.model] 43 | openai.api_key = os.environ["OPENAI_API_KEY"] 44 | deployment_name = self.model 45 | retry_count = 0 46 | 47 | 48 | messages = [] 49 | if self.system_prompt is not None: 50 | messages.append({"role": "system", "content": self.system_prompt}) 51 | for i, prompt in enumerate(conversation): 52 | messages.append({"role": ("user" if i % 2 == 0 else "assistant"), "content": prompt}) 53 | 54 | while True: 55 | try: 56 | response = openai.chat.completions.create( 57 | model=deployment_name, 58 | messages=messages, 59 | temperature=self.temperature, 60 | ) 61 | 62 | break 63 | except Exception as error: 64 | if "Please retry after" in str(error): 65 | timeout = int(str(error).split("Please retry after ")[1].split(" second")[0]) + 2 66 | print(f"Wait {timeout}s before OpenAI API retry ({error})") 67 | time.sleep(timeout) 68 | elif retry_count < self.max_retries: 69 | print(f"OpenAI API retry for {retry_count} times ({error})") 70 | time.sleep(2) 71 | retry_count += 1 72 | else: 73 | print(f"OpenAI API failed for {retry_count} times ({error})") 74 | return None 75 | 76 | self.log_usage(config, response.usage) 77 | 78 | generation = response.choices[0].message.content 79 | return generation 80 | 81 | def log_usage(self, config, usage): 82 | usage_log = {"prompt_tokens": usage.prompt_tokens, "completion_tokens": usage.completion_tokens} 83 | usage_log["prompt_cost"] = config["prompt_cost_per_token"] * usage.prompt_tokens 84 | usage_log["completion_cost"] = config["response_cost_per_token"] * usage.completion_tokens 85 | usage_log["cost"] = usage_log["prompt_cost"] + usage_log["completion_cost"] 86 | usage_log["model"] = self.model 87 | usage_log["user"] = os.getlogin() 88 | 89 | with FileLock(self.log_file_path + ".lock"): 90 | with open(self.log_file_path, "a") as f: 91 | f.write(json.dumps(usage_log) + "\n") -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | 
dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. 
For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | -------------------------------------------------------------------------------- /tokenization/tokenize_tutorchat.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from transformers import AutoTokenizer 4 | from datasets import Dataset, DatasetDict, load_dataset 5 | import random 6 | import argparse 7 | 8 | 9 | 10 | def clean_and_assign(name, all_text): 11 | """takes a simulated conversation, applies basic cleaning, and assigns student/teacher roles to help split the conversation into a dialogue""" 12 | # truncate between first and last occurrence of ### 13 | first = all_text.find("###") 14 | if first <= len(all_text)//2: 15 | all_text = all_text[first+3:] 16 | last = all_text.rfind("###") 17 | if last >= len(all_text)//2: 18 | all_text = all_text[:last] 19 | all_text = all_text.replace("###", "").strip("\n ") 20 | 21 | # assign roles 22 | if "generateexam" in name: 23 | key0, key1 = "QUESTION", "ANSWER" 24 | options = [ 25 | ("\nquestion: ", "\nanswer: "), 26 | ("\nuser: ", "\nassistant: "), 27 | ("\nassistant: ", "\nuser: ") 28 | ] 29 | turn0, turn1 = options[rng.sample([0,1,2],1)[0]] 30 | elif "studentstart" in name: 31 | key0, key1 = "STUDENT", "TEACHER" 32 | turn0, turn1 = "\nuser: ", "\nassistant: " 33 | elif "teacherstart" in name: 34 | key0, key1 = "TEACHER", "STUDENT" 35 | turn0, turn1 = "\nassistant: ", "\nuser: " 36 | 37 | # ignore badly formatted texts 38 | if key0 not in all_text: 39 | return 40 | return key0, key1, turn0, turn1, all_text 41 | 42 | 43 | def tokenize(dialogue, tokenizer, args): 44 | input_ids = [] 45 | labels = [] 46 | processed_conversation = "" 47 | if dialogue["mode"] == "openbook": 48 | for m, turn in enumerate(dialogue["conversation"]): 49 | if m == 0: 50 | turn_text = turn + f"{tokenizer.eos_token}\n{tokenizer.bos_token}" 51 | tokenized_turn = tokenizer.encode(turn_text, add_special_tokens=False) 52 | labels += [-100]*len(tokenized_turn) 53 | elif m % 2 == 0: 54 | turn_text = "\nassistant: " + turn + f"{tokenizer.eos_token}" 55 | tokenized_turn = tokenizer.encode(turn_text, add_special_tokens=False) 56 | labels += tokenized_turn 57 | elif m % 2 == 1: 58 | turn_text = "\nuser: " + turn + f"{tokenizer.eos_token}" 59 | tokenized_turn = tokenizer.encode(turn_text, add_special_tokens=False) 60 | labels += [-100]*len(tokenized_turn) 61 | input_ids += tokenized_turn 62 | processed_conversation += turn_text 63 | 64 | elif dialogue["mode"] == "closedbook": 65 | for m, turn in enumerate(dialogue["conversation"]): 66 | if m % 2 == 0: 67 | turn_text = "\nassistant: " + turn + f"{tokenizer.eos_token}" 68 | if m == 0: 69 | turn_text = f"{tokenizer.bos_token}"+turn_text 70 | tokenized_turn = tokenizer.encode(turn_text, add_special_tokens=False) 71 | labels += tokenized_turn 72 | elif m % 2 == 1: 73 | turn_text = "\nuser: " + turn + f"{tokenizer.eos_token}" 74 | tokenized_turn = tokenizer.encode(turn_text, add_special_tokens=False) 75 | labels += [-100]*len(tokenized_turn) 76 | input_ids += tokenized_turn 77 | processed_conversation += turn_text 78 | 79 | elif dialogue["mode"] == "singleturn": 80 | name = dialogue["name"] 81 | # get chapter text and make labels 82 | if "studentstart" in name: 83 | turn_text = dialogue["conversation"][0] + f"{tokenizer.eos_token}\n{tokenizer.bos_token}" 84 | tokenized_turn = tokenizer.encode(turn_text, add_special_tokens=False) 85 | 
input_ids += tokenized_turn 86 | labels += [-100]*len(tokenized_turn) 87 | processed_conversation += turn_text 88 | else: 89 | processed_conversation += f"{tokenizer.bos_token}" 90 | 91 | all_text = dialogue["conversation"][-1] 92 | key0, key1, turn0, turn1, all_text = clean_and_assign(name, all_text) 93 | 94 | # split by keys 95 | qa_pairs = all_text.split(key0) 96 | qa_lists = [s.split(key1) for s in qa_pairs] 97 | qa_lists = [s for s in qa_lists if len(s) == 2] 98 | qa_flat = [t.strip(": \n") for s in qa_lists for t in s] 99 | qa_flat = [t for t in qa_flat if t != ""] 100 | 101 | # add turns and make roles 102 | for m, turn in enumerate(qa_flat): 103 | if m % 2 == 0: 104 | turn_text = turn0 + turn + f"{tokenizer.eos_token}" 105 | else: 106 | turn_text = turn1 + turn + f"{tokenizer.eos_token}" 107 | processed_conversation+= turn_text 108 | tokenized_turn = tokenizer.encode(turn_text, add_special_tokens=False) 109 | input_ids += tokenized_turn 110 | if "studentstart" in name and m % 2 == 0: 111 | labels += [-100]*len(tokenized_turn) 112 | else: 113 | labels += tokenized_turn 114 | 115 | dialogue["input_ids"] = input_ids 116 | dialogue["attention_mask"] = [1]*len(input_ids) 117 | dialogue["labels"] = labels 118 | dialogue["processed_conversation"] = processed_conversation 119 | return dialogue 120 | 121 | 122 | 123 | 124 | if __name__ == "__main__": 125 | parser = argparse.ArgumentParser() 126 | parser.add_argument("--tokenizer", type=str, default="meta-llama/Llama-2-7b-hf", help="Choose the HF tokenizer") 127 | parser.add_argument("--stem_only", action="store_true", help="Tokenize only STEM domains") 128 | parser.add_argument("--save_dir", type=str, default="data/tokenized_tutorchat_llama", help="Directory for saving the HF dataset") 129 | args = parser.parse_args() 130 | 131 | if args.stem_only: 132 | domains = ["bio", "chem", "eng", "geo", "math", "med", "phys", "stats"] 133 | else: 134 | domains = [] 135 | 136 | tokenizer = AutoTokenizer.from_pretrained(args.tokenizer) 137 | rng = random.Random(4) 138 | 139 | 140 | all_dialogues = load_dataset("princeton-nlp/TutorChat") 141 | dialogues = all_dialogues.filter(lambda x : x["textbook_folder"].split("/")[1] in domains, num_proc=8) if domains != [] else all_dialogues 142 | validation = dialogues["validation"] 143 | validation = validation.map(lambda x: tokenize(x, tokenizer, args), num_proc=4) 144 | train = dialogues["train"] 145 | train = train.map(lambda x: tokenize(x, tokenizer, args), num_proc=4) 146 | 147 | 148 | tokenized = DatasetDict({ 149 | "train": train, 150 | "validation": validation 151 | }) 152 | 153 | tokenized.save_to_disk(args.save_dir) 154 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # *Language Models as Science Tutors* 2 | 3 | This is the official repository for [*Language Models as Science Tutors*](https://arxiv.org/abs/2402.11111). 4 | 5 | 6 | ## TutorEval 7 | 8 | 9 | 10 |
11 | ![TutorEval results (main radar figure)](assets/main_radar_fig.png)
 12 | 
 13 | 
 14 | 
15 | 16 | ### 🎓 About 17 | TutorEval is a question-answering benchmark which evaluates how well a language model (the *LM tutor*) can help a user understand a chapter from a science textbook. TutorEval contains over 800 questions written by 17 expert researchers covering math, computer science, physics, life sciences, and environmental science. TutorEval questions relate to chapters from [TutorChat](https://huggingface.co/datasets/princeton-nlp/TutorChat) (downloaded from [libretexts.org](https://libretexts.org)) and require the model to answer free-form questions written from the point of view of a student. TutorEval questions are very diverse: they may ask for explanations of complicated content, for additional information going beyond the chapter, for verifications of exercise solutions, etc. Download the TutorEval data from HuggingFace at [princeton-nlp/TutorEval](https://huggingface.co/datasets/princeton-nlp/TutorEval). 18 | 19 | TutorEval uses an LM as an evaluator. Once the LM tutor has generated responses to TutorEval questions, the evaluator is prompted to compare the tutor's outputs with a set of ground-truth *key points*. These key points were written by the human experts who created TutorEval, and sketch the most important points that the tutor should cover when answering the student. 20 | 21 | ### 📖 OpenBook and 📕 ClosedBook 22 | 23 | TutorEval questions are very diverse and rely on the textbook chapter in different ways. Some questions explicitly refer to the chapter (*open-book*), and some questions are phrased in such a way that they can be understood without reading the textbook chapter (*closed-book*). This means that TutorEval contains two evaluations in one: 24 | 25 | - 📖 TutorEval with open-book evaluation: this is our main setting and uses all 834 TutorEval questions. The LM tutor is prompted with the entire textbook chapter and the question. This requires LMs to process contexts up to 6,000 words. 26 | - 📕 TutorEval-ClosedBook: this evaluation uses the 370 closed-book questions in TutorEval and prompts the LM tutor without the chapter. This makes it possible to evaluate short-context LMs. 27 | 28 | ### 🏆 Leaderboard 29 | We rank the models based on the full TutorEval score, even though TutorEval-ClosedBook rankings sometimes differ. 30 | |Model|TutorEval| ClosedBook| 31 | |-|-|-| 32 | | GPT-4 | 85.2 | 86.1 | 33 | | Llama-3-70B | 71.3 | 78.3 | 34 | | GPT-3.5-Turbo | 68.3 | 69.6 | 35 | | Phi-3-Medium-128K | 67.6 | 69.5 | 36 | | Mixtral-8x7B | 66.3 | 68.2 | 37 | | Phi-3-Mini-128K | 59.5 | 63.5 | 38 | | Llemma-34B-MathMix | 56.8 | 55.3 | 39 | | Mistral-7B-Instruct-V0.2 | 55.5 | 58.7 | 40 | | Llama-3-8B | 55.3 | 59.1 | 41 | | Mathstral-7B | 53.9 | 55.6 | 42 | | Llemma-7B-32K-MathMix | 50.0 | 45.6 | 43 | | Zephyr-7B-Beta | 45.7 | 49.4 | 44 | | Vicuna-13B-V1.5-16K | 32.9 | 36.8 | 45 | | Mistral-7B-Instruct-V0.1 | 30.5 | 35.5 | 46 | | Gemma-7B-IT | 24.0 | 39.5 | 47 | ### 🧑‍💻 Evaluating on TutorEval 48 | 49 | To evaluate your own model on TutorEval, please use the scripts provided in `./tutoreval`. 50 | 51 | - `./tutoreval/generate.py` produces the LM tutor outputs. 52 | - `./tutoreval/grade.py` uses GPT-4 as an evaluator to grade the LM tutor's outputs. 53 | - `./tutoreval/get_results.py` collects GPT-4's grades to give a breakdown of the final TutorEval performance. 54 | 55 | See `./tutoreval/README.md` for detailed instructions. 
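The evaluation scripts above all read the benchmark from the Hub; as a quick sanity check, the sketch below loads TutorEval the same way `tutoreval/generate.py` does. The column names (`question`, `closed_book`) are the ones that script relies on, and the counts are those quoted earlier in this section.

```python
from datasets import load_dataset

# Load TutorEval exactly as tutoreval/generate.py does.
tutoreval = load_dataset("princeton-nlp/TutorEval")["train"]
print(len(tutoreval))  # 834 questions in the full, open-book setting

# TutorEval-ClosedBook keeps only the questions flagged as closed_book.
closedbook = tutoreval.filter(lambda x: x["closed_book"])
print(len(closedbook))  # 370 closed-book questions

# Each row pairs a student question with its source chapter.
print(tutoreval[0]["question"][:300])
```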
56 | 
 57 | The file `./tutoreval/human_gpt_grades.csv` contains the human grades alongside the GPT-4-1106 grades attributed to four models for each of the TutorEval questions. The human grades can be used to calibrate other LLM judges. Human-LLM correlation can be measured using this dataset as in Appendix C.2, Table 9 in the paper. Note that the TutorEval questions in `human_gpt_grades.csv` may differ slightly from the official set of TutorEval questions as some grammatical typos were corrected after human gradings were completed.
 58 | 
 59 | ## TutorChat
 60 | TutorChat is the first dialogue-tuning dataset for science. TutorChat consists of 80,000 synthetic teacher-student dialogues created using GPT-3.5 and GPT-4. Each conversation is grounded in a textbook chapter downloaded from [libretexts.org](https://libretexts.org) and can take various formats:
 61 | - open-book teacher-student dialogues, where the student asks questions about a textbook chapter and the teacher gives helpful answers. These discussions are led by the student.
 62 | - closed-book dialogues, where the teacher conducts a class based on the textbook chapter.
 63 | - textbook exams, which are question/answer pairs based on the textbook chapter.
 64 | 
 65 | We provide TutorChat dialogues for all chapters contained in the TextbookChapters dataset below, which includes humanities and social sciences. 40% of TutorChat dialogues concern STEM subjects.
 66 | 
 67 | Download the TutorChat data from HuggingFace at [princeton-nlp/TutorChat](https://huggingface.co/datasets/princeton-nlp/TutorChat).
 68 | 
 69 | ### 📚 Textbook chapters
 70 | Download the processed textbook chapters from HuggingFace at [princeton-nlp/TextbookChapters](https://huggingface.co/datasets/princeton-nlp/TextbookChapters). This dataset was obtained by scraping [libretexts.org](https://libretexts.org) and processing the cleaned HTML files with the HTML-to-LaTeX parser from [Openwebmath](https://github.com/keirp/OpenWebMath).
 71 | 
 72 | ### ⚙️ TutorChat processing
 73 | `./tokenization/tokenize_tutorchat.py` tokenizes TutorChat and creates training labels according to the recipe used to train `Llemma-7B-32K-MathMix`. Use the flag `--stem_only` to tokenize only the STEM split of TutorChat.
 74 | 
 75 | ### 🔢 MathMix
 76 | MathMix is a fine-tuning dataset composed of the STEM split of TutorChat and a processed version of [MetaMath](https://huggingface.co/datasets/meta-math/MetaMathQA). In `./tokenization`, we provide some scripts to re-create and tokenize MathMix.
 77 | 
 78 | `./tokenization/tokenize_metamath.py` tokenizes MetaMath by randomly concatenating question/answer pairs to form longer samples. Use the flag `--num_concat` to set the number of samples to concatenate. MathMix concatenates 10 samples at a time.
 79 | 
 80 | `./tokenization/mathmix_combine.py` concatenates and shuffles the tokenized TutorChat and MetaMath datasets to create MathMix. Use the flags `--tutorchat` and `--metamath` to set the paths to your tokenized datasets created with `./tokenization/tokenize_tutorchat.py` and `./tokenization/tokenize_metamath.py`.
 81 | 
 82 | ## Models
 83 | Download our models from HuggingFace at [princeton-nlp/Llemma-7B-32K-MathMix](https://huggingface.co/princeton-nlp/Llemma-7B-32K-MathMix) and [princeton-nlp/Llemma-34B-MathMix](https://huggingface.co/princeton-nlp/Llemma-34B-MathMix).
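For convenience, here is a minimal sketch of pulling the released resources mentioned above from the Hugging Face Hub; the repository names are the ones linked in this README, and the exact splits and columns are best inspected after loading.

```python
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

# Datasets released with the paper.
tutorchat = load_dataset("princeton-nlp/TutorChat")
chapters = load_dataset("princeton-nlp/TextbookChapters")

# Tutor model fine-tuned on MathMix.
tokenizer = AutoTokenizer.from_pretrained("princeton-nlp/Llemma-7B-32K-MathMix")
model = AutoModelForCausalLM.from_pretrained("princeton-nlp/Llemma-7B-32K-MathMix")
```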
84 | 85 | ## Citation 86 | ```bibtex 87 | @misc{chevalier2024language, 88 | title={Language Models as Science Tutors}, 89 | author={Alexis Chevalier and Jiayi Geng and Alexander Wettig and Howard Chen and Sebastian Mizera and Toni Annala and Max Jameson Aragon and Arturo Rodríguez Fanlo and Simon Frieder and Simon Machado and Akshara Prabhakar and Ellie Thieu and Jiachen T. Wang and Zirui Wang and Xindi Wu and Mengzhou Xia and Wenhan Jia and Jiatong Yu and Jun-Jie Zhu and Zhiyong Jason Ren and Sanjeev Arora and Danqi Chen}, 90 | year={2024}, 91 | eprint={2402.11111}, 92 | archivePrefix={arXiv}, 93 | primaryClass={cs.CL} 94 | } 95 | ``` 96 | 97 | -------------------------------------------------------------------------------- /tutoreval/generate.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | import argparse 3 | import os 4 | import json 5 | from utils.openai_utils import OpenAI 6 | # from utils.togetherai_utils import TogetherBaseEngine 7 | from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, BitsAndBytesConfig 8 | from utils.generation_utils import generation_utils 9 | from datasets import load_dataset, load_from_disk 10 | import torch 11 | 12 | 13 | 14 | def generate_answers(data, template, model, tokenizer=None): 15 | outputs = [] 16 | for sample in tqdm(data): 17 | chapters = sample["chapter"] 18 | questions = sample["question"] 19 | sample["template"] = [template]*len(questions) 20 | query = [template.replace("{{QUESTION}}", q).replace("{{CHAPTER}}", c) for (q,c) in zip(questions, chapters)] 21 | 22 | if "openai/gpt" in args.model: 23 | assert args.batch_size == 1 24 | response = [model.complete(query)] 25 | elif args.togetherapi: 26 | assert args.batch_size == 1 27 | prompt="user: "+ query[0] + "\nassistant: " 28 | response = model.safe_completion(prompt, check_prompt=False)["content"] 29 | else: 30 | query, stop = generation_utils(query, args, tokenizer) 31 | inputs = tokenizer(query, add_special_tokens=False, return_tensors="pt", padding=True) 32 | inputs = inputs.to(model.device) 33 | with torch.inference_mode(): 34 | out = model.generate(inputs=inputs["input_ids"], attention_mask = inputs["attention_mask"], pad_token_id=tokenizer.eos_token_id, stopping_criteria=stop, max_new_tokens=800) 35 | out = out[: , inputs["input_ids"].shape[1]:] 36 | response = tokenizer.batch_decode(out, skip_special_tokens=True) 37 | sample["output"] = response 38 | sample["model"] =[args.model]*len(questions) 39 | sample["closedbook_eval"] = [args.closedbook]*len(questions) 40 | sample["hf_chat_template"] = [args.hf_chat_template]*len(questions) 41 | sample["bnb4bit"] = [args.bnb4bit]*len(questions) 42 | outputs+= [ {k: sample[k][i] for k in sample.keys()} for i in range(len(sample["output"]))] 43 | return outputs 44 | 45 | if __name__ == "__main__": 46 | parser = argparse.ArgumentParser() 47 | 48 | parser.add_argument("--model", default="princeton-nlp/Llemma-7B-32K-MathMix", type=str, help="Generator model") 49 | parser.add_argument("--output_dir", default="tutoreval/generations", type=str, help="output simulations") 50 | parser.add_argument("--closedbook", action="store_true", help="output simulations") 51 | parser.add_argument("--hf_chat_template", action="store_true", help="If True, uses the chat template from tokenizer. 
If False, uses defaut user/assistant formatting and allows custom implementations") 52 | parser.add_argument("--togetherapi", action="store_true", help="use the TogetherAI API") 53 | parser.add_argument("--rope_theta", default=-1, type=int, help="Set a higher RoPE theta for context window extension. If set to -1, use the pre-trained config value.") 54 | parser.add_argument("--batch_size", default=1, type=int, help="Batch size used during generation. Only for locally run models") 55 | parser.add_argument("--ddp_worldsize", default=1, type=int, help="For data parallel. Sets the number of parallel instances") 56 | parser.add_argument("--ddp_rank", default=0, type=int, help="For data parallel. Set this to the data fragment to use for generation. Value should be in range(args.ddp_worldsize)") 57 | parser.add_argument("--bnb4bit", action="store_true", help="Use 4 bit quantization") 58 | 59 | args = parser.parse_args() 60 | 61 | 62 | # load data 63 | try: 64 | data = load_dataset("princeton-nlp/TutorEval")["train"] 65 | except: 66 | try: 67 | data = load_from_disk("tutoreval/tutoreval_dataset/train") 68 | except: 69 | print("Please download the dataset from princeton-nlp/TutorEval and save it under tutoreval/tutoreval_dataset") 70 | exit() 71 | 72 | if args.closedbook: 73 | data = data.filter(lambda x: x["closed_book"]) 74 | with open("tutoreval/templates/closedbook_generation_template.txt", "r") as f: 75 | template = f.read() 76 | else: 77 | with open("tutoreval/templates/generation_template.txt", "r") as f: 78 | template = f.read() 79 | 80 | if args.ddp_worldsize > 1: 81 | assert args.ddp_rank in range(args.ddp_worldsize) 82 | data = data.select(list(range(args.ddp_rank, len(data), args.ddp_worldsize))) 83 | data = torch.utils.data.DataLoader(data, batch_size = args.batch_size, shuffle=False) 84 | 85 | 86 | if "openai/gpt" in args.model: # openai api 87 | # examples: openai/gpt-3.5-turbo-16k openai/gpt-4-1106-preview 88 | engine=args.model.split("/")[1] 89 | print(engine) 90 | args.system_prompt = "You are a helpful science teacher interacting with a keen student. You try your utmost to answer the student's questions and to encourage the student to learn further. You are also very careful to provide clear, accurate, and factual answers, as you must not mislead the student in any way" 91 | model = OpenAI(model=engine, system_prompt=args.system_prompt) 92 | print(model.complete(["Hello! Introduce yourself please!"])) 93 | tokenizer = None 94 | args.batch_size = 1 95 | elif args.togetherapi: 96 | model = TogetherBaseEngine(args.model) 97 | print(model.complete(["user: Hello! 
Introduce yourself please!\nassistant: "])) 98 | tokenizer = None 99 | args.batch_size = 1 100 | 101 | else: 102 | config = AutoConfig.from_pretrained(args.model) 103 | config.max_new_tokens = 800 104 | config.dtype=torch.bfloat16 105 | config.do_sample = False 106 | config.use_cache=True 107 | if args.rope_theta != -1: 108 | config.rope_theta=args.rope_theta 109 | print(f"Setting RoPE theta = {args.rope_theta}") 110 | tokenizer = AutoTokenizer.from_pretrained(args.model) 111 | tokenizer.pad_token = tokenizer.eos_token 112 | 113 | if args.bnb4bit: 114 | quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16) 115 | model = AutoModelForCausalLM.from_pretrained( 116 | args.model, 117 | config=config, 118 | quantization_config=quantization_config, 119 | torch_dtype=torch.bfloat16, 120 | device_map="auto", 121 | attn_implementation="flash_attention_2" 122 | ) 123 | else: 124 | model = AutoModelForCausalLM.from_pretrained( 125 | args.model, 126 | config=config, 127 | torch_dtype=torch.bfloat16, 128 | device_map="auto", 129 | attn_implementation="flash_attention_2" 130 | ) 131 | 132 | model.eval() 133 | 134 | outputs = generate_answers(data, template, model, tokenizer) 135 | 136 | # postprocessing 137 | for out in outputs: 138 | for k in out.keys(): 139 | out[k] = out[k].item() if type(out[k]) == torch.Tensor else out[k] 140 | 141 | # save 142 | if args.closedbook: 143 | base_save = f"{args.output_dir}/closedbook" 144 | else: 145 | base_save = f"{args.output_dir}/openbook" 146 | 147 | if args.ddp_worldsize > 1: 148 | save_dir = f"{base_save}/{args.model}_{args.ddp_rank}_of_{args.ddp_worldsize}.json" 149 | else: 150 | save_dir = f"{base_save}/{args.model}.json" 151 | 152 | os.makedirs(os.path.dirname(save_dir), exist_ok=True) 153 | with open(save_dir, "w+") as f: 154 | f.write(json.dumps(outputs, indent=4)) 155 | --------------------------------------------------------------------------------