├── tutoreval ├── templates │ ├── closedbook_generation_template.txt │ ├── generation_template.txt │ ├── closedbook_grading_template.txt │ └── grading_template.txt ├── merge_generations.py ├── grade.sh ├── get_results.py ├── generate.sh ├── grade.py ├── README.md └── generate.py ├── assets ├── main_radar_fig.pdf └── main_radar_fig.png ├── tokenization ├── mathmix_combine.py ├── tokenize_metamath.py └── tokenize_tutorchat.py ├── utils ├── generation_utils.py └── openai_utils.py ├── .gitignore └── README.md /tutoreval/templates/closedbook_generation_template.txt: -------------------------------------------------------------------------------- 1 | {{QUESTION}} -------------------------------------------------------------------------------- /assets/main_radar_fig.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/princeton-nlp/LM-Science-Tutor/HEAD/assets/main_radar_fig.pdf -------------------------------------------------------------------------------- /assets/main_radar_fig.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/princeton-nlp/LM-Science-Tutor/HEAD/assets/main_radar_fig.png -------------------------------------------------------------------------------- /tutoreval/templates/generation_template.txt: -------------------------------------------------------------------------------- 1 | Here is a passage from a textbook I am trying to understand: 2 | 3 | """ 4 | {{CHAPTER}} 5 | """ 6 | 7 | {{QUESTION}} 8 | -------------------------------------------------------------------------------- /tokenization/mathmix_combine.py: -------------------------------------------------------------------------------- 1 | from datasets import Dataset, load_from_disk, concatenate_datasets 2 | import argparse 3 | 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--tutorchat", default="data/tokenized_tutorchat_stem_llama", type=str) 7 | parser.add_argument("--metamath", default="data/tokenized_metamath_concat10_llama", type=str) 8 | parser.add_argument("--save_dir", default="data/mathmix_llama") 9 | args = parser.parse_args() 10 | 11 | tutorchat = load_from_disk(args.tutorchat)["train"] 12 | metamath = load_from_disk(args.metamath) 13 | 14 | to_remove = [k for k in tutorchat.features.keys() if k not in ["input_ids", "attention_mask", "labels", "processed_conversation"]] 15 | tutorchat = tutorchat.remove_columns(to_remove) 16 | tutorchat.rename_column("processed_conversation", "text") 17 | 18 | to_remove = [k for k in metamath.features.keys() if k not in ["input_ids", "attention_mask", "labels", "text"]] 19 | metamath = metamath.remove_columns(to_remove) 20 | 21 | mathmix = concatenate_datasets([tutorchat, metamath]) 22 | mathmix = mathmix.shuffle(seed=42) 23 | mathmix.save_to_disk(args.save_dir) -------------------------------------------------------------------------------- /tutoreval/merge_generations.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | 5 | if __name__ == "__main__": 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument("--model", default="princeton-nlp/Llemma-7B-32K-MathMix", type=str, help="Generator model") 8 | parser.add_argument("--dir", default="tutoreval/generations", type=str, help="output simulations") 9 | parser.add_argument("--ddp_worldsize", default=1, type=int, help="For data parallel. 
Sets the number of parallel instances") 10 | parser.add_argument("--closedbook", action="store_true", help="output simulations") 11 | 12 | args = parser.parse_args() 13 | 14 | if args.ddp_worldsize == 1: 15 | print("Generations merged.") 16 | exit() 17 | 18 | if args.closedbook: 19 | files = [f"{args.dir}/closedbook/{args.model}_{rank}_of_{args.ddp_worldsize}.json" for rank in range(args.ddp_worldsize)] 20 | save_file = f"{args.dir}/closedbook/{args.model}.json" 21 | else: 22 | files = [f"{args.dir}/openbook/{args.model}_{rank}_of_{args.ddp_worldsize}.json" for rank in range(args.ddp_worldsize)] 23 | save_file = f"{args.dir}/openbook/{args.model}.json" 24 | 25 | all_generations = [] 26 | for file in files: 27 | with open(file) as f: 28 | all_generations += json.load(f) 29 | 30 | 31 | 32 | with open(save_file, "w") as f: 33 | json.dump(all_generations, f, indent=4) 34 | print("Generations merged.") -------------------------------------------------------------------------------- /utils/generation_utils.py: -------------------------------------------------------------------------------- 1 | from transformers import StoppingCriteria 2 | import torch 3 | 4 | class EosListStoppingCriteria(StoppingCriteria): 5 | def __init__(self, eos_sequence = [835, 2799, 4080, 29901]): 6 | self.eos_sequence = eos_sequence 7 | 8 | def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool: 9 | last_ids = input_ids[:,-len(self.eos_sequence):].tolist() 10 | return self.eos_sequence in last_ids 11 | 12 | 13 | 14 | 15 | def generation_utils(query, args, tokenizer): 16 | """Format the queries for dialogue generation and set a stopping criterion. Edit this function to run other models.""" 17 | 18 | if args.hf_chat_template: 19 | processed = [tokenizer.apply_chat_template([{"role": "user", "content": q.strip("\n")}], tokenize=False, add_generation_prompt=True) for q in query] 20 | else: 21 | # default formatting 22 | processed = [f"{tokenizer.bos_token}\nuser: {q}{tokenizer.eos_token}\nassistant:" for q in query] 23 | 24 | # custom formatting 25 | if "microsoft/phi" in args.model.lower(): 26 | processed = [f"user: {q}\nassistant:" for q in query] 27 | 28 | 29 | # default stopping 30 | stop = [EosListStoppingCriteria(tokenizer.encode(tokenizer.eos_token))] 31 | 32 | # custom stopping 33 | if "wizardmath-7b-v1.0" in args.model.lower(): 34 | stop = [EosListStoppingCriteria([13, 829, 29879, 29958])] 35 | elif "microsoft/phi" in args.model.lower(): 36 | stop = [EosListStoppingCriteria(tokenizer.encode("\nuser:"))] 37 | 38 | return processed, stop -------------------------------------------------------------------------------- /tutoreval/grade.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | export OPENAI_API_KEY="" # your api key goes here 5 | 6 | model=${MOD:-"princeton-nlp/Llemma-7B-32K-MathMix"} 7 | closedbook=${CLOSEDBOOK:-false} 8 | grader=${GRADER:-"gpt-4-1106-preview"} 9 | dir=${DIR:-"tutoreval/generations"} 10 | ddp_worldsize=${DDP:-1} #data parallel uses the splits created during generation. Split the generations files if you want to grade faster. 
11 | 12 | 13 | header="python -m tutoreval.grade" 14 | args=( 15 | --model ${model} 16 | --grader ${grader} 17 | --dir ${dir} 18 | --ddp_worldsize ${ddp_worldsize} 19 | $@ 20 | ) 21 | 22 | if [ $closedbook == true ]; then 23 | args+=(--closedbook) 24 | fi 25 | 26 | if [ ${ddp_worldsize} == 1 ]; then 27 | echo "${header} "${args[@]}"" 28 | ${header} "${args[@]}" 29 | else 30 | for ((rank=0; rank<=ddp_worldsize-1; rank++)) ; do 31 | ranked_args=(${args[@]} --ddp_rank $rank) 32 | echo "${header} "${ranked_args[@]}"" 33 | ${header} "${ranked_args[@]}" & 34 | done 35 | wait 36 | # merge graded files 37 | header="python -m tutoreval.merge_generations" 38 | merge_args=( 39 | --model ${model} 40 | --output_dir ${output_dir} 41 | --ddp_worldsize ${ddp_worldsize} 42 | ) 43 | 44 | if [ $closedbook == true ]; then 45 | merge_args+=(--closedbook) 46 | fi 47 | 48 | echo "${header} "${merge_args[@]}"" 49 | ${header} "${merge_args[@]}" 50 | fi 51 | 52 | 53 | header="python -m tutoreval.get_results" 54 | args=( 55 | --output_dir ${dir} 56 | --results_dir tutoreval/results 57 | --model ${model} 58 | $@ 59 | ) 60 | 61 | if [ ${closedbook} == true ]; then 62 | args+=(--closedbook) 63 | fi 64 | 65 | ${header} "${args[@]}" 66 | -------------------------------------------------------------------------------- /tutoreval/templates/closedbook_grading_template.txt: -------------------------------------------------------------------------------- 1 | You task is to evaluate the teaching abilities of a new AI system which is interacting with a student about a science topic. The student is trying to understand a science topic and has asked the AI a question, and I would like you to rate how well the AI system addressed the student's question. 2 | 3 | You should give scores from 0 to 3 for PRESENTATION and CORRECTNESS. Half points are allowed. Please refer to the following descriptions: 4 | 5 | PRESENTATION: the AI provides an engaging response which will make the student want to learn more. Examples of good presentation skills include: giving the response a clear and helpful structure, picking up on positive aspects of the student's contributions, using examples to clarify complicated ideas, explaining complicated arguments in detail, adding follow-up and broadening remarks, etc. 6 | 7 | CORRECTNESS: the AI correctly understands the question and the answer is true and accurate. The answer does not contain any false or misleading statements. The AI does not include any irrelevant information and does not omit any essential reasoning steps. Pay particular attention to reasoning and calculation mistakes. 8 | 9 | Here is the student's question: 10 | 11 | """ 12 | {question} 13 | """ 14 | 15 | To help you in your evaluation, we've compiled some ground-truth key points which a good AI system should cover in its answer. You MUST check that the AI's answer agrees with these key points. These key points have been checked by experts and are 100% correct. These key points are particularly useful for spotting CORRECTNESS errors. 16 | 17 | """ 18 | Key points to cover: 19 | {key_points} 20 | """ 21 | 22 | Here is the AI's answer: 23 | 24 | """ 25 | {output} 26 | """ 27 | 28 | Please present your scores as follows: 29 | PRESENTATION: [explanation]. GRADE: x/3 30 | CORRECTNESS: [explanation]. 
GRADE: x/3 31 | -------------------------------------------------------------------------------- /tutoreval/get_results.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import pandas as pd 5 | from rich import print 6 | 7 | def get_all_results(df): 8 | pass 9 | 10 | 11 | 12 | if __name__ == "__main__": 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("--output_dir", default="tutoreval/generations") 15 | parser.add_argument("--results_dir", default="tutoreval/results") 16 | parser.add_argument("--closedbook", action="store_true") 17 | parser.add_argument("--model", default="princeton-nlp/Llemma-7B-32K-MathMix") 18 | args = parser.parse_args() 19 | 20 | if args.closedbook: 21 | file = f"{args.output_dir}/closedbook/{args.model}.json" 22 | results_file = f"{args.results_dir}/closedbook/{args.model}.json" 23 | else: 24 | file = f"{args.output_dir}/openbook/{args.model}.json" 25 | results_file = f"{args.results_dir}/openbook/{args.model}.json" 26 | 27 | with open(file) as f: 28 | results = json.load(f) 29 | 30 | df = pd.DataFrame(results) 31 | 32 | #scale 33 | df["presentation"] = 100*df["presentation"]/3 34 | df["correctness"] = 100*df["correctness"]/3 35 | 36 | 37 | results = { 38 | "total": df["correctness"].mean(), 39 | "presentation_score": df["presentation"].mean(), 40 | } 41 | # scientific domain 42 | results = results | df.groupby(["domain"])["correctness"].mean().to_dict() 43 | 44 | # difficulty 45 | results = results | df.groupby(["difficulty"])["correctness"].mean().to_dict() 46 | 47 | # misleading 48 | results["misleading_questions"] = df[df["misleading_question"]]["correctness"].mean() 49 | 50 | # answer_in_chapter 51 | results["answer_in_chapter"] = df[df["answer_in_chapter"]]["correctness"].mean() 52 | 53 | 54 | print(results) 55 | os.makedirs(os.path.dirname(results_file), exist_ok=True) 56 | with open(results_file, "w") as f: 57 | json.dump(results, f, indent=4) 58 | -------------------------------------------------------------------------------- /tutoreval/templates/grading_template.txt: -------------------------------------------------------------------------------- 1 | Your task is to evaluate the teaching abilities of a new AI system which is interacting with a student about a science topic. The student and AI system are working together on a textbook chapter, and I would like you to rate how well the AI system addressed the student's question. 2 | 3 | You should give scores from 0 to 3 for PRESENTATION and CORRECTNESS. Half points are allowed. Please refer to the following descriptions: 4 | 5 | PRESENTATION: the AI provides an engaging response which will make the student want to learn more. Examples of good presentation skills include: giving the response a clear and helpful structure, picking up on positive aspects of the student's contributions, using examples to clarify complicated ideas, explaining complicated arguments in detail, adding follow-up and broadening remarks, etc. 6 | 7 | CORRECTNESS: the AI correctly understands the question and the answer is true and accurate. The answer does not contain any false or misleading statements. The AI does not include any irrelevant information and does not omit any essential reasoning steps. The AI also correctly relates the question to the chapter's content. Pay particular attention to reasoning and calculation mistakes. 
8 | 9 | Here is the textbook chapter used for this interaction: 10 | 11 | """ 12 | {chapter} 13 | """ 14 | 15 | Here is the student's question: 16 | 17 | """ 18 | {question} 19 | """ 20 | 21 | To help you in your evaluation, we've compiled some ground-truth key points which a good AI system should cover in its answer. You MUST check that the AI's answer agrees with these key points. These key points have been checked by experts and are 100% correct. These key points are particularly useful for spotting CORRECTNESS errors. 22 | 23 | """ 24 | Key points to cover: 25 | {key_points} 26 | """ 27 | 28 | Here is the AI's answer: 29 | 30 | """ 31 | {output} 32 | """ 33 | 34 | Please present your scores as follows: 35 | PRESENTATION: [explanation]. GRADE: x/3 36 | CORRECTNESS: [explanation]. GRADE: x/3 37 | -------------------------------------------------------------------------------- /tutoreval/generate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export OPENAI_API_KEY="" #your api keys go here 4 | 5 | model=${MOD:-"princeton-nlp/Llemma-7B-32K-MathMix"} #model to evaluate 6 | hf_chat_template=${CHATTEMPLATE:-true} 7 | output_dir=${OUT:-"tutoreval/generations"} #directory to save outputs 8 | batch_size=${BATCH:-1} #batch size during generation 9 | ddp_worldsize=${DDP:-1} #data parallel 10 | closedbook=${CLOSEDBOOK:-false} #TutorEval-ClosedBook evaluation 11 | bnb4bit=${QUANT:-false} #4bit quantization 12 | 13 | 14 | 15 | 16 | ############## 17 | # generate 18 | header="python -m tutoreval.generate" 19 | args=( 20 | --model ${model} 21 | --output_dir ${output_dir} 22 | --batch_size ${batch_size} 23 | --ddp_worldsize ${ddp_worldsize} 24 | $@ 25 | ) 26 | 27 | if [ $closedbook == true ]; then 28 | args+=(--closedbook) 29 | fi 30 | 31 | if [ $hf_chat_template == true ]; then 32 | args+=(--hf_chat_template) 33 | fi 34 | 35 | if [ $bnb4bit == true ]; then 36 | args+=(--bnb4bit) 37 | fi 38 | 39 | if [ ${ddp_worldsize} == 1 ]; then 40 | echo "${header} "${args[@]}"" 41 | ${header} "${args[@]}" 42 | else 43 | for ((rank=0; rank<=ddp_worldsize-1; rank++)) ; do 44 | ranked_args=(${args[@]} --ddp_rank $rank) 45 | echo "${header} "${ranked_args[@]}"" 46 | export CUDA_VISIBLE_DEVICES=$rank ; ${header} "${ranked_args[@]}" & 47 | done 48 | wait 49 | fi 50 | 51 | # The current script handles data-parallel and model-sharding separately: setting ddp_worldsize=1 with multiple GPUs will shard the model using device_map="auto". 52 | # When ddp_worldsize is greater than 1, this script automatically assigns a single GPU to each data fragment. 
53 | # If you want to use both data-parallel and model sharding, edit CUDA_VISIBLE_DEVICES to fit your situation 54 | 55 | 56 | # merge files 57 | header="python -m tutoreval.merge_generations" 58 | merge_args=( 59 | --model ${model} 60 | --dir ${output_dir} 61 | --ddp_worldsize ${ddp_worldsize} 62 | ) 63 | 64 | if [ $closedbook == true ]; then 65 | merge_args+=(--closedbook) 66 | fi 67 | 68 | 69 | echo "${header} "${merge_args[@]}"" 70 | ${header} "${merge_args[@]}" -------------------------------------------------------------------------------- /tokenization/tokenize_metamath.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | import random 3 | from transformers import AutoTokenizer 4 | from datasets import Dataset, load_dataset 5 | import json 6 | import argparse 7 | 8 | 9 | 10 | def concat_conversations(dataset, num_concat, tokenizer): 11 | l = len(dataset["query"]) 12 | 13 | new_dataset = {k: [] for k in dataset.keys()} 14 | new_dataset["text"] = [] 15 | new_dataset["input_ids"] = [] 16 | new_dataset["attention_mask"] = [] 17 | new_dataset["labels"] = [] 18 | 19 | for k in tqdm(range(0, l, num_concat), desc="Concatenating and tokenizing"): 20 | for key in dataset.keys(): 21 | new_dataset[key].append([dataset[key][i] for i in range(k, k+num_concat)]) 22 | 23 | options = [ 24 | ("\nquestion: ", "\nanswer: "), 25 | ("\nQuestion: ", "\nAnswer: "), 26 | ("\nProblem: ", "\nSolution: "), 27 | ("\nproblem: ", "\nsolution: "), 28 | ("\nuser: ", "\nassistant: "), 29 | ("\nassistant: ", "\nuser: ") 30 | ] 31 | 32 | turn0, turn1 = rng.sample(options, 1)[0] 33 | conversation = tokenizer.bos_token 34 | for i in range(k, k + num_concat): 35 | conversation+= "".join([turn0, dataset["query"][i], f"{tokenizer.eos_token}", turn1, dataset["response"][i], f"{tokenizer.eos_token}"]) 36 | new_dataset["text"].append(conversation) 37 | new_dataset["input_ids"].append(tokenizer.encode(conversation, add_special_tokens=False)) 38 | new_dataset["attention_mask"].append([1]*len(new_dataset["input_ids"][-1])) 39 | new_dataset["labels"].append(new_dataset["input_ids"][-1]) 40 | return new_dataset 41 | 42 | if __name__ == "__main__": 43 | parser = argparse.ArgumentParser() 44 | parser.add_argument("--tokenizer", type=str, default="meta-llama/Llama-2-7b-hf", help="Choose the HF tokenizer") 45 | parser.add_argument("--num_concat", type=int, default=10, help="Number of MetaMath samples to concatenate") 46 | parser.add_argument("--save_dir", type=str, default="data/metamath_concat10_llama", help="Directory for saving the HF dataset") 47 | args = parser.parse_args() 48 | rng = random.Random(4) 49 | 50 | 51 | tokenizer = AutoTokenizer.from_pretrained(args.tokenizer) 52 | 53 | data = load_dataset("meta-math/MetaMathQA") 54 | data = data.shuffle(seed=42) 55 | data = data["train"].to_dict() 56 | 57 | tokenized = concat_conversations(data, args.num_concat, tokenizer) 58 | tokenized = Dataset.from_dict(tokenized) 59 | tokenized.save_to_disk(args.save_dir) 60 | -------------------------------------------------------------------------------- /tutoreval/grade.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | import argparse 3 | import json 4 | import re 5 | from utils.openai_utils import OpenAI 6 | 7 | 8 | 9 | def grade(grader_model, generations, args): 10 | for sample in tqdm(generations): 11 | prompt = args.template.format(**sample) 12 | grading_prompt=[prompt] 13 | try: 14 | sample['grading_out'] = 
grader_model.complete(grading_prompt) 15 | grades = [float(d) for d in re.findall(pattern=r':\s?(\d.*)/3', string=sample["grading_out"])] 16 | sample["presentation"] = grades[0] 17 | sample["correctness"] = grades[1] 18 | 19 | except: 20 | sample['grading_out'] = "ERROR" 21 | sample["presentation"] = 0 22 | sample["correctness"] = 0 23 | return generations 24 | 25 | 26 | if __name__ == '__main__': 27 | parser = argparse.ArgumentParser() 28 | parser.add_argument("--model", default="princeton-nlp/Llemma-7B-32K-MathMix", help="Model whose outputs are evaluated") 29 | parser.add_argument("--dir", default="tutoreval/generations", help="Main directory where model outputs are stored") 30 | parser.add_argument("--closedbook", action="store_true", help="Selects the closedbook folder in main directory") 31 | parser.add_argument("--grader", default="gpt-4-1106-preview", help="OpenAI model used for grading") 32 | parser.add_argument("--ddp_worldsize", default=1, type=int, help="For data parallel. Sets the number of parallel instances") 33 | parser.add_argument("--ddp_rank", default=0, type=int, help="For data parallel. Set this to the data fragment to use for generation. Value should be in range(args.ddp_worldsize)") 34 | args = parser.parse_args() 35 | 36 | 37 | if args.closedbook: 38 | with open("tutoreval/templates/closedbook_grading_template.txt") as f: 39 | args.template = f.read() 40 | if args.ddp_worldsize > 1: 41 | generations_file = f"{args.dir}/closedbook/{args.model}_{args.ddp_rank}_of_{args.ddp_worldsize}.json" 42 | else: 43 | generations_file = f"{args.dir}/closedbook/{args.model}.json" 44 | else: 45 | with open("tutoreval/templates/grading_template.txt") as f: 46 | args.template = f.read() 47 | if args.ddp_worldsize > 1: 48 | generations_file = f"{args.dir}/openbook/{args.model}_{args.ddp_rank}_of_{args.ddp_worldsize}.json" 49 | else: 50 | generations_file = f"{args.dir}/openbook/{args.model}.json" 51 | with open(generations_file) as f: 52 | generations = json.load(f) 53 | 54 | grader_model = OpenAI(model=args.grader) 55 | print(grader_model.complete(["Hello! Introduce yourself please!"])) 56 | 57 | print("Grading") 58 | graded = grade(grader_model, generations, args) 59 | 60 | with open(generations_file, 'w') as file: 61 | json.dump(graded, file, indent=4) -------------------------------------------------------------------------------- /tutoreval/README.md: -------------------------------------------------------------------------------- 1 | ## 🧑‍💻 Evaluating with TutorEval 2 | 3 | ### Requirements 4 | 5 | Please install the following packages: 6 | 7 | ```python 8 | pip install torch flash_attn transformers accelerate bitsandbytes datasets pandas openai rich 9 | ``` 10 | 11 | ### ✍️ Generating LM tutor outputs 12 | 13 | #### Basic usage 14 | `generate.py` constructs the LM tutor outputs for each question and saves them under `./openbook`, or `./closedbook` for TutorEval-ClosedBook. Use the HuggingFace model name or the path where the model is stored with the `--model` flag. 15 | 16 | For example, to evaluate Llemma-7B-32K-MathMix on TutorEval: 17 | ```python 18 | python -m tutoreval.generate --model princeton-nlp/Llemma-7B-32K-MathMix 19 | ``` 20 | 21 | Use the `--closedbook` flag for TutorEval-ClosedBook: 22 | ```python 23 | python -m tutoreval.generate --model princeton-nlp/Llemma-7B-32K-MathMix --closedbook 24 | ``` 25 | 26 | #### Chat templates 27 | By default, TutorEval formats the LM tutor's prompt as a `user/assistant` dialogue. 
Some HuggingFace models recommend using other chat templates. To use default tokenizer chat templates, use the `--hf_chat_template` flag. For example, to evaluate Mistral-7B-Instruct-v0.2:
 28 | ```python
 29 | python -m tutoreval.generate --model mistralai/Mistral-7B-Instruct-v0.2 --hf_chat_template
 30 | ```
 31 | 
 32 | To use custom dialogue formatting, we recommend editing `./utils/generation_utils.py`.
 33 | 
 34 | #### Model sharding and data parallel
 35 | 
 36 | To run larger models (e.g. [princeton-nlp/Llemma-34B-MathMix](https://huggingface.co/princeton-nlp/Llemma-34B-MathMix)), `generate.py` uses model parallel with `device_map="auto"`, so no modifications are required.
 37 | 
 38 | Evaluating a 7B model on TutorEval takes approximately 4 hours on a single A100 GPU, so we also provide a basic data-parallel implementation. The number of data parallel instances is specified with the `--ddp_worldsize` flag, and the specific instance to be run is specified with `--ddp_rank`.
 39 | 
 40 | `generate.sh` provides an easy interface for running several instances of `generate.py` on multiple GPUs. For example, if 4 GPUs are available, to evaluate Mistral-7B-Instruct-v0.2 on TutorEval-ClosedBook, you can use
 41 | ```bash
 42 | MOD=mistralai/Mistral-7B-Instruct-v0.2 CLOSEDBOOK=true DDP=4 CHATTEMPLATE=true bash tutoreval/generate.sh
 43 | ```
 44 | 
 45 | Note that `generate.sh` does not implement model parallel and data parallel simultaneously. Either the model will be sharded across all GPUs, or each GPU runs a separate instance of `generate.py`. If you have lots of GPUs available and you wish to use both methods at the same time, you can modify `generate.sh` to fit your needs by editing `CUDA_VISIBLE_DEVICES`.
 46 | 
 47 | ### ☑️ Grading outputs with GPT-4
 48 | `grade.py` grades the LM tutor outputs and updates `./openbook` and `./closedbook` with the GPT-4 grades.
 49 | The script `./tutoreval/grade.sh` also provides some utilities for grading.
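For reference, here is a minimal sketch of how a GPT-4 grading reply is turned into numerical scores. The regex is the one used in `tutoreval/grade.py` and the 0-100 rescaling mirrors `tutoreval/get_results.py`; the reply text below is a made-up example.

```python
import re

# Made-up GPT-4 reply in the format requested by the grading templates.
reply = (
    "PRESENTATION: Clear structure and a helpful example. GRADE: 2.5/3\n"
    "CORRECTNESS: One minor slip in the final step. GRADE: 2/3"
)

# Same pattern as tutoreval/grade.py: pull the x/3 grades out of the reply.
presentation, correctness = [float(g) for g in re.findall(r':\s?(\d.*)/3', reply)]

# tutoreval/get_results.py rescales the 0-3 grades to 0-100 before averaging over questions.
print(100 * presentation / 3, 100 * correctness / 3)
```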
50 | -------------------------------------------------------------------------------- /utils/openai_utils.py: -------------------------------------------------------------------------------- 1 | from typing import List, Literal, Optional 2 | from dataclasses import dataclass 3 | import openai 4 | import os 5 | import time 6 | import json 7 | 8 | from filelock import FileLock 9 | 10 | MODEL_CONFIGS = { 11 | "gpt-3.5-turbo-1106": { 12 | "prompt_cost_per_token": 0.001 / 1000, 13 | "response_cost_per_token": 0.002 / 1000, 14 | }, 15 | "gpt-3.5-turbo-0125": { 16 | "prompt_cost_per_token": 0.0005 / 1000, 17 | "response_cost_per_token": 0.0015 / 1000, 18 | }, 19 | "gpt-4-1106-preview": { 20 | "prompt_cost_per_token": 0.01 / 1000, 21 | "response_cost_per_token": 0.03 / 1000, 22 | }, 23 | "gpt-4-0125-preview": { 24 | "prompt_cost_per_token": 0.01 / 1000, 25 | "response_cost_per_token": 0.03 / 1000, 26 | }, 27 | } 28 | 29 | @dataclass(frozen=True) 30 | class OpenAI: 31 | model: Literal["gpt-3.5-turbo-16k", "gpt-3.5-turbo", "gpt-4", "gpt-4-1106-preview"] = "gpt-3.5-turbo-16k" 32 | 33 | temperature: float = 0.7 34 | 35 | system_prompt: Optional[str] = None 36 | 37 | max_retries = 1 38 | 39 | log_file_path = "openai_usage.jsonl" 40 | 41 | def complete(self, conversation: List[str]) -> str: 42 | config = MODEL_CONFIGS[self.model] 43 | openai.api_key = os.environ["OPENAI_API_KEY"] 44 | deployment_name = self.model 45 | retry_count = 0 46 | 47 | 48 | messages = [] 49 | if self.system_prompt is not None: 50 | messages.append({"role": "system", "content": self.system_prompt}) 51 | for i, prompt in enumerate(conversation): 52 | messages.append({"role": ("user" if i % 2 == 0 else "assistant"), "content": prompt}) 53 | 54 | while True: 55 | try: 56 | response = openai.chat.completions.create( 57 | model=deployment_name, 58 | messages=messages, 59 | temperature=self.temperature, 60 | ) 61 | 62 | break 63 | except Exception as error: 64 | if "Please retry after" in str(error): 65 | timeout = int(str(error).split("Please retry after ")[1].split(" second")[0]) + 2 66 | print(f"Wait {timeout}s before OpenAI API retry ({error})") 67 | time.sleep(timeout) 68 | elif retry_count < self.max_retries: 69 | print(f"OpenAI API retry for {retry_count} times ({error})") 70 | time.sleep(2) 71 | retry_count += 1 72 | else: 73 | print(f"OpenAI API failed for {retry_count} times ({error})") 74 | return None 75 | 76 | self.log_usage(config, response.usage) 77 | 78 | generation = response.choices[0].message.content 79 | return generation 80 | 81 | def log_usage(self, config, usage): 82 | usage_log = {"prompt_tokens": usage.prompt_tokens, "completion_tokens": usage.completion_tokens} 83 | usage_log["prompt_cost"] = config["prompt_cost_per_token"] * usage.prompt_tokens 84 | usage_log["completion_cost"] = config["response_cost_per_token"] * usage.completion_tokens 85 | usage_log["cost"] = usage_log["prompt_cost"] + usage_log["completion_cost"] 86 | usage_log["model"] = self.model 87 | usage_log["user"] = os.getlogin() 88 | 89 | with FileLock(self.log_file_path + ".lock"): 90 | with open(self.log_file_path, "a") as f: 91 | f.write(json.dumps(usage_log) + "\n") -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | 
dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. 
For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | -------------------------------------------------------------------------------- /tokenization/tokenize_tutorchat.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from transformers import AutoTokenizer 4 | from datasets import Dataset, DatasetDict, load_dataset 5 | import random 6 | import argparse 7 | 8 | 9 | 10 | def clean_and_assign(name, all_text): 11 | """takes a simulated conversation, applies basic cleaning, and assigns student/teacher roles to help split the conversation into a dialogue""" 12 | # truncate between first and last occurrence of ### 13 | first = all_text.find("###") 14 | if first <= len(all_text)//2: 15 | all_text = all_text[first+3:] 16 | last = all_text.rfind("###") 17 | if last >= len(all_text)//2: 18 | all_text = all_text[:last] 19 | all_text = all_text.replace("###", "").strip("\n ") 20 | 21 | # assign roles 22 | if "generateexam" in name: 23 | key0, key1 = "QUESTION", "ANSWER" 24 | options = [ 25 | ("\nquestion: ", "\nanswer: "), 26 | ("\nuser: ", "\nassistant: "), 27 | ("\nassistant: ", "\nuser: ") 28 | ] 29 | turn0, turn1 = options[rng.sample([0,1,2],1)[0]] 30 | elif "studentstart" in name: 31 | key0, key1 = "STUDENT", "TEACHER" 32 | turn0, turn1 = "\nuser: ", "\nassistant: " 33 | elif "teacherstart" in name: 34 | key0, key1 = "TEACHER", "STUDENT" 35 | turn0, turn1 = "\nassistant: ", "\nuser: " 36 | 37 | # ignore badly formatted texts 38 | if key0 not in all_text: 39 | return 40 | return key0, key1, turn0, turn1, all_text 41 | 42 | 43 | def tokenize(dialogue, tokenizer, args): 44 | input_ids = [] 45 | labels = [] 46 | processed_conversation = "" 47 | if dialogue["mode"] == "openbook": 48 | for m, turn in enumerate(dialogue["conversation"]): 49 | if m == 0: 50 | turn_text = turn + f"{tokenizer.eos_token}\n{tokenizer.bos_token}" 51 | tokenized_turn = tokenizer.encode(turn_text, add_special_tokens=False) 52 | labels += [-100]*len(tokenized_turn) 53 | elif m % 2 == 0: 54 | turn_text = "\nassistant: " + turn + f"{tokenizer.eos_token}" 55 | tokenized_turn = tokenizer.encode(turn_text, add_special_tokens=False) 56 | labels += tokenized_turn 57 | elif m % 2 == 1: 58 | turn_text = "\nuser: " + turn + f"{tokenizer.eos_token}" 59 | tokenized_turn = tokenizer.encode(turn_text, add_special_tokens=False) 60 | labels += [-100]*len(tokenized_turn) 61 | input_ids += tokenized_turn 62 | processed_conversation += turn_text 63 | 64 | elif dialogue["mode"] == "closedbook": 65 | for m, turn in enumerate(dialogue["conversation"]): 66 | if m % 2 == 0: 67 | turn_text = "\nassistant: " + turn + f"{tokenizer.eos_token}" 68 | if m == 0: 69 | turn_text = f"{tokenizer.bos_token}"+turn_text 70 | tokenized_turn = tokenizer.encode(turn_text, add_special_tokens=False) 71 | labels += tokenized_turn 72 | elif m % 2 == 1: 73 | turn_text = "\nuser: " + turn + f"{tokenizer.eos_token}" 74 | tokenized_turn = tokenizer.encode(turn_text, add_special_tokens=False) 75 | labels += [-100]*len(tokenized_turn) 76 | input_ids += tokenized_turn 77 | processed_conversation += turn_text 78 | 79 | elif dialogue["mode"] == "singleturn": 80 | name = dialogue["name"] 81 | # get chapter text and make labels 82 | if "studentstart" in name: 83 | turn_text = dialogue["conversation"][0] + f"{tokenizer.eos_token}\n{tokenizer.bos_token}" 84 | tokenized_turn = tokenizer.encode(turn_text, add_special_tokens=False) 85 | 
input_ids += tokenized_turn 86 | labels += [-100]*len(tokenized_turn) 87 | processed_conversation += turn_text 88 | else: 89 | processed_conversation += f"{tokenizer.bos_token}" 90 | 91 | all_text = dialogue["conversation"][-1] 92 | key0, key1, turn0, turn1, all_text = clean_and_assign(name, all_text) 93 | 94 | # split by keys 95 | qa_pairs = all_text.split(key0) 96 | qa_lists = [s.split(key1) for s in qa_pairs] 97 | qa_lists = [s for s in qa_lists if len(s) == 2] 98 | qa_flat = [t.strip(": \n") for s in qa_lists for t in s] 99 | qa_flat = [t for t in qa_flat if t != ""] 100 | 101 | # add turns and make roles 102 | for m, turn in enumerate(qa_flat): 103 | if m % 2 == 0: 104 | turn_text = turn0 + turn + f"{tokenizer.eos_token}" 105 | else: 106 | turn_text = turn1 + turn + f"{tokenizer.eos_token}" 107 | processed_conversation+= turn_text 108 | tokenized_turn = tokenizer.encode(turn_text, add_special_tokens=False) 109 | input_ids += tokenized_turn 110 | if "studentstart" in name and m % 2 == 0: 111 | labels += [-100]*len(tokenized_turn) 112 | else: 113 | labels += tokenized_turn 114 | 115 | dialogue["input_ids"] = input_ids 116 | dialogue["attention_mask"] = [1]*len(input_ids) 117 | dialogue["labels"] = labels 118 | dialogue["processed_conversation"] = processed_conversation 119 | return dialogue 120 | 121 | 122 | 123 | 124 | if __name__ == "__main__": 125 | parser = argparse.ArgumentParser() 126 | parser.add_argument("--tokenizer", type=str, default="meta-llama/Llama-2-7b-hf", help="Choose the HF tokenizer") 127 | parser.add_argument("--stem_only", action="store_true", help="Tokenize only STEM domains") 128 | parser.add_argument("--save_dir", type=str, default="data/tokenized_tutorchat_llama", help="Directory for saving the HF dataset") 129 | args = parser.parse_args() 130 | 131 | if args.stem_only: 132 | domains = ["bio", "chem", "eng", "geo", "math", "med", "phys", "stats"] 133 | else: 134 | domains = [] 135 | 136 | tokenizer = AutoTokenizer.from_pretrained(args.tokenizer) 137 | rng = random.Random(4) 138 | 139 | 140 | all_dialogues = load_dataset("princeton-nlp/TutorChat") 141 | dialogues = all_dialogues.filter(lambda x : x["textbook_folder"].split("/")[1] in domains, num_proc=8) if domains != [] else all_dialogues 142 | validation = dialogues["validation"] 143 | validation = validation.map(lambda x: tokenize(x, tokenizer, args), num_proc=4) 144 | train = dialogues["train"] 145 | train = train.map(lambda x: tokenize(x, tokenizer, args), num_proc=4) 146 | 147 | 148 | tokenized = DatasetDict({ 149 | "train": train, 150 | "validation": validation 151 | }) 152 | 153 | tokenized.save_to_disk(args.save_dir) 154 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # *Language Models as Science Tutors* 2 | 3 | This is the official repository for [*Language Models as Science Tutors*](https://arxiv.org/abs/2402.11111). 4 | 5 | 6 | ## TutorEval 7 | 8 | 9 | 10 |
11 | ![TutorEval results (main radar figure)](assets/main_radar_fig.png)
 12 | 
 13 | 
 14 | 
15 | 16 | ### 🎓 About 17 | TutorEval is a question-answering benchmark which evaluates how well a language model (the *LM tutor*) can help a user understand a chapter from a science textbook. TutorEval contains over 800 questions written by 17 expert researchers covering math, computer science, physics, life sciences, and environmental science. TutorEval questions relate to chapters from [TutorChat](https://huggingface.co/datasets/princeton-nlp/TutorChat) (downloaded from [libretexts.org](https://libretexts.org)) and require the model to answer free-form questions written from the point of view of a student. TutorEval questions are very diverse: they may ask for explanations of complicated content, for additional information going beyond the chapter, for verifications of exercise solutions, etc. Download the TutorEval data from HuggingFace at [princeton-nlp/TutorEval](https://huggingface.co/datasets/princeton-nlp/TutorEval). 18 | 19 | TutorEval uses an LM as an evaluator. Once the LM tutor has generated responses to TutorEval questions, the evaluator is prompted to compare the tutor's outputs with a set of ground-truth *key points*. These key points were written by the human experts who created TutorEval, and sketch the most important points that the tutor should cover when answering the student. 20 | 21 | ### 📖 OpenBook and 📕 ClosedBook 22 | 23 | TutorEval questions are very diverse and rely on the textbook chapter in different ways. Some questions explicitly refer to the chapter (*open-book*), and some questions are phrased in such a way that they can be understood without reading the textbook chapter (*closed-book*). This means that TutorEval contains two evaluations in one: 24 | 25 | - 📖 TutorEval with open-book evaluation: this is our main setting and uses all 834 TutorEval questions. The LM tutor is prompted with the entire textbook chapter and the question. This requires LMs to process contexts up to 6,000 words. 26 | - 📕 TutorEval-ClosedBook: this evaluation uses the 370 closed-book questions in TutorEval and prompts the LM tutor without the chapter. This makes it possible to evaluate short-context LMs. 27 | 28 | ### 🏆 Leaderboard 29 | We rank the models based on the full TutorEval score, even though TutorEval-ClosedBook rankings sometimes differ. 30 | |Model|TutorEval| ClosedBook| 31 | |-|-|-| 32 | | GPT-4 | 85.2 | 86.1 | 33 | | Llama-3-70B | 71.3 | 78.3 | 34 | | GPT-3.5-Turbo | 68.3 | 69.6 | 35 | | Phi-3-Medium-128K | 67.6 | 69.5 | 36 | | Mixtral-8x7B | 66.3 | 68.2 | 37 | | Phi-3-Mini-128K | 59.5 | 63.5 | 38 | | Llemma-34B-MathMix | 56.8 | 55.3 | 39 | | Mistral-7B-Instruct-V0.2 | 55.5 | 58.7 | 40 | | Llama-3-8B | 55.3 | 59.1 | 41 | | Mathstral-7B | 53.9 | 55.6 | 42 | | Llemma-7B-32K-MathMix | 50.0 | 45.6 | 43 | | Zephyr-7B-Beta | 45.7 | 49.4 | 44 | | Vicuna-13B-V1.5-16K | 32.9 | 36.8 | 45 | | Mistral-7B-Instruct-V0.1 | 30.5 | 35.5 | 46 | | Gemma-7B-IT | 24.0 | 39.5 | 47 | ### 🧑‍💻 Evaluating on TutorEval 48 | 49 | To evaluate your own model on TutorEval, please use the scripts provided in `./tutoreval`. 50 | 51 | - `./tutoreval/generate.py` produces the LM tutor outputs. 52 | - `./tutoreval/grade.py` uses GPT-4 as an evaluator to grade the LM tutor's outputs. 53 | - `./tutoreval/get_results.py` collects GPT-4's grades to give a breakdown of the final TutorEval performance. 54 | 55 | See `./tutoreval/README.md` for detailed instructions. 
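The evaluation scripts above all read the benchmark from the Hub; as a quick sanity check, the sketch below loads TutorEval the same way `tutoreval/generate.py` does. The column names (`question`, `closed_book`) are the ones that script relies on, and the counts are those quoted earlier in this section.

```python
from datasets import load_dataset

# Load TutorEval exactly as tutoreval/generate.py does.
tutoreval = load_dataset("princeton-nlp/TutorEval")["train"]
print(len(tutoreval))  # 834 questions in the full, open-book setting

# TutorEval-ClosedBook keeps only the questions flagged as closed_book.
closedbook = tutoreval.filter(lambda x: x["closed_book"])
print(len(closedbook))  # 370 closed-book questions

# Each row pairs a student question with its source chapter.
print(tutoreval[0]["question"][:300])
```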
56 | 
 57 | The file `./tutoreval/human_gpt_grades.csv` contains the human grades alongside the GPT-4-1106 grades attributed to four models for each of the TutorEval questions. The human grades can be used to calibrate other LLM judges. Human-LLM correlation can be measured using this dataset as in Appendix C.2, Table 9 in the paper. Note that the TutorEval questions in `human_gpt_grades.csv` may differ slightly from the official set of TutorEval questions as some grammatical typos were corrected after human gradings were completed.
 58 | 
 59 | ## TutorChat
 60 | TutorChat is the first dialogue-tuning dataset for science. TutorChat consists of 80,000 synthetic teacher-student dialogues created using GPT-3.5 and GPT-4. Each conversation is grounded in a textbook chapter downloaded from [libretexts.org](https://libretexts.org) and can take various formats:
 61 | - open-book teacher-student dialogues, where the student asks questions about a textbook chapter and the teacher gives helpful answers. These discussions are led by the student.
 62 | - closed-book dialogues, where the teacher conducts a class based on the textbook chapter.
 63 | - textbook exams, which are question/answer pairs based on the textbook chapter.
 64 | 
 65 | We provide TutorChat dialogues for all chapters contained in the TextbookChapters dataset below, which includes humanities and social sciences. 40% of TutorChat dialogues concern STEM subjects.
 66 | 
 67 | Download the TutorChat data from HuggingFace at [princeton-nlp/TutorChat](https://huggingface.co/datasets/princeton-nlp/TutorChat).
 68 | 
 69 | ### 📚 Textbook chapters
 70 | Download the processed textbook chapters from HuggingFace at [princeton-nlp/TextbookChapters](https://huggingface.co/datasets/princeton-nlp/TextbookChapters). This dataset was obtained by scraping [libretexts.org](https://libretexts.org) and processing the cleaned HTML files with the HTML-to-LaTeX parser from [Openwebmath](https://github.com/keirp/OpenWebMath).
 71 | 
 72 | ### ⚙️ TutorChat processing
 73 | `./tokenization/tokenize_tutorchat.py` tokenizes TutorChat and creates training labels according to the recipe used to train `Llemma-7B-32K-MathMix`. Use the flag `--stem_only` to tokenize only the STEM split of TutorChat.
 74 | 
 75 | ### 🔢 MathMix
 76 | MathMix is a fine-tuning dataset composed of the STEM split of TutorChat and a processed version of [MetaMath](https://huggingface.co/datasets/meta-math/MetaMathQA). In `./tokenization`, we provide some scripts to re-create and tokenize MathMix.
 77 | 
 78 | `./tokenization/tokenize_metamath.py` tokenizes MetaMath by randomly concatenating question/answer pairs to form longer samples. Use the flag `--num_concat` to set the number of samples to concatenate. MathMix concatenates 10 samples at a time.
 79 | 
 80 | `./tokenization/mathmix_combine.py` concatenates and shuffles the tokenized TutorChat and MetaMath datasets to create MathMix. Use the flags `--tutorchat` and `--metamath` to set the paths to your tokenized datasets created with `./tokenization/tokenize_tutorchat.py` and `./tokenization/tokenize_metamath.py`.
 81 | 
 82 | ## Models
 83 | Download our models from HuggingFace at [princeton-nlp/Llemma-7B-32K-MathMix](https://huggingface.co/princeton-nlp/Llemma-7B-32K-MathMix) and [princeton-nlp/Llemma-34B-MathMix](https://huggingface.co/princeton-nlp/Llemma-34B-MathMix).
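For convenience, here is a minimal sketch of pulling the released resources mentioned above from the Hugging Face Hub; the repository names are the ones linked in this README, and the exact splits and columns are best inspected after loading.

```python
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

# Datasets released with the paper.
tutorchat = load_dataset("princeton-nlp/TutorChat")
chapters = load_dataset("princeton-nlp/TextbookChapters")

# Tutor model fine-tuned on MathMix.
tokenizer = AutoTokenizer.from_pretrained("princeton-nlp/Llemma-7B-32K-MathMix")
model = AutoModelForCausalLM.from_pretrained("princeton-nlp/Llemma-7B-32K-MathMix")
```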
84 | 85 | ## Citation 86 | ```bibtex 87 | @misc{chevalier2024language, 88 | title={Language Models as Science Tutors}, 89 | author={Alexis Chevalier and Jiayi Geng and Alexander Wettig and Howard Chen and Sebastian Mizera and Toni Annala and Max Jameson Aragon and Arturo Rodríguez Fanlo and Simon Frieder and Simon Machado and Akshara Prabhakar and Ellie Thieu and Jiachen T. Wang and Zirui Wang and Xindi Wu and Mengzhou Xia and Wenhan Jia and Jiatong Yu and Jun-Jie Zhu and Zhiyong Jason Ren and Sanjeev Arora and Danqi Chen}, 90 | year={2024}, 91 | eprint={2402.11111}, 92 | archivePrefix={arXiv}, 93 | primaryClass={cs.CL} 94 | } 95 | ``` 96 | 97 | -------------------------------------------------------------------------------- /tutoreval/generate.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | import argparse 3 | import os 4 | import json 5 | from utils.openai_utils import OpenAI 6 | # from utils.togetherai_utils import TogetherBaseEngine 7 | from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, BitsAndBytesConfig 8 | from utils.generation_utils import generation_utils 9 | from datasets import load_dataset, load_from_disk 10 | import torch 11 | 12 | 13 | 14 | def generate_answers(data, template, model, tokenizer=None): 15 | outputs = [] 16 | for sample in tqdm(data): 17 | chapters = sample["chapter"] 18 | questions = sample["question"] 19 | sample["template"] = [template]*len(questions) 20 | query = [template.replace("{{QUESTION}}", q).replace("{{CHAPTER}}", c) for (q,c) in zip(questions, chapters)] 21 | 22 | if "openai/gpt" in args.model: 23 | assert args.batch_size == 1 24 | response = [model.complete(query)] 25 | elif args.togetherapi: 26 | assert args.batch_size == 1 27 | prompt="user: "+ query[0] + "\nassistant: " 28 | response = model.safe_completion(prompt, check_prompt=False)["content"] 29 | else: 30 | query, stop = generation_utils(query, args, tokenizer) 31 | inputs = tokenizer(query, add_special_tokens=False, return_tensors="pt", padding=True) 32 | inputs = inputs.to(model.device) 33 | with torch.inference_mode(): 34 | out = model.generate(inputs=inputs["input_ids"], attention_mask = inputs["attention_mask"], pad_token_id=tokenizer.eos_token_id, stopping_criteria=stop, max_new_tokens=800) 35 | out = out[: , inputs["input_ids"].shape[1]:] 36 | response = tokenizer.batch_decode(out, skip_special_tokens=True) 37 | sample["output"] = response 38 | sample["model"] =[args.model]*len(questions) 39 | sample["closedbook_eval"] = [args.closedbook]*len(questions) 40 | sample["hf_chat_template"] = [args.hf_chat_template]*len(questions) 41 | sample["bnb4bit"] = [args.bnb4bit]*len(questions) 42 | outputs+= [ {k: sample[k][i] for k in sample.keys()} for i in range(len(sample["output"]))] 43 | return outputs 44 | 45 | if __name__ == "__main__": 46 | parser = argparse.ArgumentParser() 47 | 48 | parser.add_argument("--model", default="princeton-nlp/Llemma-7B-32K-MathMix", type=str, help="Generator model") 49 | parser.add_argument("--output_dir", default="tutoreval/generations", type=str, help="output simulations") 50 | parser.add_argument("--closedbook", action="store_true", help="output simulations") 51 | parser.add_argument("--hf_chat_template", action="store_true", help="If True, uses the chat template from tokenizer. 
If False, uses defaut user/assistant formatting and allows custom implementations") 52 | parser.add_argument("--togetherapi", action="store_true", help="use the TogetherAI API") 53 | parser.add_argument("--rope_theta", default=-1, type=int, help="Set a higher RoPE theta for context window extension. If set to -1, use the pre-trained config value.") 54 | parser.add_argument("--batch_size", default=1, type=int, help="Batch size used during generation. Only for locally run models") 55 | parser.add_argument("--ddp_worldsize", default=1, type=int, help="For data parallel. Sets the number of parallel instances") 56 | parser.add_argument("--ddp_rank", default=0, type=int, help="For data parallel. Set this to the data fragment to use for generation. Value should be in range(args.ddp_worldsize)") 57 | parser.add_argument("--bnb4bit", action="store_true", help="Use 4 bit quantization") 58 | 59 | args = parser.parse_args() 60 | 61 | 62 | # load data 63 | try: 64 | data = load_dataset("princeton-nlp/TutorEval")["train"] 65 | except: 66 | try: 67 | data = load_from_disk("tutoreval/tutoreval_dataset/train") 68 | except: 69 | print("Please download the dataset from princeton-nlp/TutorEval and save it under tutoreval/tutoreval_dataset") 70 | exit() 71 | 72 | if args.closedbook: 73 | data = data.filter(lambda x: x["closed_book"]) 74 | with open("tutoreval/templates/closedbook_generation_template.txt", "r") as f: 75 | template = f.read() 76 | else: 77 | with open("tutoreval/templates/generation_template.txt", "r") as f: 78 | template = f.read() 79 | 80 | if args.ddp_worldsize > 1: 81 | assert args.ddp_rank in range(args.ddp_worldsize) 82 | data = data.select(list(range(args.ddp_rank, len(data), args.ddp_worldsize))) 83 | data = torch.utils.data.DataLoader(data, batch_size = args.batch_size, shuffle=False) 84 | 85 | 86 | if "openai/gpt" in args.model: # openai api 87 | # examples: openai/gpt-3.5-turbo-16k openai/gpt-4-1106-preview 88 | engine=args.model.split("/")[1] 89 | print(engine) 90 | args.system_prompt = "You are a helpful science teacher interacting with a keen student. You try your utmost to answer the student's questions and to encourage the student to learn further. You are also very careful to provide clear, accurate, and factual answers, as you must not mislead the student in any way" 91 | model = OpenAI(model=engine, system_prompt=args.system_prompt) 92 | print(model.complete(["Hello! Introduce yourself please!"])) 93 | tokenizer = None 94 | args.batch_size = 1 95 | elif args.togetherapi: 96 | model = TogetherBaseEngine(args.model) 97 | print(model.complete(["user: Hello! 
Introduce yourself please!\nassistant: "])) 98 | tokenizer = None 99 | args.batch_size = 1 100 | 101 | else: 102 | config = AutoConfig.from_pretrained(args.model) 103 | config.max_new_tokens = 800 104 | config.dtype=torch.bfloat16 105 | config.do_sample = False 106 | config.use_cache=True 107 | if args.rope_theta != -1: 108 | config.rope_theta=args.rope_theta 109 | print(f"Setting RoPE theta = {args.rope_theta}") 110 | tokenizer = AutoTokenizer.from_pretrained(args.model) 111 | tokenizer.pad_token = tokenizer.eos_token 112 | 113 | if args.bnb4bit: 114 | quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16) 115 | model = AutoModelForCausalLM.from_pretrained( 116 | args.model, 117 | config=config, 118 | quantization_config=quantization_config, 119 | torch_dtype=torch.bfloat16, 120 | device_map="auto", 121 | attn_implementation="flash_attention_2" 122 | ) 123 | else: 124 | model = AutoModelForCausalLM.from_pretrained( 125 | args.model, 126 | config=config, 127 | torch_dtype=torch.bfloat16, 128 | device_map="auto", 129 | attn_implementation="flash_attention_2" 130 | ) 131 | 132 | model.eval() 133 | 134 | outputs = generate_answers(data, template, model, tokenizer) 135 | 136 | # postprocessing 137 | for out in outputs: 138 | for k in out.keys(): 139 | out[k] = out[k].item() if type(out[k]) == torch.Tensor else out[k] 140 | 141 | # save 142 | if args.closedbook: 143 | base_save = f"{args.output_dir}/closedbook" 144 | else: 145 | base_save = f"{args.output_dir}/openbook" 146 | 147 | if args.ddp_worldsize > 1: 148 | save_dir = f"{base_save}/{args.model}_{args.ddp_rank}_of_{args.ddp_worldsize}.json" 149 | else: 150 | save_dir = f"{base_save}/{args.model}.json" 151 | 152 | os.makedirs(os.path.dirname(save_dir), exist_ok=True) 153 | with open(save_dir, "w+") as f: 154 | f.write(json.dumps(outputs, indent=4)) 155 | --------------------------------------------------------------------------------