├── .gitignore ├── .vscode ├── launch.json └── settings.json ├── LICENSE ├── README.md ├── prm800k ├── .gitignore └── README.md ├── requirements.txt ├── scripts ├── convert.py └── read.py └── src ├── convert.py └── prm800k_record.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | .DS_Store 3 | /out/ -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | "configurations": [ 3 | { 4 | "name": "Python: Convert", 5 | "type": "python", 6 | "request": "launch", 7 | "module": "scripts.convert", 8 | "justMyCode": false 9 | }, 10 | { 11 | "name": "Python: Read", 12 | "type": "python", 13 | "request": "launch", 14 | "module": "scripts.read", 15 | "justMyCode": false 16 | } 17 | ] 18 | } -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.analysis.extraPaths": ["src"], 3 | "editor.tabSize": 4, 4 | "editor.insertSpaces": true 5 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Scott Logic Ltd 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial 
portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # prm800k-denorm 2 | 3 | This repository is home to a script for converting OpenAI's [PRM800K](https://github.com/openai/prm800k/tree/main) process supervision dataset to a denormalized format in `.parquet` file type, for easier consumption. 4 | 5 | Datasets converted with this tool have been published at [huggingface.co/sl-alex](https://huggingface.co/sl-alex). 6 | 7 | ## Why would I want process supervision? 8 | 9 | See OpenAI's [blog post](https://openai.com/research/improving-mathematical-reasoning-with-process-supervision) and "Let's Verify Step by Step" [paper](https://arxiv.org/abs/2305.20050), but essentially it helps to improve a language model's reasoning and human alignment. 10 | 11 | If you only reward a language model for correct answers, then incentivizes "reward hacking" — reaching correct conclusions via incorrect reasoning. This results in a poor underlying understanding, which fails to generalize, and gives rise to logical mistakes (often called "hallucinations"). 12 | 13 | So, rather than the usual "outcome supervision": let's try "process supervision": reward the language model for making good _progress_ towards solutions. This will improve its reasoning. 
14 | We encourage this, by teaching it to complete maths problems step-by-step, showing its workings. 15 | 16 | Our goal with this repository is to give you access to a step-by-step dataset, so that you can train your language model in this way. 17 | 18 | The dataset is semantic, so that you can template it into your prompt style however works best for you. 19 | 20 | ## Dataset types 21 | 22 | In total, the datasets are: 23 | 24 | - [Solutions only](https://huggingface.co/datasets/sl-alex/openai-prm800k-solutions-only) 25 | - [Stepwise best](https://huggingface.co/datasets/sl-alex/openai-prm800k-stepwise-best) 26 | - [Stepwise critic](https://huggingface.co/datasets/sl-alex/openai-prm800k-stepwise-critic) 27 | 28 | _You probably just want one of the three combined datasets rather than one of the twelve per-phase splits._ 29 | 30 | ## Loading a dataset 31 | 32 | You can load a dataset and iterate through it, like so: 33 | 34 | ```python 35 | from datasets import load_dataset, DatasetDict, Dataset 36 | # dd: DatasetDict = load_dataset("sl-alex/openai-prm800k-solutions-only") 37 | # dd: DatasetDict = load_dataset("sl-alex/openai-prm800k-stepwise-best") 38 | dd: DatasetDict = load_dataset("sl-alex/openai-prm800k-stepwise-critic") 39 | 40 | d: Dataset = dd['train'] 41 | 42 | prompt_template = """Below is an instruction that describes a task. Write a response that appropriately completes the request. Show your reasoning step-by-step, and indicate your final answer under a heading titled Answer. 
43 | 44 | ### Instruction: 45 | {instruction} 46 | 47 | ### Response: 48 | {response_history}""" 49 | 50 | answer_template = """{response} 51 | 52 | # Answer: 53 | {answer}""" 54 | 55 | for record in zip( 56 | d['instruction'], 57 | d['responses'], 58 | d['next_response'], 59 | d['answer'], 60 | d['is_human_response'], 61 | d['is_solution'], 62 | d['is_preferred_response'], 63 | d['rating'] 64 | ): 65 | instruction, responses, next_response, answer, is_human_response, is_solution, is_preferred_response, rating = record 66 | prompt = prompt_template.format( 67 | instruction=instruction, 68 | response_history=''.join((f'{response}\n' for response in responses)), 69 | ) 70 | completion=next_response if answer is None else answer_template.format(response=next_response, answer=answer) 71 | print(f'Prompt:\n<{prompt}>') 72 | print(f'Completion:\n<{completion}>') 73 | ``` 74 | 75 | It should print something like: 76 | 77 | ``` 78 | Prompt: 79 | 86 | Completion: 87 | <7.8 minutes is the same as 7 minutes and 0.8 minutes.> 88 | ``` 89 | 90 | ## Dataset description 91 | 92 | ### Solutions only 93 | 94 | ``` 95 | # the problem statement (e.g. "what is x, given 2x = 1?") 96 | instruction: str 97 | 98 | # history of responses that the bot emitted before reaching this final step 99 | responses: List[str] 100 | 101 | # response emitted for the final step in the conversation. 102 | # it is accompanied by an "answer" subsection, which we capture separately 103 | next_response: str 104 | 105 | # answer subsection of the final response 106 | answer: str 107 | ``` 108 | 109 | Only conversations which achieved `"finish_reason": "solution"` are retained. 110 | 111 | We provide only one record per conversation — the final step. 112 | This record contains a list of all the steps which led to that final step. So you have the whole conversation available. 
113 | 114 | You could use this to create the following kind of training data: 115 | 116 | - **Source**: initial instruction 117 | - **Target**: all steps of the conversation, including the final step and answer 118 | 119 | #### Usage 120 | 121 | Example of how to take this sample and turn it into a source->target training pair: 122 | 123 | sample: 124 | ``` 125 | instruction: "What is x, given 2x + 1 = 2?" 126 | 127 | responses: [ 128 | "Okay, let's first rearrange the equation to isolate the x.", 129 | "2x = 2 - 1, which simplifes to 2x = 1", 130 | "Now that the x term is isolated, let's divide both sides to eliminate its coefficient." 131 | ] 132 | 133 | next_response: "x = 1/2" 134 | 135 | answer: "0.5" 136 | ``` 137 | 138 | Example Alpaca-style source prompt: 139 | 140 | ``` 141 | Below is an instruction that describes a task. Write responses which progress toward a solution to the request. Indicate your final answer under a heading Final Answer. 142 | 143 | ### Instruction: 144 | What is x, given 2x + 1 = 2? 145 | 146 | ### Response: 147 | Okay, let's first rearrange the equation to isolate the x. 148 | 149 | ### Response: 150 | 2x = 2 - 1, which simplifes to 2x = 1 151 | 152 | ### Response: 153 | Now that the x term is isolated, let's divide both sides to eliminate its coefficient. 154 | 155 | ### Response: 156 | ``` 157 | 158 | Example Alpaca-style target prompt: 159 | 160 | ``` 161 | x = 1/2 162 | 163 | # Answer 164 | 0.5 165 | ``` 166 | 167 | ### Stepwise best 168 | 169 | ``` 170 | # the problem statement (e.g. "what is x, given 2x = 1?") 171 | instruction: str 172 | 173 | # any responses that were emitted in prior conversation steps 174 | # empty list means it's the first conversation turn 175 | responses: List[str] 176 | 177 | # the response emitted for the current step in the conversation 178 | # if this is the final response in the conversation: the response may be accompanied by 179 | # an answer subsection. 
we separate this out into the `answer` field below. 180 | next_response: str 181 | 182 | # usually None, but if this is filled in: the final response of the conversation has occurred, 183 | # and has a subsection declaring its overall answer. this field captures that overall answer. 184 | answer: Optional[str] 185 | 186 | # often, there are conversation steps in which the human evaluator had to intervene, 187 | # as no generated response was satisfactory. 188 | is_human_response: bool 189 | ``` 190 | 191 | We provide a record per productive conversation turn. 192 | Whichever is the preferred completion for that conversation turn (i.e. the `chosen_completion` if a bot response was preferred, or a `human_completion` otherwise), is the one we use. 193 | 194 | You could use this to create the following kind of training data: 195 | 196 | **Source**: initial instruction + 0-to-many steps of conversation so far 197 | **Target**: next step of conversation (doesn't necessarily get you to a complete solution) 198 | 199 | #### Usage 200 | 201 | Example of how to take this sample and turn it into a source->target training pair: 202 | 203 | sample: 204 | ``` 205 | instruction: "What is x, given 2x + 1 = 2?" 206 | 207 | responses: [ 208 | "Okay, let's first rearrange the equation to isolate the x.", 209 | "2x = 2 - 1, which simplifes to 2x = 1", 210 | ] 211 | 212 | next_response: "Now that the x term is isolated, let's divide both sides to eliminate its coefficient." 213 | 214 | answer: None 215 | 216 | is_human_response: False 217 | ``` 218 | 219 | Example Alpaca-style source prompt: 220 | 221 | ``` 222 | Below is an instruction that describes a task. Write responses which progress toward a solution to the request. Indicate your final answer under a heading Final Answer. 223 | 224 | ### Instruction: 225 | What is x, given 2x + 1 = 2? 226 | 227 | ### Response: 228 | Okay, let's first rearrange the equation to isolate the x. 

### Response:
2x = 2 - 1, which simplifies to 2x = 1

### Response:
```

Example Alpaca-style target prompt:

```
Now that the x term is isolated, let's divide both sides to eliminate its coefficient.
```

Notice how this time there was no "answer" subsection, because this conversation step does not propose a solution. It can happen though, so check whether answer is `None`.

### Stepwise critique

```
# the problem statement (e.g. "what is x, given 2x = 1?")
instruction: str

# any responses that were emitted in prior conversation steps
# empty list means it's the first conversation turn
responses: List[str]

# the response emitted for the current step in the conversation
# if this is the final response in the conversation: the response may be accompanied by
# an answer subsection. we separate this out into the `answer` field below.
next_response: str

# usually None, but if this is filled in: the final response of the conversation has occurred,
# and has a subsection declaring its overall answer. this field captures that overall answer.
answer: Optional[str]

# often, there are conversation steps in which the human evaluator had to intervene,
# as no generated response was satisfactory.
is_human_response: bool

# whether an answer was provided **and** was correct
is_solution: bool

# tells us if this completion was rated as the best option by the human evaluator
is_preferred_response: bool

# -1 = counterproductive
# 0 = unproductive
# 1 = productive
# None = human answer (you can treat this as 1)
rating: Optional[int]
```

We provide a record per proposed completion, per conversation turn.
This includes human ratings.

You could use this to create the following kind of training data:

**Source**: initial instruction + 0-to-many steps of conversation so far
**Target**: a possible completion and rating

The data here includes both good and bad rated samples.

You could just keep the good ones. That would be similar to what Stepwise best gives you, but instead of "best" it gives you a wider "any answer which got a good rating, even if it wasn't the best one". So, more data (lower quality).

But an even better usage is to use this to train a critic. @crowsonkb has demonstrated that if you have a classifier, you can [guide your sampling](https://gist.github.com/crowsonkb/af6135392cc1627f40b03456aa90810c) of the language model, generating a few candidate next-tokens and picking the one that your classifier prefers.
So you could use this to train a critic, to guide another language model to employ more reasoning in its responses.

### Usage

You'd format this the same as stepwise best.

## Setup

_Note: if you're happy with the [published datasets](https://huggingface.co/sl-alex), then there's no need for you to get this repository set up yourself — these Setup instructions are for tinkerers who wish to try the export themselves, or change the format_.

### Get OpenAI data

Copy the PRM800K and MATH datasets into the `prm800k` directory, in accordance with [`prm800k/README.md`](prm800k/README.md).

### Install Python dependencies

```bash
pip install -r requirements.txt
```

### Run the converter

```bash
python -m scripts.convert
```

This should output some .parquet files to a directory `out`, at the root of the repository.
320 | -------------------------------------------------------------------------------- /prm800k/.gitignore: -------------------------------------------------------------------------------- 1 | /data/ 2 | /math_splits/ -------------------------------------------------------------------------------- /prm800k/README.md: -------------------------------------------------------------------------------- 1 | Please copy into this directory, some resources from [`openai/prm800k`](https://github.com/openai/prm800k): 2 | 3 | - [`data/`](https://github.com/openai/prm800k/tree/main/prm800k/data) 4 | - [`math_splits/`](https://github.com/openai/prm800k/tree/main/prm800k/math_splits) -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pyarrow 2 | pandas -------------------------------------------------------------------------------- /scripts/convert.py: -------------------------------------------------------------------------------- 1 | from os import makedirs, listdir 2 | from os.path import dirname, realpath, exists 3 | from pathlib import Path 4 | import pyarrow as pa 5 | from pyarrow.parquet import ParquetWriter 6 | import json 7 | from typing import Iterable, NamedTuple, List, Optional 8 | from logging import getLogger, Logger 9 | from src.prm800k_record import PRMRecord 10 | from src.convert import make_telescoping_conversation, make_critiques, Sample, GiveUp, Malformed 11 | import fnmatch 12 | 13 | logger: Logger = getLogger(__file__) 14 | 15 | stepwise_schema = pa.schema([ 16 | ('instruction', pa.string()), 17 | ('responses', pa.list_(pa.string())), 18 | ('next_response', pa.string()), 19 | pa.field('answer', pa.string(), nullable=True), 20 | ('is_human_response', pa.bool_()), 21 | ]) 22 | 23 | answer_only_schema = pa.schema([ 24 | ('instruction', pa.string()), 25 | ('responses', pa.list_(pa.string())), 26 | ('next_response', pa.string()), 
27 | pa.field('answer', pa.string(), nullable=True), 28 | ]) 29 | 30 | critique_schema = pa.schema([ 31 | ('instruction', pa.string()), 32 | ('responses', pa.list_(pa.string())), 33 | ('next_response', pa.string()), 34 | pa.field('answer', pa.string(), nullable=True), 35 | ('is_human_response', pa.bool_()), 36 | ('is_solution', pa.bool_()), 37 | ('is_preferred_response', pa.bool_()), 38 | ('rating', pa.int8()), 39 | ]) 40 | 41 | class StepwiseBatch(NamedTuple): 42 | instructions: List[str] 43 | response_lists: List[List[str]] 44 | next_responses: List[str] 45 | answers: List[bool] 46 | is_human_response: List[bool] 47 | 48 | class AnswerOnlyBatch(NamedTuple): 49 | instructions: List[str] 50 | response_lists: List[List[str]] 51 | next_responses: List[str] 52 | answers: List[bool] 53 | 54 | class CritiqueBatch(NamedTuple): 55 | instructions: List[str] 56 | response_lists: List[List[str]] 57 | next_responses: List[str] 58 | answers: List[bool] 59 | is_human_response: List[bool] 60 | is_solution: List[bool] 61 | is_preferred_response: List[bool] 62 | ratings: List[int] 63 | 64 | if __name__ == '__main__': 65 | script_dir = Path(dirname(realpath(__file__))) 66 | repo_root: Path = script_dir.parent 67 | out_dir = repo_root.joinpath('out') 68 | makedirs(out_dir, exist_ok=True) 69 | 70 | data_dir: Path = repo_root.joinpath('prm800k/data') 71 | assert exists(data_dir), 'Expected dir prm800k/data to exist -- you are expected to copy this in yourself from https://github.com/Openai/Prm800k. See prm800k/README.md for details.' 
72 | 73 | for data_file in fnmatch.filter(listdir(data_dir), '*.jsonl'): 74 | data_stem = Path(data_file).stem 75 | in_path_jsonl: Path = data_dir.joinpath(data_file) 76 | 77 | out_all: Path = out_dir.joinpath(f'{data_stem}.stepwise-best.parquet') 78 | out_all.unlink(missing_ok=True) 79 | out_answer_only: Path = out_dir.joinpath(f'{data_stem}.solutions-only.parquet') 80 | out_answer_only.unlink(missing_ok=True) 81 | out_critique: Path = out_dir.joinpath(f'{data_stem}.stepwise-critique.parquet') 82 | out_critique.unlink(missing_ok=True) 83 | 84 | with (open(in_path_jsonl, 'r') as file, 85 | ParquetWriter(str(out_all), schema=stepwise_schema) as stepwise_writer, 86 | ParquetWriter(str(out_answer_only), schema=answer_only_schema) as answer_only_writer, 87 | ParquetWriter(str(out_critique), schema=critique_schema) as out_critique_writer): 88 | for line_ix, line in enumerate(file.readlines()): 89 | js: PRMRecord = json.loads(line) 90 | 91 | samples: Iterable[Sample] = make_telescoping_conversation(js) 92 | batch = StepwiseBatch([], [], [], [], []) 93 | instructions, response_lists, next_responses, answers, is_human_responses = batch 94 | final_sample: Optional[Sample] = None 95 | try: 96 | for sample_ix, sample in enumerate(samples): 97 | instruction, responses, next_response, answer, is_human_response = sample 98 | instructions.append(instruction) 99 | response_lists.append(responses) 100 | next_responses.append(next_response) 101 | answers.append(answer) 102 | is_human_responses.append(is_human_response) 103 | if answer is not None: 104 | final_sample = sample 105 | except GiveUp as e: 106 | logger.warning(f'[BestResponseOnly] Record at line {line_ix} gave up, at conversation step {sample_ix+1}') 107 | except Malformed as e: 108 | logger.warning(f'[BestResponseOnly] Record at line {line_ix} was malformed, at conversation step {sample_ix+1}, error was: {e.args[0]}') 109 | 110 | if batch.instructions: 111 | table = pa.Table.from_arrays(list(batch), 
schema=stepwise_schema) 112 | stepwise_writer.write_table(table) 113 | 114 | if final_sample is not None: 115 | instruction, responses, next_response, answer, _ = final_sample 116 | batch = AnswerOnlyBatch([instruction], [responses], [next_response], [answer]) 117 | table = pa.Table.from_arrays(list(batch), schema=answer_only_schema) 118 | answer_only_writer.write_table(table) 119 | 120 | samples: Iterable[Sample] = make_critiques(js) 121 | batch = CritiqueBatch([], [], [], [], [], [], [], []) 122 | instructions, response_lists, next_responses, answers, is_human_responses, is_solutions, is_preferred_responses, ratings = batch 123 | try: 124 | for sample_ix, sample in enumerate(samples): 125 | instruction, responses, next_response, answer, is_human_response, is_solution, is_preferred_response, rating = sample 126 | instructions.append(instruction) 127 | response_lists.append(responses) 128 | next_responses.append(next_response) 129 | answers.append(answer) 130 | is_human_responses.append(is_human_response) 131 | is_solutions.append(is_solution) 132 | is_preferred_responses.append(is_preferred_response) 133 | ratings.append(rating) 134 | except Malformed as e: 135 | logger.warning(f'[Critic] Record at line {line_ix} was malformed, at conversation step {sample_ix+1}, error was: {e.args[0]}') 136 | table = pa.Table.from_arrays(list(batch), schema=critique_schema) 137 | out_critique_writer.write_table(table) -------------------------------------------------------------------------------- /scripts/read.py: -------------------------------------------------------------------------------- 1 | from pandas import read_parquet, DataFrame 2 | from os.path import dirname, realpath 3 | from pathlib import Path 4 | 5 | if __name__ == '__main__': 6 | script_dir = Path(dirname(realpath(__file__))) 7 | repo_root: Path = script_dir.parent 8 | out_dir = repo_root.joinpath('out') 9 | 10 | # df: DataFrame = read_parquet(str(out_dir.joinpath('phase1_train.parquet'))) 11 | # df: DataFrame 
= read_parquet(str(out_dir.joinpath('phase1_train.answer_only.parquet'))) 12 | df: DataFrame = read_parquet(str(out_dir.joinpath('phase1_train.critique.parquet'))) 13 | pass # somewhere to put a breakpoint 14 | -------------------------------------------------------------------------------- /src/convert.py: -------------------------------------------------------------------------------- 1 | from typing import Generator, List, Iterable, NamedTuple, Optional 2 | from src.prm800k_record import PRMRecord, Completion, FinishReason 3 | from collections import deque 4 | 5 | class Sample(NamedTuple): 6 | instruction: str 7 | responses: List[str] 8 | next_response: str 9 | answer: Optional[str] 10 | is_human_response: bool 11 | 12 | class CritiqueSample(NamedTuple): 13 | instruction: str 14 | responses: List[str] 15 | next_response: str 16 | answer: Optional[str] 17 | is_human_response: bool 18 | is_solution: bool 19 | is_preferred_response: bool 20 | # -1, 0, 1 21 | rating: int 22 | 23 | class GiveUp(Exception): ... 24 | 25 | class Malformed(Exception): ... 
26 | 27 | answer_section = '# Answer\n\n' 28 | answer_delimiter = f'\n\n{answer_section}' 29 | 30 | def make_telescoping_conversation(conversation: PRMRecord) -> Generator[Sample, None, None]: 31 | instruction: str = conversation['question']['problem'] 32 | steps: List[str] = [] 33 | for step in conversation['label']['steps']: 34 | is_last: bool = step is conversation['label']['steps'][-1] 35 | is_solution: bool = is_last and conversation['label']['finish_reason'] == 'solution' 36 | if step['chosen_completion'] is None and step['human_completion'] is None: 37 | if conversation['label']['finish_reason'] in {'give_up', 'found_error'}: 38 | raise GiveUp 39 | else: 40 | raise Malformed(f"No solution was chosen, and yet finish_reason was neither 'give_up' nor 'found_error', finish_reason was: {conversation['label']['finish_reason']}") 41 | preferred_completion: Completion = step['human_completion'] if step['chosen_completion'] is None else step['completions'][step['chosen_completion']] 42 | is_human_response: bool = preferred_completion is step['human_completion'] 43 | completion_text: str = preferred_completion['text'] 44 | if is_solution: 45 | if completion_text.startswith(answer_section): 46 | next_response, answer = '', completion_text[len(answer_section):] 47 | else: 48 | if answer_delimiter not in completion_text: 49 | raise Malformed(f'answer_delimiter <{answer_delimiter}> not detected in completion: <{completion_text}>') 50 | next_response, answer = completion_text.split(answer_delimiter, maxsplit=1) 51 | else: 52 | next_response, answer = completion_text, None 53 | steps.append(next_response) 54 | yield Sample( 55 | instruction=instruction, 56 | responses=steps[:-1], 57 | next_response=next_response, 58 | answer=answer, 59 | is_human_response=is_human_response, 60 | ) 61 | 62 | def make_critiques(conversation: PRMRecord) -> Generator[CritiqueSample, None, None]: 63 | instruction: str = conversation['question']['problem'] 64 | steps: List[str] = [] 65 | for step 
in conversation['label']['steps']: 66 | is_last: bool = step is conversation['label']['steps'][-1] 67 | has_solution: bool = is_last and conversation['label']['finish_reason'] == 'solution' 68 | preferred_completion: Completion = step['human_completion'] if step['chosen_completion'] is None else step['completions'][step['chosen_completion']] 69 | if preferred_completion is not None: 70 | preferred_completion_text: str = preferred_completion['text'] 71 | steps.append(preferred_completion_text) 72 | completions: List[Completion] = step['completions'] if step['human_completion'] is None else [ 73 | *step['completions'], 74 | step['human_completion'] 75 | ] 76 | if not completions: 77 | raise Malformed('No completions offered for this step') 78 | for completion in completions: 79 | completion_text: str = completion['text'] 80 | if answer_delimiter in completion_text: 81 | next_response, answer = completion_text.split(answer_delimiter, maxsplit=1) 82 | else: 83 | next_response, answer = completion_text, None 84 | is_human_response = completion is step['human_completion'] 85 | yield CritiqueSample( 86 | instruction=instruction, 87 | responses=steps[:-1], 88 | next_response=next_response, 89 | answer=answer, 90 | is_human_response=is_human_response, 91 | is_solution=has_solution and completion is preferred_completion, 92 | is_preferred_response=completion is preferred_completion, 93 | rating=completion['rating'], 94 | ) 95 | 96 | 97 | def get_final_sample(samples: Iterable[Sample]) -> Sample: 98 | last_element, = deque(samples, 1) 99 | return last_element 100 | -------------------------------------------------------------------------------- /src/prm800k_record.py: -------------------------------------------------------------------------------- 1 | from typing import TypedDict, Optional, Any, Literal, List, TypeAlias 2 | 3 | FinishReason: TypeAlias = Literal['solution', 'give_up', 'found_error'] 4 | 5 | class Question(TypedDict): 6 | problem: str 7 | ground_truth_answer: 
str 8 | 9 | class HumanCompletion(TypedDict): 10 | text: str 11 | rating: None 12 | source: Literal['human'] 13 | flagged: bool 14 | 15 | class Completion(TypedDict): 16 | text: str 17 | rating: int 18 | flagged: bool 19 | 20 | class Step(TypedDict): 21 | completions: List[Completion] 22 | human_completion: Optional[HumanCompletion] 23 | chosen_completion: Optional[int] 24 | 25 | class Label(TypedDict): 26 | finish_reason: FinishReason 27 | total_time: int 28 | steps: List[Step] 29 | 30 | class PRMRecord(TypedDict): 31 | labeler: str 32 | timestamp: str 33 | generation: Optional[Any] 34 | is_quality_control_question: bool 35 | is_initial_screening_question: bool 36 | question: Question 37 | label: Label 38 | --------------------------------------------------------------------------------