├── .gitignore ├── .vscode ├── launch.json └── settings.json ├── LICENSE ├── README.md ├── prm800k ├── .gitignore └── README.md ├── requirements.txt ├── scripts ├── convert.py └── read.py └── src ├── convert.py └── prm800k_record.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | .DS_Store 3 | /out/ -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | "configurations": [ 3 | { 4 | "name": "Python: Convert", 5 | "type": "python", 6 | "request": "launch", 7 | "module": "scripts.convert", 8 | "justMyCode": false 9 | }, 10 | { 11 | "name": "Python: Read", 12 | "type": "python", 13 | "request": "launch", 14 | "module": "scripts.read", 15 | "justMyCode": false 16 | } 17 | ] 18 | } -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.analysis.extraPaths": ["src"], 3 | "editor.tabSize": 4, 4 | "editor.insertSpaces": true 5 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Scott Logic Ltd 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial 
portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # prm800k-denorm 2 | 3 | This repository is home to a script for converting OpenAI's [PRM800K](https://github.com/openai/prm800k/tree/main) process supervision dataset to a denormalized format in `.parquet` file type, for easier consumption. 4 | 5 | Datasets converted with this tool have been published at [huggingface.co/sl-alex](https://huggingface.co/sl-alex). 6 | 7 | ## Why would I want process supervision? 8 | 9 | See OpenAI's [blog post](https://openai.com/research/improving-mathematical-reasoning-with-process-supervision) and "Let's Verify Step by Step" [paper](https://arxiv.org/abs/2305.20050), but essentially it helps to improve a language model's reasoning and human alignment. 10 | 11 | If you only reward a language model for correct answers, then incentivizes "reward hacking" — reaching correct conclusions via incorrect reasoning. This results in a poor underlying understanding, which fails to generalize, and gives rise to logical mistakes (often called "hallucinations"). 12 | 13 | So, rather than the usual "outcome supervision": let's try "process supervision": reward the language model for making good _progress_ towards solutions. This will improve its reasoning. 
14 | We encourage this, by teaching it to complete maths problems step-by-step, showing its workings. 15 | 16 | Our goal with this repository is to give you access to a step-by-step dataset, so that you can train your language model in this way. 17 | 18 | The dataset is semantic, so that you can template it into your prompt style however works best for you. 19 | 20 | ## Dataset types 21 | 22 | In total, the datasets are: 23 | 24 | - [Solutions only](https://huggingface.co/datasets/sl-alex/openai-prm800k-solutions-only) 25 | - [Stepwise best](https://huggingface.co/datasets/sl-alex/openai-prm800k-stepwise-best) 26 | - [Stepwise critic](https://huggingface.co/datasets/sl-alex/openai-prm800k-stepwise-critic) 27 | 28 | _You probably just want one of the three combined datasets rather than one of the twelve per-phase splits._ 29 | 30 | ## Loading a dataset 31 | 32 | You can load a dataset and iterate through it, like so: 33 | 34 | ```python 35 | from datasets import load_dataset, DatasetDict, Dataset 36 | # dd: DatasetDict = load_dataset("sl-alex/openai-prm800k-solutions-only") 37 | # dd: DatasetDict = load_dataset("sl-alex/openai-prm800k-stepwise-best") 38 | dd: DatasetDict = load_dataset("sl-alex/openai-prm800k-stepwise-critic") 39 | 40 | d: Dataset = dd['train'] 41 | 42 | prompt_template = """Below is an instruction that describes a task. Write a response that appropriately completes the request. Show your reasoning step-by-step, and indicate your final answer under a heading titled Answer. 
43 | 44 | ### Instruction: 45 | {instruction} 46 | 47 | ### Response: 48 | {response_history}""" 49 | 50 | answer_template = """{response} 51 | 52 | # Answer: 53 | {answer}""" 54 | 55 | for record in zip( 56 | d['instruction'], 57 | d['responses'], 58 | d['next_response'], 59 | d['answer'], 60 | d['is_human_response'], 61 | d['is_solution'], 62 | d['is_preferred_response'], 63 | d['rating'] 64 | ): 65 | instruction, responses, next_response, answer, is_human_response, is_solution, is_preferred_response, rating = record 66 | prompt = prompt_template.format( 67 | instruction=instruction, 68 | response_history=''.join((f'{response}\n' for response in responses)), 69 | ) 70 | completion=next_response if answer is None else answer_template.format(response=next_response, answer=answer) 71 | print(f'Prompt:\n<{prompt}>') 72 | print(f'Completion:\n<{completion}>') 73 | ``` 74 | 75 | It should print something like: 76 | 77 | ``` 78 | Prompt: 79 | 86 | Completion: 87 | <7.8 minutes is the same as 7 minutes and 0.8 minutes.> 88 | ``` 89 | 90 | ## Dataset description 91 | 92 | ### Solutions only 93 | 94 | ``` 95 | # the problem statement (e.g. "what is x, given 2x = 1?") 96 | instruction: str 97 | 98 | # history of responses that the bot emitted before reaching this final step 99 | responses: List[str] 100 | 101 | # response emitted for the final step in the conversation. 102 | # it is accompanied by an "answer" subsection, which we capture separately 103 | next_response: str 104 | 105 | # answer subsection of the final response 106 | answer: str 107 | ``` 108 | 109 | Only conversations which achieved `"finish_reason": "solution"` are retained. 110 | 111 | We provide only one record per conversation — the final step. 112 | This record contains a list of all the steps which led to that final step. So you have the whole conversation available. 
113 | 114 | You could use this to create the following kind of training data: 115 | 116 | - **Source**: initial instruction 117 | - **Target**: all steps of the conversation, including the final step and answer 118 | 119 | #### Usage 120 | 121 | Example of how to take this sample and turn it into a source->target training pair: 122 | 123 | sample: 124 | ``` 125 | instruction: "What is x, given 2x + 1 = 2?" 126 | 127 | responses: [ 128 | "Okay, let's first rearrange the equation to isolate the x.", 129 | "2x = 2 - 1, which simplifes to 2x = 1", 130 | "Now that the x term is isolated, let's divide both sides to eliminate its coefficient." 131 | ] 132 | 133 | next_response: "x = 1/2" 134 | 135 | answer: "0.5" 136 | ``` 137 | 138 | Example Alpaca-style source prompt: 139 | 140 | ``` 141 | Below is an instruction that describes a task. Write responses which progress toward a solution to the request. Indicate your final answer under a heading Final Answer. 142 | 143 | ### Instruction: 144 | What is x, given 2x + 1 = 2? 145 | 146 | ### Response: 147 | Okay, let's first rearrange the equation to isolate the x. 148 | 149 | ### Response: 150 | 2x = 2 - 1, which simplifes to 2x = 1 151 | 152 | ### Response: 153 | Now that the x term is isolated, let's divide both sides to eliminate its coefficient. 154 | 155 | ### Response: 156 | ``` 157 | 158 | Example Alpaca-style target prompt: 159 | 160 | ``` 161 | x = 1/2 162 | 163 | # Answer 164 | 0.5 165 | ``` 166 | 167 | ### Stepwise best 168 | 169 | ``` 170 | # the problem statement (e.g. "what is x, given 2x = 1?") 171 | instruction: str 172 | 173 | # any responses that were emitted in prior conversation steps 174 | # empty list means it's the first conversation turn 175 | responses: List[str] 176 | 177 | # the response emitted for the current step in the conversation 178 | # if this is the final response in the conversation: the response may be accompanied by 179 | # an answer subsection. 
we separate this out into the `answer` field below. 180 | next_response: str 181 | 182 | # usually None, but if this is filled in: the final response of the conversation has occurred, 183 | # and has a subsection declaring its overall answer. this field captures that overall answer. 184 | answer: Optional[str] 185 | 186 | # often, there are conversation steps in which the human evaluator had to intervene, 187 | # as no generated response was satisfactory. 188 | is_human_response: bool 189 | ``` 190 | 191 | We provide a record per productive conversation turn. 192 | Whichever is the preferred completion for that conversation turn (i.e. the `chosen_completion` if a bot response was preferred, or a `human_completion` otherwise), is the one we use. 193 | 194 | You could use this to create the following kind of training data: 195 | 196 | **Source**: initial instruction + 0-to-many steps of conversation so far 197 | **Target**: next step of conversation (doesn't necessarily get you to a complete solution) 198 | 199 | #### Usage 200 | 201 | Example of how to take this sample and turn it into a source->target training pair: 202 | 203 | sample: 204 | ``` 205 | instruction: "What is x, given 2x + 1 = 2?" 206 | 207 | responses: [ 208 | "Okay, let's first rearrange the equation to isolate the x.", 209 | "2x = 2 - 1, which simplifes to 2x = 1", 210 | ] 211 | 212 | next_response: "Now that the x term is isolated, let's divide both sides to eliminate its coefficient." 213 | 214 | answer: None 215 | 216 | is_human_response: False 217 | ``` 218 | 219 | Example Alpaca-style source prompt: 220 | 221 | ``` 222 | Below is an instruction that describes a task. Write responses which progress toward a solution to the request. Indicate your final answer under a heading Final Answer. 223 | 224 | ### Instruction: 225 | What is x, given 2x + 1 = 2? 226 | 227 | ### Response: 228 | Okay, let's first rearrange the equation to isolate the x. 

### Response:
2x = 2 - 1, which simplifies to 2x = 1

### Response:
```

Example Alpaca-style target prompt:

```
Now that the x term is isolated, let's divide both sides to eliminate its coefficient.
```

Notice how this time there was no "answer" subsection, because this conversation step does not propose a solution. It can happen though, so check whether answer is `None`.

### Stepwise critique

```
# the problem statement (e.g. "what is x, given 2x = 1?")
instruction: str

# any responses that were emitted in prior conversation steps
# empty list means it's the first conversation turn
responses: List[str]

# the response emitted for the current step in the conversation
# if this is the final response in the conversation: the response may be accompanied by
# an answer subsection. we separate this out into the `answer` field below.
next_response: str

# usually None, but if this is filled in: the final response of the conversation has occurred,
# and has a subsection declaring its overall answer. this field captures that overall answer.
answer: Optional[str]

# often, there are conversation steps in which the human evaluator had to intervene,
# as no generated response was satisfactory.
is_human_response: bool

# whether an answer was provided **and** was correct
is_solution: bool

# tells us if this completion was rated as the best option by the human evaluator
is_preferred_response: bool

# -1 = counterproductive
# 0 = unproductive
# 1 = productive
# None = human answer (you can treat this as 1)
rating: Optional[int]
```

We provide a record per proposed completion, per conversation turn.
This includes human ratings.

You could use this to create the following kind of training data:

**Source**: initial instruction + 0-to-many steps of conversation so far
**Target**: a possible completion and rating

The data here includes both good and bad rated samples.

You could just keep the good ones. That would be similar to what Stepwise best gives you, but instead of "best" it gives you a wider "any answer which got a good rating, even if it wasn't the best one". So, more data (lower quality).

But an even better usage is to use this to train a critic. @crowsonkb has demonstrated that if you have a classifier, you can [guide your sampling](https://gist.github.com/crowsonkb/af6135392cc1627f40b03456aa90810c) of the language model, generating a few candidate next-tokens and picking the one that your classifier prefers.
So you could use this to train a critic, to guide another language model to employ more reasoning in its responses.

### Usage

You'd format this the same as stepwise best.

## Setup

_Note: if you're happy with the [published datasets](https://huggingface.co/sl-alex), then there's no need for you to get this repository set up yourself — these Setup instructions are for tinkerers who wish to try the export themselves, or change the format_.

### Get OpenAI data

Copy the PRM800K and MATH datasets into the `prm800k` directory, in accordance with [`prm800k/README.md`](prm800k/README.md).

### Install Python dependencies

```bash
pip install -r requirements.txt
```

### Run the converter

```bash
python -m scripts.convert
```

This should output some .parquet files to a directory `out`, at the root of the repository.
320 | -------------------------------------------------------------------------------- /prm800k/.gitignore: -------------------------------------------------------------------------------- 1 | /data/ 2 | /math_splits/ -------------------------------------------------------------------------------- /prm800k/README.md: -------------------------------------------------------------------------------- 1 | Please copy into this directory, some resources from [`openai/prm800k`](https://github.com/openai/prm800k): 2 | 3 | - [`data/`](https://github.com/openai/prm800k/tree/main/prm800k/data) 4 | - [`math_splits/`](https://github.com/openai/prm800k/tree/main/prm800k/math_splits) -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pyarrow 2 | pandas -------------------------------------------------------------------------------- /scripts/convert.py: -------------------------------------------------------------------------------- 1 | from os import makedirs, listdir 2 | from os.path import dirname, realpath, exists 3 | from pathlib import Path 4 | import pyarrow as pa 5 | from pyarrow.parquet import ParquetWriter 6 | import json 7 | from typing import Iterable, NamedTuple, List, Optional 8 | from logging import getLogger, Logger 9 | from src.prm800k_record import PRMRecord 10 | from src.convert import make_telescoping_conversation, make_critiques, Sample, GiveUp, Malformed 11 | import fnmatch 12 | 13 | logger: Logger = getLogger(__file__) 14 | 15 | stepwise_schema = pa.schema([ 16 | ('instruction', pa.string()), 17 | ('responses', pa.list_(pa.string())), 18 | ('next_response', pa.string()), 19 | pa.field('answer', pa.string(), nullable=True), 20 | ('is_human_response', pa.bool_()), 21 | ]) 22 | 23 | answer_only_schema = pa.schema([ 24 | ('instruction', pa.string()), 25 | ('responses', pa.list_(pa.string())), 26 | ('next_response', pa.string()), 
27 | pa.field('answer', pa.string(), nullable=True), 28 | ]) 29 | 30 | critique_schema = pa.schema([ 31 | ('instruction', pa.string()), 32 | ('responses', pa.list_(pa.string())), 33 | ('next_response', pa.string()), 34 | pa.field('answer', pa.string(), nullable=True), 35 | ('is_human_response', pa.bool_()), 36 | ('is_solution', pa.bool_()), 37 | ('is_preferred_response', pa.bool_()), 38 | ('rating', pa.int8()), 39 | ]) 40 | 41 | class StepwiseBatch(NamedTuple): 42 | instructions: List[str] 43 | response_lists: List[List[str]] 44 | next_responses: List[str] 45 | answers: List[bool] 46 | is_human_response: List[bool] 47 | 48 | class AnswerOnlyBatch(NamedTuple): 49 | instructions: List[str] 50 | response_lists: List[List[str]] 51 | next_responses: List[str] 52 | answers: List[bool] 53 | 54 | class CritiqueBatch(NamedTuple): 55 | instructions: List[str] 56 | response_lists: List[List[str]] 57 | next_responses: List[str] 58 | answers: List[bool] 59 | is_human_response: List[bool] 60 | is_solution: List[bool] 61 | is_preferred_response: List[bool] 62 | ratings: List[int] 63 | 64 | if __name__ == '__main__': 65 | script_dir = Path(dirname(realpath(__file__))) 66 | repo_root: Path = script_dir.parent 67 | out_dir = repo_root.joinpath('out') 68 | makedirs(out_dir, exist_ok=True) 69 | 70 | data_dir: Path = repo_root.joinpath('prm800k/data') 71 | assert exists(data_dir), 'Expected dir prm800k/data to exist -- you are expected to copy this in yourself from https://github.com/Openai/Prm800k. See prm800k/README.md for details.' 
72 | 73 | for data_file in fnmatch.filter(listdir(data_dir), '*.jsonl'): 74 | data_stem = Path(data_file).stem 75 | in_path_jsonl: Path = data_dir.joinpath(data_file) 76 | 77 | out_all: Path = out_dir.joinpath(f'{data_stem}.stepwise-best.parquet') 78 | out_all.unlink(missing_ok=True) 79 | out_answer_only: Path = out_dir.joinpath(f'{data_stem}.solutions-only.parquet') 80 | out_answer_only.unlink(missing_ok=True) 81 | out_critique: Path = out_dir.joinpath(f'{data_stem}.stepwise-critique.parquet') 82 | out_critique.unlink(missing_ok=True) 83 | 84 | with (open(in_path_jsonl, 'r') as file, 85 | ParquetWriter(str(out_all), schema=stepwise_schema) as stepwise_writer, 86 | ParquetWriter(str(out_answer_only), schema=answer_only_schema) as answer_only_writer, 87 | ParquetWriter(str(out_critique), schema=critique_schema) as out_critique_writer): 88 | for line_ix, line in enumerate(file.readlines()): 89 | js: PRMRecord = json.loads(line) 90 | 91 | samples: Iterable[Sample] = make_telescoping_conversation(js) 92 | batch = StepwiseBatch([], [], [], [], []) 93 | instructions, response_lists, next_responses, answers, is_human_responses = batch 94 | final_sample: Optional[Sample] = None 95 | try: 96 | for sample_ix, sample in enumerate(samples): 97 | instruction, responses, next_response, answer, is_human_response = sample 98 | instructions.append(instruction) 99 | response_lists.append(responses) 100 | next_responses.append(next_response) 101 | answers.append(answer) 102 | is_human_responses.append(is_human_response) 103 | if answer is not None: 104 | final_sample = sample 105 | except GiveUp as e: 106 | logger.warning(f'[BestResponseOnly] Record at line {line_ix} gave up, at conversation step {sample_ix+1}') 107 | except Malformed as e: 108 | logger.warning(f'[BestResponseOnly] Record at line {line_ix} was malformed, at conversation step {sample_ix+1}, error was: {e.args[0]}') 109 | 110 | if batch.instructions: 111 | table = pa.Table.from_arrays(list(batch), 
schema=stepwise_schema) 112 | stepwise_writer.write_table(table) 113 | 114 | if final_sample is not None: 115 | instruction, responses, next_response, answer, _ = final_sample 116 | batch = AnswerOnlyBatch([instruction], [responses], [next_response], [answer]) 117 | table = pa.Table.from_arrays(list(batch), schema=answer_only_schema) 118 | answer_only_writer.write_table(table) 119 | 120 | samples: Iterable[Sample] = make_critiques(js) 121 | batch = CritiqueBatch([], [], [], [], [], [], [], []) 122 | instructions, response_lists, next_responses, answers, is_human_responses, is_solutions, is_preferred_responses, ratings = batch 123 | try: 124 | for sample_ix, sample in enumerate(samples): 125 | instruction, responses, next_response, answer, is_human_response, is_solution, is_preferred_response, rating = sample 126 | instructions.append(instruction) 127 | response_lists.append(responses) 128 | next_responses.append(next_response) 129 | answers.append(answer) 130 | is_human_responses.append(is_human_response) 131 | is_solutions.append(is_solution) 132 | is_preferred_responses.append(is_preferred_response) 133 | ratings.append(rating) 134 | except Malformed as e: 135 | logger.warning(f'[Critic] Record at line {line_ix} was malformed, at conversation step {sample_ix+1}, error was: {e.args[0]}') 136 | table = pa.Table.from_arrays(list(batch), schema=critique_schema) 137 | out_critique_writer.write_table(table) -------------------------------------------------------------------------------- /scripts/read.py: -------------------------------------------------------------------------------- 1 | from pandas import read_parquet, DataFrame 2 | from os.path import dirname, realpath 3 | from pathlib import Path 4 | 5 | if __name__ == '__main__': 6 | script_dir = Path(dirname(realpath(__file__))) 7 | repo_root: Path = script_dir.parent 8 | out_dir = repo_root.joinpath('out') 9 | 10 | # df: DataFrame = read_parquet(str(out_dir.joinpath('phase1_train.parquet'))) 11 | # df: DataFrame 
= read_parquet(str(out_dir.joinpath('phase1_train.answer_only.parquet'))) 12 | df: DataFrame = read_parquet(str(out_dir.joinpath('phase1_train.critique.parquet'))) 13 | pass # somewhere to put a breakpoint 14 | -------------------------------------------------------------------------------- /src/convert.py: -------------------------------------------------------------------------------- 1 | from typing import Generator, List, Iterable, NamedTuple, Optional 2 | from src.prm800k_record import PRMRecord, Completion, FinishReason 3 | from collections import deque 4 | 5 | class Sample(NamedTuple): 6 | instruction: str 7 | responses: List[str] 8 | next_response: str 9 | answer: Optional[str] 10 | is_human_response: bool 11 | 12 | class CritiqueSample(NamedTuple): 13 | instruction: str 14 | responses: List[str] 15 | next_response: str 16 | answer: Optional[str] 17 | is_human_response: bool 18 | is_solution: bool 19 | is_preferred_response: bool 20 | # -1, 0, 1 21 | rating: int 22 | 23 | class GiveUp(Exception): ... 24 | 25 | class Malformed(Exception): ... 
26 | 27 | answer_section = '# Answer\n\n' 28 | answer_delimiter = f'\n\n{answer_section}' 29 | 30 | def make_telescoping_conversation(conversation: PRMRecord) -> Generator[Sample, None, None]: 31 | instruction: str = conversation['question']['problem'] 32 | steps: List[str] = [] 33 | for step in conversation['label']['steps']: 34 | is_last: bool = step is conversation['label']['steps'][-1] 35 | is_solution: bool = is_last and conversation['label']['finish_reason'] == 'solution' 36 | if step['chosen_completion'] is None and step['human_completion'] is None: 37 | if conversation['label']['finish_reason'] in {'give_up', 'found_error'}: 38 | raise GiveUp 39 | else: 40 | raise Malformed(f"No solution was chosen, and yet finish_reason was neither 'give_up' nor 'found_error', finish_reason was: {conversation['label']['finish_reason']}") 41 | preferred_completion: Completion = step['human_completion'] if step['chosen_completion'] is None else step['completions'][step['chosen_completion']] 42 | is_human_response: bool = preferred_completion is step['human_completion'] 43 | completion_text: str = preferred_completion['text'] 44 | if is_solution: 45 | if completion_text.startswith(answer_section): 46 | next_response, answer = '', completion_text[len(answer_section):] 47 | else: 48 | if answer_delimiter not in completion_text: 49 | raise Malformed(f'answer_delimiter <{answer_delimiter}> not detected in completion: <{completion_text}>') 50 | next_response, answer = completion_text.split(answer_delimiter, maxsplit=1) 51 | else: 52 | next_response, answer = completion_text, None 53 | steps.append(next_response) 54 | yield Sample( 55 | instruction=instruction, 56 | responses=steps[:-1], 57 | next_response=next_response, 58 | answer=answer, 59 | is_human_response=is_human_response, 60 | ) 61 | 62 | def make_critiques(conversation: PRMRecord) -> Generator[CritiqueSample, None, None]: 63 | instruction: str = conversation['question']['problem'] 64 | steps: List[str] = [] 65 | for step 
in conversation['label']['steps']: 66 | is_last: bool = step is conversation['label']['steps'][-1] 67 | has_solution: bool = is_last and conversation['label']['finish_reason'] == 'solution' 68 | preferred_completion: Completion = step['human_completion'] if step['chosen_completion'] is None else step['completions'][step['chosen_completion']] 69 | if preferred_completion is not None: 70 | preferred_completion_text: str = preferred_completion['text'] 71 | steps.append(preferred_completion_text) 72 | completions: List[Completion] = step['completions'] if step['human_completion'] is None else [ 73 | *step['completions'], 74 | step['human_completion'] 75 | ] 76 | if not completions: 77 | raise Malformed('No completions offered for this step') 78 | for completion in completions: 79 | completion_text: str = completion['text'] 80 | if answer_delimiter in completion_text: 81 | next_response, answer = completion_text.split(answer_delimiter, maxsplit=1) 82 | else: 83 | next_response, answer = completion_text, None 84 | is_human_response = completion is step['human_completion'] 85 | yield CritiqueSample( 86 | instruction=instruction, 87 | responses=steps[:-1], 88 | next_response=next_response, 89 | answer=answer, 90 | is_human_response=is_human_response, 91 | is_solution=has_solution and completion is preferred_completion, 92 | is_preferred_response=completion is preferred_completion, 93 | rating=completion['rating'], 94 | ) 95 | 96 | 97 | def get_final_sample(samples: Iterable[Sample]) -> Sample: 98 | last_element, = deque(samples, 1) 99 | return last_element 100 | -------------------------------------------------------------------------------- /src/prm800k_record.py: -------------------------------------------------------------------------------- 1 | from typing import TypedDict, Optional, Any, Literal, List, TypeAlias 2 | 3 | FinishReason: TypeAlias = Literal['solution', 'give_up', 'found_error'] 4 | 5 | class Question(TypedDict): 6 | problem: str 7 | ground_truth_answer: 
str 8 | 9 | class HumanCompletion(TypedDict): 10 | text: str 11 | rating: None 12 | source: Literal['human'] 13 | flagged: bool 14 | 15 | class Completion(TypedDict): 16 | text: str 17 | rating: int 18 | flagged: bool 19 | 20 | class Step(TypedDict): 21 | completions: List[Completion] 22 | human_completion: Optional[HumanCompletion] 23 | chosen_completion: Optional[int] 24 | 25 | class Label(TypedDict): 26 | finish_reason: FinishReason 27 | total_time: int 28 | steps: List[Step] 29 | 30 | class PRMRecord(TypedDict): 31 | labeler: str 32 | timestamp: str 33 | generation: Optional[Any] 34 | is_quality_control_question: bool 35 | is_initial_screening_question: bool 36 | question: Question 37 | label: Label 38 | --------------------------------------------------------------------------------