├── synthesize_deep_reasoning
    ├── synthesize.sh
    ├── server.sh
    ├── config.yaml
    ├── helper.py
    ├── model_utils.py
    └── synthesize.py
├── LICENSE
├── README.md
└── .gitignore


/synthesize_deep_reasoning/synthesize.sh:
--------------------------------------------------------------------------------
1 | set -ex
2 | export HOST_IP=0.0.0.0
3 | model2=${model2:-""}
4 | cd ${workdir}
5 | python synthesize.py --port $port --rank ${rank} --total-ranks ${total} --config_file $cname --model $model --posterior_model $model2 # port rank total
6 | 


--------------------------------------------------------------------------------
/synthesize_deep_reasoning/server.sh:
--------------------------------------------------------------------------------
1 | 
2 | set -ex
3 | export HOST_IP=0.0.0.0
4 | cname=/path/to/config.yaml
5 | 
6 | CUDA_VISIBLE_DEVICES=0,1,2,3 python -m vllm.entrypoints.openai.api_server --model $model --port $port --disable-log-requests --max-model-len 32000 -tp 4 --gpu-memory-utilization 0.6 --trust-remote-code &  
7 | CUDA_VISIBLE_DEVICES=4,5,6,7 python -m vllm.entrypoints.openai.api_server --model $model2 --port 2$port --disable-log-requests --max-model-len 32000 -tp 2 --gpu-memory-utilization 0.6 --trust-remote-code &
8 | 


--------------------------------------------------------------------------------
/synthesize_deep_reasoning/config.yaml:
--------------------------------------------------------------------------------
 1 | input:
 2 |   file_path: '/path/to/QAcollection.json'
 3 | 
 4 | output:
 5 |   file_prefix: '/path/to/output/file/folder/'
 6 | 
 7 | processing:
 8 |   stop_thresh: 0.25
 9 |   max_step: 10
10 |   num_rollouts: 1
11 |   num_expansion: 2
12 | 
13 | model:
14 |   # supported model_types: "hf", "openai", "anthropic", "vllm"
15 |   model_type: "vllm_server"
16 |   model_name: "/path/to/Qwen2___5-32B-Instruct"
17 |   model_url: "http://127.0.0.1:8701/v1/completions"
18 |   model_args:
19 |     beamsearch: 0
20 |     port: 5757
21 |     max_tokens: 8000
22 |     top_k: 40
23 |     top_p: 0.85
24 |     temperature_range: [0.8, 0.8]
25 |   prompt_type: "tokenizer"
26 | 
27 | judge_model:
28 |   # supported model_types: "hf", "openai", "anthropic", "vllm"
29 |   use: true
30 |   model_type: "vllm_server"
31 |   model_name: "/path/to/model/for/perplexity"
32 |   model_url: "http://127.0.0.1:8701/v1/completions"
33 |   model_args:
34 |     beamsearch: 0
35 |     port: 5757
36 |     max_tokens: 100
37 |     top_k: -1
38 |     top_p: 1
39 |     temperature_range: [1.0, 1.0]
40 |   prompt_type: "tokenizer"
41 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2025 TIGER Lab
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # REverse-Engineered Reasoning for Open-Ended Generation
 2 | The official code of "REverse-Engineered Reasoning for Open-Ended Generation"
 3 | 
 4 | ## Release Progress
 5 | - [x] Deep Reasoning Synthesis
 6 | 
 7 | - [ ] evaluation
 8 | 
 9 | - [ ] VeRL based Distributed SFT training
10 | 
11 | ## Synthesis of Deep Reasoning
12 | cd folder: `synthesize_deep_reasoning`
13 | 
14 | - **Step 0: Update the config.**
15 | 
16 |   check `config.yaml`:
17 |     ```
18 |       stop_thresh: 0.25 # PPL stopping criterion
19 |       max_step: 10 # max-step stopping criterion
20 |       num_rollouts: 1 # num initial thinking rollouts each query, not tested
21 |       num_expansion: 2 # num expanded node for each segment edits
22 |       file_path: '/path/to/QAcollection.json'
23 |       file_prefix: '/path/to/output/file/folder/'
24 |     ```
25 |   Json format: a list of dicts, where each dict has three keys, `question`, `solution`, `index`
26 | 
27 | - **Step 1: Start the vLLM server.**
28 |   ```bash 
29 |   export model=/path/to/generator
30 |   export model2=/path/to/basemodel/for/PPL
31 |   bash server.sh
32 |   ```
33 |   We use Qwen2.5-32B-Instruct as the generator, and Qwen3-8B-Base as the model for computing perplexity. We find it faster if we amortize the PPL computation to a smaller model. 
34 | 
35 | - **Step 2: Run the Deep Reasoning Synthesis with Ray-scheduled multi-workers.**
36 |   ```
37 |   export workdir=${pwd}
38 |   export model=/path/to/generator
39 |   export model2=/path/to/basemodel/for/PPL
40 |   export port=2233
41 |   export rank=0 
42 |   export total=1
43 |   export cname=/path/to/config
44 |   bash synthesis.sh
45 |   ```
46 |   The synthesized trajectories will be dumped to the `file_prefix` path. 
47 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | share/python-wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | MANIFEST
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .nox/
 43 | .coverage
 44 | .coverage.*
 45 | .cache
 46 | nosetests.xml
 47 | coverage.xml
 48 | *.cover
 49 | *.py,cover
 50 | .hypothesis/
 51 | .pytest_cache/
 52 | cover/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | .pybuilder/
 76 | target/
 77 | 
 78 | # Jupyter Notebook
 79 | .ipynb_checkpoints
 80 | 
 81 | # IPython
 82 | profile_default/
 83 | ipython_config.py
 84 | 
 85 | # pyenv
 86 | #   For a library or package, you might want to ignore these files since the code is
 87 | #   intended to run in multiple environments; otherwise, check them in:
 88 | # .python-version
 89 | 
 90 | # pipenv
 91 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 92 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 93 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 94 | #   install all needed dependencies.
 95 | #Pipfile.lock
 96 | 
 97 | # UV
 98 | #   Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
 99 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
100 | #   commonly ignored for libraries.
101 | #uv.lock
102 | 
103 | # poetry
104 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
106 | #   commonly ignored for libraries.
107 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108 | #poetry.lock
109 | 
110 | # pdm
111 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
112 | #pdm.lock
113 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
114 | #   in version control.
115 | #   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
116 | .pdm.toml
117 | .pdm-python
118 | .pdm-build/
119 | 
120 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
121 | __pypackages__/
122 | 
123 | # Celery stuff
124 | celerybeat-schedule
125 | celerybeat.pid
126 | 
127 | # SageMath parsed files
128 | *.sage.py
129 | 
130 | # Environments
131 | .env
132 | .venv
133 | env/
134 | venv/
135 | ENV/
136 | env.bak/
137 | venv.bak/
138 | 
139 | # Spyder project settings
140 | .spyderproject
141 | .spyproject
142 | 
143 | # Rope project settings
144 | .ropeproject
145 | 
146 | # mkdocs documentation
147 | /site
148 | 
149 | # mypy
150 | .mypy_cache/
151 | .dmypy.json
152 | dmypy.json
153 | 
154 | # Pyre type checker
155 | .pyre/
156 | 
157 | # pytype static type analyzer
158 | .pytype/
159 | 
160 | # Cython debug symbols
161 | cython_debug/
162 | 
163 | # PyCharm
164 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
165 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
166 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
167 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
168 | #.idea/
169 | 
170 | # Ruff stuff:
171 | .ruff_cache/
172 | 
173 | # PyPI configuration file
174 | .pypirc
175 | 


--------------------------------------------------------------------------------
/synthesize_deep_reasoning/helper.py:
--------------------------------------------------------------------------------
  1 | import yaml
  2 | 
  3 | class Node:
  4 |     def __init__(self, ref, raw_q, gold=None, info=None):
  5 |         if info is not None:
  6 |             self.memory = info 
  7 |         else: self.memory = {}
  8 |         self.memory.update(dict(q=raw_q, ref=ref,history=[]))
  9 |         self.ref = ref
 10 |         self.raw_q = raw_q
 11 |         
 12 | def segment_response(inp, sep='\n\n'):
 13 |     segments = []
 14 |     temp = ''
 15 |     i = 0
 16 |     n = len(inp)
 17 |     
 18 |     while i < n:
 19 |         if inp[i:i+2] == '\\[':
 20 |             # Start of a LaTeX block, collect until we find '\\]'
 21 |             j = i + 2
 22 |             while j < n and inp[j:j+2] != '\\]':
 23 |                 j += 1
 24 |             # Add the LaTeX block including '\\[' and '\\]'
 25 |             # segments.append(inp[i:j+2])
 26 |             temp += inp[i:j+2]
 27 |             i = j + 2  # Move past '\\]'
 28 |         elif inp[i:i+sep.count('\n')] == sep:
 29 |             segments.append(temp)
 30 |             temp = ''
 31 |             j = i+1
 32 |             while j<n and inp[j].strip()=='':
 33 |                 j += 1
 34 |             segments.append(inp[i:j])
 35 |             i = j
 36 |             
 37 |         else:
 38 |             # Add character to current segment
 39 |             temp += inp[i]
 40 |             i += 1
 41 |     
 42 |     # If there's any remaining text, add it
 43 |     if temp:
 44 |         segments.append(temp)
 45 |     # return segments
 46 |     
 47 |     final = []
 48 |     buffer = []
 49 |     for seg in segments:
 50 |         if seg.strip()=='': 
 51 |             buffer.append(seg)
 52 |         elif len(seg)<100: # if previous is quite short, append with it
 53 |             buffer.append(seg)
 54 |         else:
 55 |             if len(buffer)>0: 
 56 |                 prefix = "".join(buffer)
 57 |                 buffer = []
 58 |                 seg = prefix + seg 
 59 |             final.append(seg)
 60 |     if len(buffer)>0: 
 61 |         prefix = "".join(buffer)
 62 |         buffer = []
 63 |         if final: final[-1] += prefix
 64 |         else: final = [prefix]
 65 |     return final
 66 | 
 67 | def load_config(config_path):
 68 |     """
 69 |     Load configuration from a YAML file.
 70 | 
 71 |     Args:
 72 |         config_path (str): Path to the YAML configuration file.
 73 | 
 74 |     Returns:
 75 |         dict: A dictionary containing the configuration.
 76 |     """
 77 |     with open(config_path, "r") as file:
 78 |         return yaml.safe_load(file)
 79 | 
 80 | 
 81 | 
 82 | def make_prompt(tokenizer, messages):
 83 |     prompt = tokenizer.apply_chat_template(
 84 |         messages, tokenize=False, add_generation_prompt=True
 85 |     )
 86 |     # prompt = prompt.split('<think>')[0]
 87 |     if prompt.strip().endswith("<think>"):
 88 |         prompt = prompt.strip()[:-len("<think>")] # split('<think>')[0]
 89 |     return prompt
 90 | 
 91 | 
 92 | def breakdown_steps(a):
 93 |     steps = segment_response(a)
 94 |     if len(steps)<=1:
 95 |         steps = segment_response(a, '\n')
 96 |     final = ""
 97 |     allsteps = []
 98 |     for idx, step in enumerate(steps):
 99 |         allsteps.append(f"<Step{idx+1}> {step}\n")
100 |         final += allsteps[-1]
101 |     return final, allsteps, steps
102 | 
103 | 
104 | def segment_offsets(offsets, segments, logps):
105 |     """
106 |     Splits the 'offsets' list into sublists based on the cumulative sums of 'segments'.
107 | 
108 |     Args:
109 |         offsets (list[int]): A sorted list of integers.
110 |         segments (list[int]): A list of segment lengths.
111 | 
112 |     Returns:
113 |         list[list[int]]: A list of lists, where each sublist contains offsets
114 |                          belonging to the corresponding segment.
115 |     """
116 |     
117 |     # Calculate the cumulative sum of segments to get the upper bounds.
118 |     # The first element is the length of the first segment, the second is the
119 |     # sum of the first two, and so on.
120 |     segment_boundaries = np.cumsum(segments)
121 |     
122 |     # This will hold the final list of lists
123 |     result = []
124 |     
125 |     # Pointer for the current position in the offsets list
126 |     offset_idx = 0 
127 |     
128 |     # This will be the starting boundary for each segment's range.
129 |     # It starts at 0 for the first segment.
130 |     lower_bound = 0
131 | 
132 |     # Iterate through each segment's upper boundary
133 |     for upper_bound in segment_boundaries:
134 |         
135 |         # This sublist will store the offsets for the current segment
136 |         current_segment_offsets = []
137 |         
138 |         # Go through the offsets list starting from where we last left off
139 |         while offset_idx < len(offsets) and offsets[offset_idx] < upper_bound:
140 |             current_segment_offsets.append(offsets[offset_idx])
141 |             offset_idx += 1
142 |             
143 |         result.append(current_segment_offsets)
144 |         
145 |         # The next segment's range will start from the end of the current one
146 |         lower_bound = upper_bound
147 |         
148 |     token_belongs_to_segment = result 
149 |     seg_logp_list = [] # list of list of logps for each segment
150 |     cnt = 0
151 |     for seg_i, token_included in enumerate(token_belongs_to_segment):
152 |         num_tokens = len(token_included)
153 |         assert num_tokens>0
154 |         seg_logp_list.append(logps[cnt:cnt+num_tokens])
155 |         cnt += num_tokens
156 |     return seg_logp_list, token_belongs_to_segment
157 | 
158 | 
159 | def equals(a, b):
160 |     flag = a==b
161 |     if not flag:
162 |         flag = False 
163 |         try: 
164 |             xx = eval(a)
165 |             yy = eval(str(b))
166 |             flag = abs(xx-yy)<1e-4 
167 |         except:
168 |             pass 
169 |         
170 |     return flag 
171 | 


--------------------------------------------------------------------------------
/synthesize_deep_reasoning/model_utils.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | from transformers import AutoTokenizer, AutoModelForCausalLM
  3 | import random
  4 | import os
  5 | from transformers import set_seed
  6 | import requests
  7 | from concurrent.futures import ThreadPoolExecutor, as_completed
  8 | from tqdm import tqdm
  9 | from typing import List
 10 | import pdb
 11 | import copy
 12 | import time
 13 | import numpy as np
 14 | import json
 15 | # Set your Hugging Face token here
 16 | os.environ["HUGGINGFACE_HUB_TOKEN"] = "hf_yourkey"
 17 | 
 18 | # For reproducibility
 19 | SEED = 1234
 20 | set_seed(SEED)
 21 | random.seed(42)
 22 | 
 23 | class LM:
 24 |     def __init__(self, model_name: str = "Qwen/Qwen2.5-Math-7B-Instruct", model_type: str = "hf", model_url="", num_rollouts: int = 5, tokenizer=None, **model_args):
 25 |         self.model_type = model_type.lower()
 26 |         self.model_name = model_name
 27 |         
 28 |         self.max_tokens = model_args['max_tokens']
 29 |         self.temperature_range = model_args['temperature_range']
 30 |         self.topp = model_args['top_p']
 31 |         # self.topk = model_args['top_k']
 32 |         self.port = model_args.get("port", 0)
 33 |         self.do_bs = model_args.get('beamsearch', 0)
 34 |         self.url = model_url
 35 |         # if self.port!="0": 
 36 |         self.url = f"http://127.0.0.1:{self.port}/v1/completions"
 37 |         print(f"running server on {self.url}")
 38 |         self.num_rollouts = num_rollouts
 39 |         
 40 |         self.payload = {
 41 |             "model":self.model_name,
 42 |             "max_tokens": self.max_tokens,
 43 |             "top_p": self.topp,
 44 |             "n": self.num_rollouts
 45 |         }
 46 |         self.tokenizer = tokenizer
 47 |         
 48 |         self.__dict__.update(model_args)
 49 |         print("Updated model args:", self.__dict__)
 50 |         
 51 |         if self.model_type == "vllm":
 52 |             #raise NotImplementedError("VLLM is not implemented yet")
 53 |             from vllm import LLM, SamplingParams
 54 |             self.llm = LLM(model=model_name, enable_prefix_caching=True)
 55 |             self.SamplingParams = SamplingParams
 56 |         elif self.model_type == "hf":
 57 |             self.tokenizer = AutoTokenizer.from_pretrained(model_name)
 58 |             self.model = AutoModelForCausalLM.from_pretrained(
 59 |                 model_name, torch_dtype=torch.float16, device_map="cuda"
 60 |             )
 61 |         elif self.model_type == "openai":
 62 |             import openai
 63 |             self.client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
 64 |         elif self.model_type == "anthropic":
 65 |             import anthropic
 66 |             self.client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
 67 |         elif self.model_type == "vllm_server":
 68 |             pass
 69 |         else:
 70 |             raise ValueError("Invalid model_type. Choose 'vllm', 'hf', 'openai', or 'anthropic'.")
 71 |         
 72 |     def generate(self, prompt, num_rollouts=None, isgreedy=False, **kwargs):
 73 |         if num_rollouts is None:
 74 |             num_rollouts = self.num_rollouts
 75 |         if self.model_type == 'vllm_server':
 76 |             return self.generate_vllm_server(prompt, num_rollouts, isgreedy=isgreedy, **kwargs)
 77 |         elif self.model_type == "vllm":
 78 |             return self.generate_vllm(prompt, num_rollouts)
 79 |         elif self.model_type == "hf":
 80 |             return self.generate_hf(prompt, num_rollouts)
 81 |         elif self.model_type == "anthropic" or self.model_type == "openai":
 82 |             return self.generate_api(prompt, num_rollouts)
 83 | 
 84 |     def generate_hf(self, prompt, num_rollouts):
 85 |         inputs = self.tokenizer(prompt, return_tensors="pt").to('cuda')
 86 |         print(prompt)
 87 |         results = []
 88 |         for _ in range(num_rollouts):
 89 |             temperature = random.uniform(self.temperature_range[0], self.temperature_range[1])
 90 |             outputs = self.model.generate(
 91 |                 **inputs, do_sample=True, max_new_tokens=self.max_tokens, temperature=temperature,
 92 |                 num_return_sequences=1
 93 |             )
 94 |             generated_tokens = outputs[0][inputs['input_ids'].shape[1]:]
 95 |             result = self.tokenizer.decode(generated_tokens, skip_special_tokens=True)
 96 |             results.append(result)
 97 |         pdb.set_trace()
 98 |         return results
 99 | 
100 |     def generate_vllm(self, prompt, num_rollouts):
101 |         #raise NotImplementedError("VLLM is not implemented yet")
102 |         # print(prompt)
103 |         temperature = random.choice(self.temperature_range)
104 |         sampling_params = self.SamplingParams(
105 |             temperature=temperature,
106 |             top_k=self.topk,
107 |             top_p=self.topp, 
108 |             max_tokens=self.max_tokens,
109 |             n=num_rollouts,
110 |             seed=SEED,
111 |             # stop=['\n'],
112 |         )
113 |         st = time.time()
114 |         outputs = self.llm.generate(prompt, sampling_params)
115 |         ed = time.time()
116 |         print(f'{num_rollouts} responses Time taken: {ed-st}')
117 |         result = [completion.text for output in outputs for completion in output.outputs]
118 |         return result, temperature
119 |     
120 |     def generate_vllm_server(self, prompt, num_rollouts=None, isgreedy=False, special_stop=None, prompt_only=False):
121 |         
122 |         temperature = np.random.uniform(low=self.temperature_range[0], high=self.temperature_range[1])
123 |         # temperature = random.choice(self.temperature_range)
124 |         payload = copy.copy(self.payload)
125 |         
126 |         payload.update({
127 |             "temperature": temperature,
128 |             # "messages": [
129 |             #     {"role": "system", "content": systemprompt},
130 |             #     {"role": "user", "content": query}
131 |             # ],
132 |             # "max_tokens": 4096,
133 |             "prompt": prompt,
134 |             'logprobs': 1 if not isgreedy else 0,
135 |         })
136 |         if num_rollouts is not None:
137 |             payload['n'] = num_rollouts
138 |             
139 |         if isgreedy:
140 |             payload['top_k'] = 1
141 |             payload['top_p'] = 1
142 |             payload['temperature'] = 0
143 |             payload['n'] = 1
144 |         
145 |         if special_stop:
146 |             payload['stop'] = special_stop
147 |         
148 |         if prompt_only:
149 |             payload.update({
150 |                 'prompt': prompt, 
151 |                 'n': 1,
152 |                 'temperature': 1.0, 
153 |                 'prompt_logprobs': 1
154 |             })
155 |             
156 |         # print(f'===> submitting request @{self.url}')
157 |         response = requests.post(self.url,
158 |                              headers={"User-Agent": "Test Client"},
159 |                              json=payload,
160 |                              stream=False)
161 |         # {"object":"error","message":"[{'type': 'missing', 'loc': ('body', 'model'), 'msg': 'Field required'
162 |         if response.status_code == 200:
163 |             result = response.json()
164 |             # print(num_rollouts, "Generated Text:", result)
165 |         else:
166 |             result = dict(choices=[])
167 |             print(f"Error: {response.status_code}, {response.text}")
168 |             return ([],[], []), None
169 |             message = json.loads(response.text)['message']
170 |             
171 |             if 'context length' in message:
172 |                 encoded = self.tokenizer(prompt)
173 |                 q_tokens = len(encoded['input_ids'])
174 |                 a_tokens_max = 4096-5-q_tokens 
175 |                 payload['max_tokens'] = a_tokens_max
176 |                 
177 |                 response = requests.post(self.url,
178 |                     headers={"User-Agent": "Test Client"},
179 |                     json=payload,
180 |                     stream=False
181 |                 )
182 |                 
183 |                 if response.status_code == 200:
184 |                     result = response.json()
185 |                     # print("Generated Text:", result)
186 |                 else:
187 |                     result = dict(choices=[])
188 |                     print(f"Error: {response.status_code}, {response.text}")
189 |                 
190 |                 
191 |         result_ = [item['text'] for item in result['choices']]
192 |         logps_ = [item['prompt_logprobs'] for item in result['choices']] if prompt_only else [item['logprobs']['token_logprobs'] for item in result['choices']]
193 |         offsets_ = result if prompt_only else [item['logprobs']['text_offset'] for item in result['choices']]
194 |         # offsets_ = [item['prompt_logprobs'] for item in result['choices']]
195 |         
196 |         offsets_ = result
197 |         return (result_,logps_, offsets_), temperature
198 | 
199 |     def generate_vl_vllm_server(self, conversation, num_rollouts=None, isgreedy=False, special_stop=None, prompt_only=False):
200 |         
201 |         temperature = np.random.uniform(low=self.temperature_range[0], high=self.temperature_range[1])
202 |         # temperature = random.choice(self.temperature_range)
203 |         payload = copy.copy(self.payload)
204 |         
205 |         payload.update({
206 |             "temperature": temperature,
207 |             "messages": conversation,
208 |             # "max_tokens": 4096,
209 |             # "prompt": prompt,
210 |             'logprobs': 1 if not isgreedy else 0,
211 |         })
212 | 
213 |         if num_rollouts is not None:
214 |             payload['n'] = num_rollouts
215 |             
216 |         if isgreedy:
217 |             payload['top_k'] = 1
218 |             payload['top_p'] = 1
219 |             payload['temperature'] = 0
220 |             payload['n'] = 1
221 |         
222 |         if special_stop:
223 |             payload['stop'] = special_stop
224 |         
225 |             
226 |         # print(f'===> submitting request @{self.url}')
227 |         response = requests.post(self.url.replace("completions","chat/completions"),
228 |                              headers={"User-Agent": "Test Client"},
229 |                              json=payload,
230 |                              stream=False)
231 | 
232 |         if response.status_code == 200:
233 |             result = response.json()
234 |             # print(num_rollouts, "Generated Text:", result)
235 |         else:
236 |             result = dict(choices=[])
237 |             print(f"Error: {response.status_code}, {response.text}")
238 |             # import pdb; pdb.set_trace()
239 |             return ([],[], []), None
240 |                 
241 |         result_ = [item['message']['content'] for item in result['choices']]
242 |         logps_ = [[x['logprob'] for x in item['logprobs']['content']] for item in result['choices']]  
243 |         offsets_ = None # result if prompt_only else [item['logprobs']['text_offset'] for item in result['choices']]
244 |         # offsets_ = [item['prompt_logprobs'] for item in result['choices']]
245 |         
246 |         offsets_ = result
247 |         return (result_,logps_, offsets_), temperature
248 | 
249 |     def generate_api(self, prompt: str, num_rollouts) -> List[str]:
250 |         def send_request(prompt):
251 |             temperature = random.choice(self.temperature_range)
252 |             if self.model_type == "openai":
253 |                 response = self.client.chat.completions.create(
254 |                     model=self.model_name,
255 |                     messages=[{"role": "user", "content": prompt}],
256 |                     max_tokens=self.max_tokens,
257 |                     temperature=temperature
258 |                 )
259 |                 output = response.choices[0].message.content
260 |             elif self.model_type == "anthropicc":
261 |                 response = self.client.messages.create(
262 |                     model=self.model_name,
263 |                     messages=[{"role": "user", "content": prompt}],
264 |                     max_tokens=self.max_tokens,
265 |                     temperature=temperature
266 |                 )
267 |                 output = response.content[0].text
268 |             return output
269 | 
270 |         responses = []
271 |         with ThreadPoolExecutor(max_workers=num_rollouts) as executor:
272 |             futures = [executor.submit(send_request, prompt) for _ in range(num_rollouts)]
273 |             for future in tqdm(as_completed(futures), total=len(futures)):
274 |                 responses.append(future.result())
275 | 
276 |         return responses
277 | 


--------------------------------------------------------------------------------
/synthesize_deep_reasoning/synthesize.py:
--------------------------------------------------------------------------------
  1 | 
  2 | import argparse
  3 | import copy
  4 | import json
  5 | import logging
  6 | import multiprocessing
  7 | import os
  8 | from glob import glob
  9 | from typing import Any, Dict, List, Tuple, Optional
 10 | 
 11 | import numpy as np
 12 | import pandas as pd
 13 | import ray
 14 | import transformers
 15 | import time
 16 | 
 17 | from helper import *
 18 | from model_utils import LM
 19 | 
 20 | default_sys = "You are a helpful assistant."
 21 | boxed_sysprompt = "Please reason step by step, and put your final answer within \\boxed{}."
 22 | 
 23 | templates = {
 24 | # serves as the standard inference without reference
 25 | "standard_inference_en": """You are an expert in many fields. Suppose you will give a specific final response, I need you to also write down the thought process behind this solution.
 26 | Here is a task:
 27 | {}
 28 | 
 29 | Now, you need to think aloud and brainstorm in the mind. The thinking process involves thoroughly exploring questions through a systematic long thinking process. This requires engaging in a comprehensive cycle of analysis, summarizing, exploration, reassessment, reflection, backtracing, and iteration to develop well-considered thinking process. Present your complete thought process within a single and unique `<think></think>` tag.
 30 | 
 31 | Your thought process must adhere to the following requirements:
 32 | 
 33 | 1.  **Narrate in the first-person as if you are thinking aloud and brainstorming**
 34 |     Stick to the narrative of "I". Imagine you are brainstorming and thinking in the mind. Use verbalized, simple language.
 35 | 
 36 | 2.  **Unify the thinking process and the writing solution:**
 37 |     Your thought process must precisely correspond to a part of the writing solution. The reader should be able to clearly see how your thoughts progressively "grew" into the finished piece, making the copy feel like the inevitable product of your thinking.
 38 | 
 39 | 3.  **Tone of Voice: Planning, Sincere, Natural, and Accessible**
 40 |     Imagine you are analyzing and planning what to do before you start to wrtie the solution.  Your language should be plain and easy to understand, avoiding obscure professional jargon to explain complex thought processes clearly.
 41 | 
 42 | 4.  **Logical Flow: Clear and Progressive**
 43 |     
 44 | 5.  **Thinking Framework for deep thinking**
 45 |     To ensure your thinking is clear and deep, to showcase your thinking and planning to fulfill the task, below is what you might cover when you are thinking aloud and brainstorming.
 46 | 
 47 |     Understanding the user intent and the task: Before putting pen to paper, I need to thoroughly consider the fundamental purpose of the writing. I first need to discern the user's true goal behind their literal request. Next, I will consider: Who am I talking to? I will create a precise profile of the target reader, understanding their pain points, aspirations, and reading context. Then, I will establish the Core Objective: What specific emotional, cognitive, and behavioral changes do I most want the reader to experience after reading? 
 48 |     
 49 |     Establishing the content: I need to brainstorm a core creative idea and communication strategy centered around my objective. Then, I will think about what content and key information I need to convey to the reader to fulfill the writing task, and what source materials this will involve.
 50 | 
 51 |     Building the structure: I need to design a clear narrative path for the reader, like a "blueprint." First, I will plan the article's skeleton (e.g., using a framework like the Golden Circle "Why-How-What," the AIDA model "Attention-Interest-Desire-Action," or a narrative structure "Beginning-Development-Climax-Resolution"). Then, I will plan the key modules: How will the introduction hook the reader? How will the body be layered and the arguments arranged? How will the conclusion summarize, elevate the message, and provide a clear Call to Action (CTA)?
 52 | 
 53 |     Draft: unless it is a really easy request, otherwise I need to consider writing a draft based on the plan above, before you give the final writing solution.  I will translate my plan into paragraphs, considering the key points, content, and sentence structure for each. This initial draft should look like a prototype of the target text. This draft will be way shorter than the final writing solution within controlled length, but it must also avoid being too vague or general or simply copying the final text. I will begin writing this draft after a `--- The Draft ---` separator at the end of my thinking process. The draft will be included in the same `<think></think>` block. After writing the draft, I will further critique what can be improved, and analyze what details can be enriched (and hence make it more likely to eventually arrive at the given solution)
 54 |     
 55 | 6. Throughout the thinking process, I want to involve deep thinking and planning, and use deliberate self-critique/self-reflection in my thinking process. Trigger these by frequently using patterns such as `wait`, `maybe`, `let me`, etc. For example:
 56 |     - Hmm, maybe .. (other concrete thinking regarding the given request)
 57 |     - Let me think .. 
 58 |     - Wait no ..
 59 |     - But wait ..(might find something wrong with your previous thoughts)
 60 |     - Wait, that's a bit ..(reflections about previous decisions). Let me think .. (are thinking of other possibilities)
 61 |     - Wait, the user said ..(backtracking of previous information). So ..
 62 |     - Hmm...Alternatively, maybe ..(branching on other possibilities)
 63 |     - But ..
 64 | 
 65 | Now record your clear, complete, and logical thinking process within `<think></think>` tags. 
 66 | In the thinking process, make sure NO PAST TENSES, NO PAST TENSES, because this is the thought process before you are to write a final solution. You are planning what you will and you need to do.
 67 | Imagine you're thinking aloud and brainstorming. Write it as an internal monologue or a stream of consciousness. Do not use bullet points, numbers, or formal section headings. 
 68 | """,
 69 | "standard_inference_cn": """你是一名各种领域专家，设想有一个用户请求和一个回答。现在你需要针对回答解释思考过程，特别是如何针对这个请求进行深入思考、深入规划的。
 70 | 下面是用户请求：\n{}
 71 | 
 72 | 现在你需要头脑风暴，在（单独且唯一的）`<think></think>`标签中呈现你的完整思考过程。
 73 | 
 74 | 思考过程必须遵循以下要求：
 75 | 关于叙述视角：使用第一人称，想象你在大脑里头脑风暴，演绎自己的创作思考过程。用口语化的表述和通俗的语言。
 76 | 
 77 | 关于语言风格：未来时、真诚、自然、易懂
 78 | 设想你在动笔前分析和规划的思考过程，所以应该是用未来、计划性或者“我应该”这种语气。请用真诚、坦率的口吻，像一位经验丰富的前辈在传授经验。语言要平实、易懂，避免使用晦涩的专业术语，把复杂的思考过程说明白。
 79 | 
 80 | 关于思考的逻辑：清晰、层层递进
 81 | 整个思考过程需要展现出清晰的因果链条，层层递进，解释“为什么这么想”以及“这样做预计会带来什么效果”。思考过程中，基于上面的写作框架中的核心步骤，不断进行细节拆分，使用多样化的逻辑连接词，例如“首先”、“其次”、“然后”、来逐步递进思考过程，完善细节。避免反复使用相同的连接词。
 82 | 
 83 | 思维框架：
 84 | 为了让思考过程清晰、有深度，我建议你采用下面的创作框架来组织思路。这能让你的思考过程更接近一位真实专家的工作流：
 85 | 
 86 | 首先思考，我为何而写？在动笔前，我会先彻底想清楚写作的根本目的。我需要先洞察用户字面需求背后的真实目标，接着思考：我在对谁说话？精准描绘出目标读者的画像，理解他们的痛点、渴望和阅读场景。然后，确立核心目标： 我最希望读者读完后，在情感、认知和行动上发生什么具体变化？
 87 | 
 88 | 然后确立内容，我要写什么？我需要围绕写作目标构思核心创意和沟通策略，规划内容。然后思考，为了完成用户请求，我需要向读者传递包括哪些内容和关键信息，分别涉及到什么素材。
 89 | 
 90 | 接着搭建结构，思考我要怎么写？我需要设计一个清晰的行文路径，像“施工图”一样引导读者。首先，我需要规划文章骨架（例如：黄金圈法则 "Why-How-What"、AIDA 模型 "Attention-Interest-Desire-Action"、故事结构 "开端-发展-高潮-结尾"等）。然后，我要考虑布局关键模块： 开头如何破题？主体如何分层展开、安排论据？结尾如何总结、升华，并给出清晰的行动号召 (Call to Action)？
 91 | 
 92 | 再然后，除非是很显然很容易的请求，否则考虑先写一个草稿。我需要落实到每一个段落，具体考虑有什么要点，写什么内容，句子如何组织。要让这份初稿看起来像是上面的文案的雏形，但是要避免照搬上面的文案，又要避免语言笼统。`--- 草稿 ---`分割线后开始写你的草稿，但是草稿部分和上面的思考过程都要放在同一个`<think></think>`标签内
 93 | 草稿结束后再次思考有什么可以进一步调整的细节、或者进一步优化的地方，这也是为什么要和真正最终回答有所区分，应当是最终回答的雏形
 94 | 
 95 | 为了充分思考和深入推理，我会多使用自我反思和自我评判来进一步展开细节、分支其他方面或者回溯思考之前的一些陈述。我会频繁利用一些触发自我反思和自我批判的词语：“不过”、“或者”、“可能”，用这些词来触发更加细节、更加深入的思考，下面是一些例子：
 96 |     - 嗯，也许……（关于给定请求的其他具体思考）
 97 |     - 让我想想……
 98 |     - 等等，不对……
 99 |     - 不过等等……（可能会发现你之前的想法有问题）
100 |     - 等等，这有点……（对先前决定的反思）。让我想想……（正在思考其他可能性）
101 |     - 等等，用户说……（回溯之前的信息）。所以……
102 |     - 嗯……或者，也许……（思考其他分支可能性）
103 |     - 但是……
104 | 格式上，将清晰完整有逻辑的思考过程在`<think></think>`标签中记录。
105 | 在思考过程中，确保不要使用过去时，不要使用过去时，因为这是在你写最终解决方案之前的思考过程。你正在计划你将要做什么和需要做什么。
106 | 想象你正在出声思考和进行头脑风暴。把它写成内心独白或意识流。不要使用项目符号、编号或正式的章节标题。下面，设想你是首次拿到这个用户请求，然后开始你的思考（不要暗示你在解释一个回答。
107 | """,
108 | "initial_thinking_en": """You are an expert in many fields. Suppose you will give a specific final response, I need you to also write down the thought process behind this solution.
109 | Here is a task:
110 | {}
111 | 
112 | Here is the solution you will create:
113 | {}
114 | 
115 | Now, you need to write down the thinking process behind this solution, as if you are thinking aloud and brainstorming in the mind. The thinking process involves thoroughly exploring questions through a systematic long thinking process. This requires engaging in a comprehensive cycle of analysis, summarizing, exploration, reassessment, reflection, backtracing, and iteration to develop well-considered thinking process. Present your complete thought process within a single and unique `<think></think>` tag.
116 | 
117 | Your thought process must adhere to the following requirements:
118 | 
119 | 1.  **Narrate in the first-person as if you are thinking aloud and brainstorming**
120 |     Stick to the narrative of "I". Imagine you are brainstorming and thinking in the mind. Use verbalized, simple language.
121 | 
122 | 2.  **Unify the thinking process and the writing solution:**
123 |     Your thought process must precisely correspond to a part of the writing solution. The reader should be able to clearly see how your thoughts progressively "grew" into the finished piece, making the copy feel like the inevitable product of your thinking.
124 | 
125 | 3.  **Tone of Voice: Planning, Sincere, Natural, and Accessible**
126 |     Imagine you are analyzing and planning what to do before you start to wrtie the solution.  Your language should be plain and easy to understand, avoiding obscure professional jargon to explain complex thought processes clearly.
127 | 
128 | 4.  **Logical Flow: Clear and Progressive**
129 |     
130 | 5.  **Thinking Framework for deep thinking**
131 |     To ensure your thinking is clear and deep, to showcase your thinking and planning to fulfill the task, below is what you might cover when you are thinking aloud and brainstorming.
132 | 
133 |     Understanding the user intent and the task: Before putting pen to paper, I need to thoroughly consider the fundamental purpose of the writing. I first need to discern the user's true goal behind their literal request. Next, I will consider: Who am I talking to? I will create a precise profile of the target reader, understanding their pain points, aspirations, and reading context. Then, I will establish the Core Objective: What specific emotional, cognitive, and behavioral changes do I most want the reader to experience after reading? 
134 |     
135 |     Establishing the content: I need to brainstorm a core creative idea and communication strategy centered around my objective. Then, I will think about what content and key information I need to convey to the reader to fulfill the writing task, and what source materials this will involve.
136 | 
137 |     Building the structure: I need to design a clear narrative path for the reader, like a "blueprint." First, I will plan the article's skeleton (e.g., using a framework like the Golden Circle "Why-How-What," the AIDA model "Attention-Interest-Desire-Action," or a narrative structure "Beginning-Development-Climax-Resolution"). Then, I will plan the key modules: How will the introduction hook the reader? How will the body be layered and the arguments arranged? How will the conclusion summarize, elevate the message, and provide a clear Call to Action (CTA)?
138 | 
139 |     Outline: If the task output might be relatively long, I will consider writing an outline (or a draft) which naturally derives from the plan above. Specifically, the outline will ground my plan into paragraphs, summarizing the key content for each paragraph and what are the key points here, sentence structure or anything important for the paragraph. 
140 |     I PROMISE I will NOT copy the solution I will NOT copy the solution, this outline (or draft) should only look like a prototype or outline of the target text. After finishing this outline, I will check again if there are any details or notes I should pay attention to when writing the final solution.
141 |     I will begin writing this draft after a `--- Outline (or Draft) ---` separator at the end of my thinking process. The draft will be included in the same `<think></think>` block.
142 |     
143 |     
144 | 6. Throughout the thinking process, I want to involve deep thinking and planning, and use deliberate self-critique/self-reflection in my thinking process. Trigger these by regularly using patterns such as `wait`, `maybe`, `let me`, etc. For example:
145 |     - Hmm, maybe .. (other concrete thinking regarding the given request)
146 |     - Let me think .. 
147 |     - Wait no ..
148 |     - But wait ..(might find something wrong with your previous thoughts)
149 |     - Wait, that's a bit ..(reflections about previous decisions). Let me think .. (are thinking of other possibilities)
150 |     - Wait, the user said ..(backtracing of previous information). So ..
151 |     - Hmm...Alternatively, maybe ..(branching on other possibilities)
152 |     - But ..
153 | But I promise I will use diverse triggers and will NOT use same triggers repeatedly. I will use these when analyzing user needs, establishing content and structure and when I consider alternatives, backtracing and the details. I will NOT use them when I write the draft or I am approaching the end of thinking. 
154 | 
155 | In the thinking process, make sure NO PAST TENSES, NO PAST TENSES, because this is the thought process before you are to write a final solution. You are planning what you will and you need to do.
156 | Imagine you're thinking aloud and brainstorming. Write it as an internal monologue or a stream of consciousness. Do not use bullet points, numbers, or formal section headings. 
157 | Now record your thinking process within `<think></think>` tags. 
158 | """,
159 | "initial_thinking_cn": """你是一名各种领域专家，设想有一个用户请求，你为此正在头脑风暴并且，并且把你的深入思考记录下来。
160 | 下面是用户请求：\n{}
161 | 
162 | 假设下面是你会完成的文案：\n{}
163 | 
164 | 现在你需要写出对应的思考过程，就像在大脑里头脑风暴。在（单独且唯一的）`<think></think>`标签中呈现你的完整思考过程。
165 | 
166 | 思考过程必须遵循以下要求：
167 | 1. 关于叙述视角：使用第一人称，想象你在大脑里头脑风暴，演绎自己的创作思考过程。用口语化的表述和通俗的语言。
168 | 
169 | 2. 关于思维与作品的统一：思考即作品，作品即思考
170 | 你的每一个思考步骤，都必须在最终的文案中找到精准的对应。要让读者清晰地看到，你的思考是如何一步步“长”成这篇作品的，整个复盘过程要让人觉得，这篇文案正是这些思考的必然产物。
171 | 
172 | 3. 关于语言风格：未来时、真诚、自然、易懂
173 | 设想你在动笔前分析和规划的思考过程，所以应该是用未来、计划性或者“我应该”这种语气。请用真诚、坦率的口吻，像一位经验丰富的前辈在传授经验。语言要平实、易懂，避免使用晦涩的专业术语，把复杂的思考过程说明白。
174 | 
175 | 4. 关于思考的逻辑：清晰、层层递进
176 | 整个思考过程需要展现出清晰的因果链条，层层递进，解释“为什么这么想”以及“这样做预计会带来什么效果”。思考过程中，基于上面的写作框架中的核心步骤，不断进行细节拆分，使用多样化的逻辑连接词，例如“首先”、“其次”、“然后”、来逐步递进思考过程，完善细节。避免反复使用相同的连接词。
177 | 
178 | 5. 思维框架：
179 | 对于给定的用户请求，一个清晰、有深度、细节丰富的思考过程可能包含下面这些内容和思考方向：
180 | 
181 | 为何而写？在动笔前，我会先彻底想清楚写作的根本目的。我需要先洞察用户字面需求背后的真实目标，接着思考：我在对谁说话？精准描绘出目标读者的画像，理解他们的痛点、渴望和阅读场景。然后，确立核心目标： 我最希望读者读完后，在情感、认知和行动上发生什么具体变化？
182 | 
183 | 确立内容，我要写什么？我需要围绕写作目标构思核心创意和沟通策略，规划内容。然后思考，为了完成用户请求，我需要向读者传递包括哪些内容和关键信息，分别涉及到什么素材。
184 | 
185 | 搭建结构，思考我要怎么写？我需要设计一个清晰的行文路径，像“施工图”一样引导读者。首先，我需要规划文章骨架（例如：黄金圈法则 "Why-How-What"、AIDA 模型 "Attention-Interest-Desire-Action"、故事结构 "开端-发展-高潮-结尾"等）。然后，我要考虑布局关键模块： 开头如何破题？主体如何分层展开、安排论据？结尾如何总结、升华，并给出清晰的行动号召 (Call to Action)？
186 | 
187 | 如果是需要输出相对比较长的回答，我会考虑先写一个提纲（或者草稿），会对于参考回答进行提纲挈领，并且列出来每个段落或者部分有什么要点，写什么内容，句子如何组织。
188 | 我**绝对不会照抄绝对不会照抄**参考回答。我会让这个看起来像是一个雏形或者大纲，而不是照搬上面的文案。写完这个提纲之后，我可能会总结一下最终的回答还有没有什么细节需要主要
189 | 我会在`--- 提纲（或者草稿） ---`分割线后开始，和上面的思考过程都要放在同一个`<think></think>`标签内
190 | 
191 | 
192 | 6. 为了充分思考和深入推理，我会多使用自我反思和自我评判来进一步展开细节、分支其他方面或者回溯思考之前的一些陈述。我会利用一些触发自我反思和自我批判的词语：“不过”、“或者”、“可能”，用这些词来触发更加细节、更加深入的思考，下面是一些例子：
193 |     - 嗯，也许……（关于给定请求的其他具体思考）
194 |     - 让我想想……
195 |     - 等等，不对……
196 |     - 不过等等……（可能会发现你之前的想法有问题）
197 |     - 等等，这有点……（对先前决定的反思）。让我想想……（正在思考其他可能性）
198 |     - 等等，用户说……（回溯之前的信息）。所以……
199 |     - 嗯……或者，也许……（思考其他分支可能性）
200 |     - 但是……
201 | 但是我保证会多样化而且不会反复使用相同的触发词。我会在思考用户需求、确定内容和搭建结构的时候考虑使用这些，特别是需要回溯思考、分支或者展开细节的时候使用。如果是在写草稿，或者是快要完成思考的阶段，我不去使用这些触发词。
202 | 
203 | 现在，想象我正在内心独白进行头脑风暴……在`<think></think>`标签中。
204 | """,
205 | # writing的核心是align
206 | "segment_edits_en": """
207 | Your task is to receive a writing assignment, a target sample text, and an existing thinking process, and then to refine and enrich a specific paragraph within that thinking process.
208 | 
209 | ---> **Writing Task** \n{} 
210 | ---> **Target Text** \n{} 
211 | ---> **Thinking Process** 
212 | {}
213 | <replace> 
214 | {}
215 | </replace>
216 | {}
217 | 
218 | Follow this three-step method to construct your response:
219 | 
220 | **Step 1:** Locate the paragraph you need to revise within the existing thinking process. In relation to the surrounding context, what is the primary function of this paragraph?
221 | 
222 | **Step 2:** Read the "Target Text" and the "paragraph to be revised" side-by-side. Ask yourself a key question: Does the thinking process reflected in this paragraph lack crucial steps, or is there content that could be further optimized and detailed to better align with the Target Text?
223 | 
224 | **Step 3:** Improve and optimize the paragraph (that represents a part of the thinking process). 
225 | - Based on the analysis, refine the initial target paragraph into a new one, base remain the tone. Put the refinement into <refine></refine> tags.
226 | - To help involve deep thinking and planning, consider deliberate self-critique/self-reflection in your thinking process. Trigger these by frequently using patterns such as `wait`, `maybe`, `let me`, etc. For example:
227 |     - Hmm, maybe .. (other concrete thinking regarding the given request)
228 |     - Let me think .. 
229 |     - Wait no ..
230 |     - But wait ..(might find something wrong with your previous thoughts)
231 |     - Wait, that's a bit ..(reflections about previous decisions). Let me think .. (are thinking of other possibilities)
232 |     - Wait, the user said ..(backtracking of previous information). So ..
233 |     - Hmm...Alternatively, maybe ..(branching on other possibilities)
234 |     - But ..
235 | - If the function of the paragraph being improved is to serve as a first draft of the text, you must focus on enhancing the text's logic and completeness. The draft should not be a general outline but should express specific content and state a clear point of view. Consider whether the current draft is an appropriate prototype for the Target Text: it should be neither too vague nor a direct copy, but should reflect a foundational version.
236 | 
237 | Based on the guide above, you are to refine **only** the section marked for replacement below.
238 | <replace>
239 | {}
240 | </replace>
241 | 
242 | In your response, first, present your analysis following the three-step method within `<analyze></analyze>` tags. Finally, place the corresponding, refined paragraph of the **thinking process** within `<refine></refine>` tags. 
243 | Notes: a. Avoid repeating. Reduce the use of the same connection words, avoid repeating the same meanings over and over again. Ensure that your revised content does not repeat information from the context.
244 | b. please keep the first a few words of the original paragraph, especially the connection words 
245 | c. use self-critique trigger words, such as `wait`, `maybe`, `let me`, etc. 
246 | """,
247 | # 不能让他直接refine，还是要进行分析
248 | "segment_edits_cn": """你会接收一个用户请求、一篇目标范文和一个已有的思考过程，然后对该思考过程的某个段落进行优化和丰富。
249 | ---> 用户请求
250 | {}
251 | ---> 目标文案
252 | {}
253 | ---> 思考过程
254 | {}
255 | ----
256 | <replace>
257 | {}
258 | </replace>
259 | ----
260 | {}
261 | 
262 | 遵循以下三步法来构建我的回答：
263 | 
264 | 第一步：定位我需要修改的段落在现有思考过程中的位置，相对于目前的上下文而言，这个段落主要是什么功能。
265 | 第二步：并排阅读“目标文案”和“需要修改的段落”。问自己一个核心问题：需要修改的段落所反映的思考过程，是否缺少了关键的思考步骤，或者有没有可以进一步优化、进一步细化的内容，能够更好地对应到目标文案？
266 | 第三步：改进和优化现有的思考过程。
267 | - 根据第二步中的分析，修改这个段落，注意放在<refine></refine>标签重。
268 | - 为了充分思考和深入推理，我会多使用自我反思和自我评判来进一步展开细节、分支其他方面或者回溯思考之前的一些陈述。我会频繁利用一些触发自我反思和自我批判的词语：“不过”、“或者”、“可能”，用这些词来触发更加细节、更加深入的思考，下面是一些例子：
269 |     - 嗯，也许……（关于给定请求的其他具体思考）
270 |     - 让我想想……
271 |     - 等等，不对……
272 |     - 不过等等……（可能会发现你之前的想法有问题）
273 |     - 等等，这有点……（对先前决定的反思）。让我想想……（正在思考其他可能性）
274 |     - 等等，用户说……（回溯之前的信息）。所以……
275 |     - 嗯……或者，也许……（思考其他分支可能性）
276 |     - 但是……
277 | - 如果需要改进的段落的功能是作为文案的初稿，务必注意改进和优化文案的逻辑性、完整性：初稿不应该是笼统的大纲，而应该具体地表达内容、陈述观点。考虑当前的初稿是否是目标文案的一个恰当的草稿：既不能太笼统，也不能照抄，而应该反映出是一个雏形。
278 | 
279 | 基于上述指南，仅仅针对下面需要替换的部分进行优化。
280 | <replace>
281 | {}
282 | </replace>
283 | 
284 | 在下面的回答中，首先遵循三步法进行分析，放在`<analyze></analyze>`标签中，最后将我修改后的**思考过程的对应段落**放在`<refine></refine>`标签。
285 | 注意：1. 尽可能避免重复，减少反复使用的衔接词，修改后的内容不要和上下文内容有重复。
286 | 2. 务必保留段落最开始的几个词，特别是连接词或语气词。
287 | 3. 多使用反思触发词激发更深入的思考
288 | """,
289 | }
290 | 
291 | think_prefix = "<think>\n"
292 | 
293 | import re
294 | 
295 | def contains_chinese(text: str) -> bool:
296 |   """
297 |   Checks if a string contains any Chinese characters.
298 | 
299 |   Args:
300 |     text: The input string.
301 | 
302 |   Returns:
303 |     True if the string contains at least one Chinese character, False otherwise.
304 |   """
305 |   # The \u4e00-\u9fff range covers the CJK Unified Ideographs.
306 |   # This is the most common range for Chinese characters.
307 |   return bool(re.search(r'[\u4e00-\u9fff]', text))
308 | 
309 | @ray.remote
310 | def generate(inputs, model: LM, num_rollouts=None, isgreedy=True, **kwargs):
311 |     results = []
312 |     
313 |     if len(inputs)==1:
314 |         completions, temperature = model.generate(inputs, num_rollouts, isgreedy, **kwargs)
315 |         results = completions
316 |     else: 
317 |         for inp in inputs:
318 |             completions, temperature = model.generate(inp, num_rollouts, isgreedy, **kwargs)
319 |             results.append(completions) 
320 |     return results
321 | 
322 | 
323 | lm_tokenizer, lm_model, post_tokenizer, post_model = None, None, None, None
324 | def get_model_output(
325 |     template_role: str,
326 |     system_prompt: Optional[str],
327 |     template_inputs: List[List[Any]],
328 |     tokenizer: Any,
329 |     model: Any,
330 |     prompt_suffix: str,
331 |     # NOTE: use_fewshot is an unused parameter in this function.
332 |     use_fewshot: bool,
333 |     num_rollouts: int,
334 |     is_greedy: bool = True,
335 |     **kwargs,
336 | ) -> Tuple[Any, List[str]]:
337 |     """
338 |     Constructs prompts from templates and submits them to the model for generation.
339 | 
340 |     Args:
341 |         template_role: The key for the desired prompt template.
342 |         system_prompt: An optional system message to guide the model's behavior.
343 |         template_inputs: A list of lists, where each inner list contains the arguments for a prompt template.
344 |         tokenizer: The model's tokenizer.
345 |         model: The language model instance.
346 |         prompt_suffix: A string to append to each prompt after formatting.
347 |         use_fewshot: (Unused) A flag that was likely intended for few-shot prompting.
348 |         num_rollouts: The number of sequences to generate for each prompt.
349 |         is_greedy: A flag to control the decoding strategy.
350 |         **kwargs: Additional arguments passed to the generation function.
351 | 
352 |     Returns:
353 |         A tuple containing the Ray object for the asynchronous generation task
354 |         and the list of fully constructed prompts sent to the model.
355 |     """
356 |     template = templates[template_role]
357 |     
358 |     # Format each input using the specified template.
359 |     formatted_queries = [template.format(*inp) for inp in template_inputs]
360 |     
361 |     prompts = []
362 |     for query in formatted_queries:
363 |         # Create the standard message format for model interaction.
364 |         messages = [{"role": "user", "content": query}]
365 |         
366 |         if system_prompt:
367 |             messages.insert(0, {"role": "system", "content": system_prompt})
368 |         
369 |         # Construct the final prompt string.
370 |         prompt = make_prompt(tokenizer, messages)
371 |         prompt += prompt_suffix
372 |         prompts.append(prompt)
373 |     # if kwargs.get('log',False):
374 |     #     print(prompts)
375 |     # Launch the remote generation task using Ray.
376 |     generation_task = generate.remote(prompts, model, num_rollouts, isgreedy=is_greedy, **kwargs)
377 |     return generation_task, prompts
378 | 
379 | class RefinementProcessor:
380 |     """Handles the iterative refinement process for a single generated response."""
381 | 
382 |     def __init__(self, node: Any, tokenizer: Any, model: Any, post_tokenizer: Any, post_model: Any, stop_threshold: float, max_steps: int, num_expansion: int = 2):
383 |         self.node = node
384 |         self.tokenizer = tokenizer
385 |         self.model = model
386 |         self.post_tokenizer = post_tokenizer
387 |         self.post_model = post_model
388 |         self.stop_threshold = stop_threshold
389 |         self.max_steps = max_steps
390 |         self.num_expansion = num_expansion
391 | 
392 |     def run(self, initial_info: Dict[str, Any]) -> Dict[str, Any]:
393 |         """
394 |         Runs the full iterative refinement loop on a generated "thinking" process.
395 | 
396 |         Args:
397 |             initial_info: A dictionary containing the initial model rollout and its metadata.
398 | 
399 |         Returns:
400 |             An updated info dictionary containing the results of the refinement process.
401 |         """
402 |         # The 'initial_ppl' is a tuple where the first element is the log perplexity.
403 |         initial_perplexity = initial_info.get('initial_ppl', (1000.0,))
404 |         print(f"===> Initial PPL: {initial_perplexity[0]}")
405 |         if initial_perplexity[0] < self.stop_threshold:
406 |             print("===> Skipping refinement due to low initial PPL.")
407 |             return initial_info
408 | 
409 |         thinking_segments = initial_info.get('thinking_segments', [])
410 |         if not thinking_segments:
411 |             return initial_info
412 | 
413 |         print(f"====> Starting refinement on {len(thinking_segments)} thinking steps.")
414 |         
415 |         finalized_thinking_steps = []
416 |         best_ppl_so_far = initial_perplexity
417 |         
418 |         # Iterate through each segment of the thinking process to refine it.
419 |         for i in range(min(self.max_steps, len(thinking_segments))):
420 |             # Reconstruct the thinking process parts: before, current, and after the segment being refined.
421 |             before_segment = "\n\n".join(finalized_thinking_steps)
422 |             segment_to_refine = thinking_segments[i].strip()
423 |             after_segments = "\n\n".join(s.strip() for s in thinking_segments[i+1:])
424 |             
425 |             # Generate and evaluate several possible refinements for the current segment.
426 |             best_candidate, best_ppl, all_candidates = self._refine_one_step(
427 |                 before_segment, segment_to_refine, after_segments
428 |             )
429 |             
430 |             # Decide whether to keep the original segment or use a generated refinement.
431 |             choice = 'original'
432 |             chosen_text = segment_to_refine
433 |             if best_candidate is not None and best_ppl[0] < best_ppl_so_far[0]:
434 |                 best_ppl_so_far = best_ppl
435 |                 chosen_text = best_candidate['refinement']
436 |                 choice = f"refinement_No.{best_candidate['id']}"
437 | 
438 |             finalized_thinking_steps.append(chosen_text)
439 |             
440 |             # Log the details of this refinement step.
441 |             initial_info[f"refine_thinking_step_No.{i+1}"] = {
442 |                 'segment_to_refine': segment_to_refine,
443 |                 'choice': choice,
444 |                 'chosen_refinement': chosen_text,
445 |                 'after_avg_token_logp': best_ppl_so_far[0],
446 |                 'possible_refinements': all_candidates,
447 |             }
448 | 
449 |             if best_ppl_so_far[0] < self.stop_threshold:
450 |                 print("===> Stopping refinement early as PPL threshold reached.")
451 |                 break
452 | 
453 |         if len(finalized_thinking_steps)<len(thinking_segments):
454 |             # Append the rest of the original segments without refinement.
455 |             finalized_thinking_steps.extend(s.strip() for s in thinking_segments[i+1:])
456 |         
457 |         initial_info['corrected_thinking'] = finalized_thinking_steps
458 |         early_terminate = i+1<len(thinking_segments)
459 |         initial_info['finalized_avg_token_logp'] = best_ppl_so_far[0]
460 |         # If any refinement was done, perform one final rollout with the complete refined thinking.
461 |         # if finalized_thinking_steps and early_terminate:
462 |         #     new_thinking_prompt = "<think>\n" + "\n\n".join(finalized_thinking_steps)
463 |         #     role_suffix = '_cn' if self.node.memory['is_chinese'] else '_en'
464 |         #     rollout_texts, _, _, _ = direct_rollout(self.node, new_thinking_prompt, 1, role="standard_inference"+role_suffix)
465 |         #     initial_info['finalized_response'] = rollout_texts[0] if len(rollout_texts) else None
466 | 
467 |         return initial_info
468 | 
469 |     def _refine_one_step(self, before: str, current: str, after: str) -> Tuple[Optional[Dict], Tuple[float, List], List[Dict]]:
470 |         """Generates and evaluates possible refinements for a single thinking segment."""
471 |         num_samples = self.num_expansion
472 |         # The arguments are: question, reference_answer, text_before, text_to_replace, text_after, original_text
473 |         prompt_args = (self.node.memory['q'], self.node.memory['ref'], before, current, after, current)
474 |         role_suffix = '_cn' if self.node.memory['is_chinese'] else '_en'
475 |         pre_trigger = "Let's find out what can be improved and enriched to better align with the target text.\n" if role_suffix=='_en' else "让我看看这段有什么可以做修改、优化、补充的地方，从而更加贴合目标文本\n"
476 |         rollouts_obj, _ = get_model_output(
477 |             template_role='segment_edits'+role_suffix, 
478 |             system_prompt=default_sys, 
479 |             template_inputs=[prompt_args], 
480 |             tokenizer=self.tokenizer, 
481 |             model=self.model, 
482 |             prompt_suffix=pre_trigger+"<analyze>\n",
483 |             use_fewshot=False,
484 |             num_rollouts=num_samples, 
485 |             is_greedy=False
486 |         )
487 |         # The return from ray.get should be (texts, logps, offsets), but only texts are used here.
488 |         rollout_texts, _, _ = ray.get(rollouts_obj)
489 |         del rollouts_obj
490 | 
491 |         refinement_candidates = []
492 |         best_candidate_info = None
493 |         # Initialize with a high perplexity value.
494 |         best_perplexity = (float('inf'), [])
495 |         if np.random.uniform()<0.5:
496 |             eng_trigger = "Wait"
497 |         elif np.random.uniform()<0.8:
498 |             eng_trigger = "But wait"
499 |         else:
500 |             eng_trigger = "Meanwhile"
501 |         trigger = eng_trigger if role_suffix == '_en' else "等等我再想想"
502 |         for i, rollout_text in enumerate(rollout_texts):
503 |             # if np.random.uniform()<0.25: # extra wait 
504 |             #     temp = rollout_text.split('</refine>')[0].strip()
505 |             #     rollouts_obj2, _ = get_model_output(
506 |             #         template_role='segment_edits'+role_suffix, 
507 |             #         system_prompt=default_sys, 
508 |             #         template_inputs=[prompt_args], 
509 |             #         tokenizer=self.tokenizer, 
510 |             #         model=self.model, 
511 |             #         prompt_suffix=pre_trigger+"<analyze>\n"+rollout_text+f"\n\n{trigger}",
512 |             #         use_fewshot=False,
513 |             #         num_rollouts=1, 
514 |             #         is_greedy=False
515 |             #     )
516 |             #     # The return from ray.get should be (texts, logps, offsets), but only texts are used here.
517 |             #     rollout_texts2, _, _ = ray.get(rollouts_obj2)
518 |             #     del rollouts_obj2
519 |             #     new_roll = f"{temp}\n\n{trigger}{rollout_texts2[0]}"
520 |             #     rollout_text = new_roll
521 | 
522 |             # Extract the refined text from within the <refine> tags.
523 |             last_block_start = rollout_text.rfind("<refine>")
524 |             if last_block_start == -1:
525 |                 print("Warning: <refine> tag not found in output, skipping.")
526 |                 continue
527 | 
528 |             block_end = rollout_text.rfind("</refine>")
529 |             start_pos = last_block_start + len("<refine>")
530 |             refinement_text = rollout_text[start_pos:block_end if block_end != -1 else None].strip()
531 |             
532 |             # Create the full "thinking" process with the new refinement.
533 |             recomposed_thinking = f"{refinement_text}"
534 |             if before: recomposed_thinking = f"{before}\n\n{recomposed_thinking}"
535 |             if after: recomposed_thinking = f"{recomposed_thinking}\n\n{after}"
536 |             
537 |             # Evaluate the new thinking process by calculating the perplexity of the reference answer.
538 |             manager = PosteriorManager('standard_inference'+role_suffix, default_sys, [[self.node.memory['q']]], self.post_tokenizer, self.post_model, self.node.memory['ref'])
539 |             _, posterior_prefix = manager.prepare(think_prefix, recomposed_thinking, "")
540 |             ppl_obj, _ = manager.submit(posterior_prefix)
541 |             # The 'compute' method returns a tuple (log_perplexity, debug_info).
542 |             new_perplexity_result = manager.compute(ppl_obj)
543 | 
544 |             print(f"Refinement candidate {i} | New log PPL: {new_perplexity_result[0]}")
545 |             candidate_info = {
546 |                 'id': i,
547 |                 'refinement': refinement_text,
548 |                 'generator': self.model.model_name,
549 |                 'raw_output': rollout_text,
550 |                 'raw_input_for_posterior': posterior_prefix,
551 |                 'avg_token_logp': new_perplexity_result[0],
552 |             }
553 |             refinement_candidates.append({f'expansion_No.{i}_of_segment': candidate_info})
554 | 
555 |             # If this candidate is better than the best one so far, update it.
556 |             if new_perplexity_result[0] < best_perplexity[0]:
557 |                 best_perplexity = new_perplexity_result
558 |                 best_candidate_info = candidate_info
559 |                 
560 |         return best_candidate_info, best_perplexity, refinement_candidates
561 |     
562 | class PosteriorManager:
563 |     """Calculates the log probability of a reference answer given a thinking process."""
564 |     def __init__(self, role: str, system_prompt: str, inputs: List[List[Any]], tokenizer: Any, model: Any, ref_answer: str):
565 |         self.role = role
566 |         self.system_prompt = system_prompt
567 |         self.inputs = inputs
568 |         self.tokenizer = tokenizer
569 |         self.model = model
570 |         self.ref_answer = ref_answer
571 |         _, _, answer_steps = breakdown_steps(ref_answer)
572 |         self.answer_prefix = answer_steps[0]
573 |         self.pred_answer = "".join(answer_steps[1:])
574 |         
575 |     def prepare(self, think_prefix: str, thinking_process: str, ref_answer: str = None) -> Tuple[str, str]:
576 |         """Prepares the prompt for posterior probability calculation."""
577 |         # The part of the prompt before the reference answer.
578 |         self.prefix_before_answer = f"{think_prefix}{thinking_process}\n</think>\n<answer>\n{self.answer_prefix}"
579 |         # The full prompt including the reference answer.
580 |         
581 |         self.posterior_prefix = f"{self.prefix_before_answer}{self.pred_answer}\n</answer>"
582 |         return self.prefix_before_answer, self.posterior_prefix
583 |     
584 |     def submit(self, prefix: str) -> Tuple[Any, List[str]]:
585 |         """Submits the prompt to the model to get token log probabilities."""
586 |         self.rollouts_obj, self.real_input_prompts = get_model_output(
587 |             self.role, self.system_prompt, self.inputs, self.tokenizer, self.model, prefix, 
588 |             use_fewshot=False, num_rollouts=1, is_greedy=False, prompt_only=True
589 |         )
590 |         return self.rollouts_obj, self.real_input_prompts
591 | 
592 |     def compute(self, rollouts_obj: Any) -> Tuple[float, List[Dict]]:    
593 |         """Computes the log perplexity of the reference answer from the model's output."""
594 |         _, prompt_logprobs, _ = ray.get(rollouts_obj)
595 |         if not prompt_logprobs: return 1000.0, []
596 |         
597 |         current_prompt_logprobs = prompt_logprobs[0]
598 |         
599 |         # This complex logic is used to find the exact tokens corresponding to the reference answer.
600 |         offsets = self.tokenizer(self.posterior_prefix, return_offsets_mapping=True).offset_mapping 
601 |         start_char_index = len(self.prefix_before_answer)
602 |         end_char_index = start_char_index + len(self.pred_answer)
603 | 
604 |         # Find the number of tokens to count backward from the end of the prompt to find the answer's start.
605 |         tokens_to_go_back = 0
606 |         for i, (start, end) in enumerate(offsets[::-1]):
607 |             if start <= start_char_index:
608 |                 tokens_to_go_back = i + 1
609 |                 break
610 |         
611 |         # Find how many tokens the reference answer spans.
612 |         answer_token_span = 0
613 |         for j, (start, end) in enumerate(offsets[-tokens_to_go_back:]):
614 |             if end >= end_char_index:
615 |                 answer_token_span = j + 1
616 |                 break
617 |         
618 |         answer_logps = []
619 |         # Sum the log probabilities of the tokens that make up the reference answer.
620 |         logprob_slice = current_prompt_logprobs[-tokens_to_go_back : -tokens_to_go_back + answer_token_span]
621 |         for logp_dict in logprob_slice: 
622 |             # The dictionary may have multiple keys; the first one corresponds to the prompt token.
623 |             major_key = list(logp_dict)[0]
624 |             logp = logp_dict[major_key]['logprob']
625 |             answer_logps.append(np.clip(logp, -2.0, 0.0))
626 |             
627 |         # Calculate the negative mean log probability (log perplexity). A lower value is better.
628 |         log_perplexity = -np.mean(answer_logps) if answer_logps else 1000.0
629 |         
630 |         # For debugging, gather the log probability info for tokens surrounding the answer.
631 |         debug_slice = current_prompt_logprobs[-tokens_to_go_back-2 : -tokens_to_go_back + answer_token_span + 2]
632 |         debug_logprob_info = [logp_dict[list(logp_dict)[0]] for logp_dict in debug_slice]
633 |         return log_perplexity, debug_logprob_info
634 |     
635 | def direct_rollout(node, prefix, n_sample=1, role=None, log=False):
636 |     
637 |     role, sysp, in_keys = ('standard_inference' if role is None else role), default_sys, ['q', 'ref'] 
638 |     node.memory['gen_role'] = role
639 |     
640 |     inputs = [node.memory[k] for k in in_keys]
641 |     tok, model = lm_tokenizer, lm_model
642 |     node.memory['generator'] = model.model_name
643 |     inp = [inputs]
644 |     # print(f"===> {role}: {n_sample} for {len(inp)} queries")
645 |     rollouts_obj, real_input_prompts = get_model_output(role, sysp, inp, tok, model, prefix, False, n_sample, is_greedy=False)
646 |     rollout_texts, rollout_logps, rollout_offsets = ray.get(rollouts_obj)
647 |     del rollouts_obj
648 |     return rollout_texts, rollout_logps, rollout_offsets, real_input_prompts
649 | 
650 | 
651 | 
652 | @ray.remote
653 | def process_item(
654 |     item: Dict[str, Any],
655 |     file_prefix: str,
656 |     rank: int,
657 |     n_sample: int, 
658 |     configs=dict()
659 | ):
660 |     uid = item['extra_info']['index']
661 |     output_fname = f"{file_prefix}_{uid}_rk{rank}"
662 |     # if glob(f"{output_fname}*"):
663 |     # for fp in glob(f"{file_prefix}_{uid}*"):
664 |     for fp in glob(f"{output_fname}*"):
665 |         try: 
666 |             tmp = json.load(open(fp))
667 |             if "alist" in tmp:
668 |                 print(f"Skipping existing item: {uid}")
669 |                 return True
670 |         except Exception as e:
671 |             print(e)
672 |             print(f'wrong loading {fp}')
673 |             continue
674 |     
675 |     stop_thresh = configs['processing']['stop_thresh']
676 |     max_step = configs['processing']['max_step']
677 |     num_expansion = configs['processing']['num_expansion']
678 |     q = item['question']
679 |     is_chinese = contains_chinese(q)
680 |     has_think = "</think>" in item['solution']
681 |     if has_think:
682 |         ref = item['solution'].split('</think>')[-1].strip() 
683 |     else:
684 |         ref = item['solution']
685 |     node = Node(ref=ref, raw_q=item['question'], info={'uid': item['extra_info']['index'], 'old_solution': item['solution'] if has_think else None, 'is_chinese': is_chinese})
686 |     node.memory.update({k:v for k,v in item.items() if not isinstance(v, np.ndarray) if k not in {'solution'}})
687 |     
688 |     fname = output_fname
689 | 
690 |     tok, model = lm_tokenizer, lm_model
691 | 
692 |     # 1. Initial Rollout
693 |     # n_sample = 1
694 |     generation_role = 'initial_thinking_'+('cn' if is_chinese else 'en')
695 |     if is_chinese:
696 |         if np.random.uniform()>0.5: pre_trigger = "好的"
697 |         else: pre_trigger = "嗯"
698 |     else:
699 |         if np.random.uniform()>0.8: pre_trigger = "Okay, I am given"
700 |         elif np.random.uniform()>0.4: pre_trigger = "Alright, the user"
701 |         else: pre_trigger = "Alright"
702 |     rollout_texts, rollout_logps, rollout_offsets, real_input_prompts = direct_rollout(node, "<think>\n"+pre_trigger, n_sample, role=generation_role, log=True)
703 |     # print(rollout_texts[0])
704 |     outcomes = []
705 |     expanded_prompts = [pp for pp in real_input_prompts for _ in range(n_sample)]
706 |     flag = True
707 |     for roll, inpprompt, token_logps, token_to_text_offsets in zip(rollout_texts, expanded_prompts, rollout_logps, rollout_offsets):
708 |         ntoken = len(token_logps)
709 |         if ntoken<100: 
710 |             print("num token too short, skip")
711 |             return False
712 |         info = dict(ntokens=ntoken,)
713 |         
714 |         # separate thinking and code answer
715 |         roll = pre_trigger + roll
716 |         thinking = roll.split('</think>')[0]
717 |         answer_code = roll.split('<answer>')[-1].split('</answer>')[-1].strip()
718 | 
719 |         # breakdown thinking to steps 
720 |         aa,bb,thinking_segments = breakdown_steps(thinking)
721 |         if len(thinking_segments)==1:
722 |             return False
723 |         # get PPL of initial rollout
724 |         role, sysp = 'standard_inference_'+('cn' if is_chinese else 'en'), default_sys 
725 |         inp = [[node.memory['q']]]
726 |         manager = PosteriorManager(role, sysp, inp, post_tokenizer, post_model, node.memory['ref'])
727 | 
728 |         posterior_prefix1, posterior_prefix = manager.prepare(think_prefix, thinking)
729 |         rollouts_obj, real_input_prompts = manager.submit(posterior_prefix)
730 |         noreplace_log_ppl = manager.compute(rollouts_obj)
731 |         ppl = noreplace_log_ppl
732 |         if ppl[0]<stop_thresh:
733 |             flag = False
734 |             print('early stopping')
735 | 
736 |             
737 |         # save info
738 |         info.update(dict(initial_response=roll, 
739 |                             answer=answer_code,
740 |                             initial_ppl=ppl, 
741 |                             generator=model.model_name,
742 |                             thinking_segments=thinking_segments,
743 |                         #  rawinput=inpprompt, 
744 |                     ))
745 |     
746 |         outcomes.append(info)
747 | 
748 |     if flag: 
749 |         # 2. iterative refinement for each response
750 |         refinement_processor = RefinementProcessor(
751 |             node, tok, model, post_tokenizer, post_model,
752 |             stop_threshold=stop_thresh, 
753 |             max_steps=max_step,
754 |             num_expansion=num_expansion
755 |         )
756 |         
757 |         updated_results = []
758 |         for info in outcomes:
759 |             refined_info = refinement_processor.run(info)
760 |             updated_results.append(refined_info if refined_info is not None else info)
761 |         
762 |         node.memory['alist'] = updated_results
763 |     
764 |         flag = False 
765 |     else:
766 |         node.memory['alist'] = outcomes
767 | 
768 |     result_file = f"{fname}_meta.jsonl"
769 |     with open(result_file, "w") as file:
770 |         line = json.dumps(node.memory,indent=2,ensure_ascii=False)
771 | 
772 |         file.write(line + "\n")
773 |     print(f'dumped to {result_file}')
774 |       
775 |     return flag
776 |         
777 | 
778 | def main(args: argparse.Namespace):
779 |     """Main function to load data, initialize models, and run the processing loop."""
780 |     config = load_config(args.config_file)
781 |     file_prefix = config["output"]["file_prefix"]
782 |     folder = os.path.sep.join(file_prefix.split(os.path.sep)[:-1])
783 |     os.makedirs(folder, exist_ok=True)
784 |     num_rollouts = config["processing"]["num_rollouts"]
785 | 
786 |     # Create output directory
787 |     os.makedirs(os.path.dirname(config["output"]["file_prefix"]), exist_ok=True)
788 |     
789 |     # Initialize Ray
790 |     ray.init(num_cpus=args.num_cpus)
791 | 
792 |     # Load models and tokenizers
793 |     print("Loading models and tokenizers...")
794 |     global lm_tokenizer, lm_model, post_model, post_tokenizer
795 |     config["model"]["model_args"]["port"] = args.port
796 |     config["judge_model"]["model_args"]["port"] = f"2{args.port}"
797 |     config["model"]["model_name"] = args.model 
798 |     if args.posterior_model!="":
799 |         config["judge_model"]["model_name"] = args.posterior_model 
800 |     lm_tokenizer = transformers.AutoTokenizer.from_pretrained(
801 |         config["model"]["model_name"],
802 |         use_fast=True,
803 |         padding_side="right",
804 |         truncation_side="right",
805 |     )
806 |     lm_model = LM(
807 |         model_type=config["model"]["model_type"],
808 |         model_name=config["model"]["model_name"],
809 |         model_url=None, # lm_url,
810 |         num_rollouts=num_rollouts,
811 |         tokenizer=lm_tokenizer,
812 |         **config["model"]["model_args"],
813 |     )
814 |     if config["judge_model"]["use"]:
815 |         post_tokenizer = transformers.AutoTokenizer.from_pretrained(
816 |             config["judge_model"]["model_name"],
817 |             use_fast=True,
818 |             padding_side="right",
819 |             truncation_side="right",
820 |         )
821 |         post_model = LM(
822 |             model_type=config["judge_model"]["model_type"],
823 |             model_name=config["judge_model"]["model_name"],
824 |             model_url=None, # lm_url,
825 |             num_rollouts=num_rollouts,
826 |             tokenizer=lm_tokenizer,
827 |             **config["judge_model"]["model_args"],
828 |         )
829 |     else:
830 |         post_tokenizer, post_model = lm_tokenizer, lm_model
831 |     path = config['input']['file_path']
832 |     if path.endswith("parquet"):
833 |         df = pd.read_parquet(path)
834 |         data = df.to_dict('records')
835 |         
836 |     else:
837 |         data = json.load(open(path))
838 |         for entry in data:
839 |             entry['extra_info'] = {'index': entry['index']}
840 |     
841 |     num_each = len(data) // args.total_ranks + 1
842 |     rank = args.rank%args.total_ranks
843 |     rank_data = data[rank * num_each : (rank + 1) * num_each]
844 |     # np.random.shuffle(rank_data)
845 |     print(f"Rank {args.rank}/{args.total_ranks}: Processing {len(rank_data)} items.")
846 |     st = time.time()
847 |     num_para = 400
848 |     for idx in range(0, len(rank_data), num_para):
849 |         print(f"starting batch: {idx} ")
850 |         rank_data_ = rank_data[idx:idx+num_para]
851 |         task_futures = [
852 |             process_item.remote(
853 |                     item,
854 |                     file_prefix,
855 |                     args.rank,
856 |                     num_rollouts,
857 |                     configs=config
858 |                 )
859 |             for item in rank_data_
860 |         ]
861 | 
862 |         # Wait for all tasks to complete
863 |         ray.get(task_futures)
864 |         print(f"Total time elapsed for {num_para} queries: {(time.time() - st) / 60:.2f} minutes.")
865 |     
866 |     ray.shutdown()
867 |     
868 |     logging.info("Finished processing the JSON file.")
869 |     
870 | 
871 | if __name__ == "__main__":
872 |     parser = argparse.ArgumentParser(description="Run language model generation and refinement pipeline.")
873 |     parser.add_argument("--config_file", type=str, help="Path to the YAML configuration file.")
874 |     parser.add_argument("--model", type=str, required=True, help="Path to generator.")
875 |     parser.add_argument("--posterior_model", type=str, default="", help="Path to generator.")
876 |     parser.add_argument("--port", type=int, required=True, help="Port.")
877 |     parser.add_argument("--rank", type=int, required=True, help="Rank of the current process.")
878 |     parser.add_argument("--total-ranks", type=int, required=True, help="Total number of parallel processes.")
879 |     parser.add_argument("--num-cpus", type=int, default=32, help="Number of CPUs to allocate for Ray.")
880 |     
881 |     args = parser.parse_args()
882 |     main(args)
883 | 


--------------------------------------------------------------------------------