├── .gitignore
├── .gitmodules
├── LICENSE
├── README.md
├── README_zh-CN.md
├── assets
│   ├── intro.jpeg
│   ├── logo.png
│   └── result.png
├── examples
│   ├── data_preprocess
│   │   ├── am_general.py
│   │   ├── full_hh_rlhf.py
│   │   └── math.py
│   ├── ppo
│   │   ├── llama3-8b_general.sh
│   │   ├── llama3-8b_hh-rlhf.sh
│   │   ├── llama3-8b_math.sh
│   │   ├── qwen2_5-7b_general.sh
│   │   ├── qwen2_5-7b_hh-rlhf.sh
│   │   ├── qwen2_5-7b_math.sh
│   │   ├── qwen3-8b_general.sh
│   │   ├── qwen3-8b_hh-rlhf.sh
│   │   └── qwen3-8b_math.sh
│   └── xtuner_configs
│       ├── POLAR_1_8B_full_varlenattn_custom_dataset.py
│       └── POLAR_7B_full_varlenattn_custom_dataset.py
└── src
    └── polar
        ├── __init__.py
        └── reward_func.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[codz]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py.cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # UV
98 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | #uv.lock
102 |
103 | # poetry
104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105 | # This is especially recommended for binary packages to ensure reproducibility, and is more
106 | # commonly ignored for libraries.
107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108 | #poetry.lock
109 | #poetry.toml
110 |
111 | # pdm
112 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
113 | # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
114 | # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
115 | #pdm.lock
116 | #pdm.toml
117 | .pdm-python
118 | .pdm-build/
119 |
120 | # pixi
121 | # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
122 | #pixi.lock
123 | # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
124 | # in the .venv directory. It is recommended not to include this directory in version control.
125 | .pixi
126 |
127 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
128 | __pypackages__/
129 |
130 | # Celery stuff
131 | celerybeat-schedule
132 | celerybeat.pid
133 |
134 | # SageMath parsed files
135 | *.sage.py
136 |
137 | # Environments
138 | .env
139 | .envrc
140 | .venv
141 | env/
142 | venv/
143 | ENV/
144 | env.bak/
145 | venv.bak/
146 |
147 | # Spyder project settings
148 | .spyderproject
149 | .spyproject
150 |
151 | # Rope project settings
152 | .ropeproject
153 |
154 | # mkdocs documentation
155 | /site
156 |
157 | # mypy
158 | .mypy_cache/
159 | .dmypy.json
160 | dmypy.json
161 |
162 | # Pyre type checker
163 | .pyre/
164 |
165 | # pytype static type analyzer
166 | .pytype/
167 |
168 | # Cython debug symbols
169 | cython_debug/
170 |
171 | # PyCharm
172 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
173 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
174 | # and can be added to the global gitignore or merged into this file. For a more nuclear
175 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
176 | #.idea/
177 |
178 | # Abstra
179 | # Abstra is an AI-powered process automation framework.
180 | # Ignore directories containing user credentials, local state, and settings.
181 | # Learn more at https://abstra.io/docs
182 | .abstra/
183 |
184 | # Visual Studio Code
185 | # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
186 | # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
187 | # and can be added to the global gitignore or merged into this file. However, if you prefer,
188 | # you could uncomment the following to ignore the entire vscode folder
189 | # .vscode/
190 |
191 | # Ruff stuff:
192 | .ruff_cache/
193 |
194 | # PyPI configuration file
195 | .pypirc
196 |
197 | # Cursor
198 | # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
199 | # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
200 | # refer to https://docs.cursor.com/context/ignore-files
201 | .cursorignore
202 | .cursorindexingignore
203 |
204 | # Marimo
205 | marimo/_static/
206 | marimo/_lsp/
207 | __marimo__/
208 |
209 | # outputs
210 | outputs/
211 |
212 | # vscode
213 | .vscode/
214 |
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "verl"]
2 | path = verl
3 | url = https://github.com/volcengine/verl
4 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright 2023-2025 Shanghai AI Laboratory
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
203 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
204 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
205 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
206 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
207 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
208 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
209 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
210 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
211 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
212 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
213 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | ![logo](./assets/logo.png)
4 |
5 |
6 | [license](./LICENSE)
7 | [xtuner](https://github.com/InternLM/xtuner/)
8 | [lmdeploy](https://github.com/InternLM/lmdeploy/)
9 | [sglang](https://github.com/sgl-project/sglang/)
10 | [vllm](https://github.com/vllm-project/vllm/)
11 | [verl](https://github.com/volcengine/verl)
12 |
13 |
14 | [🤗 HuggingFace](https://huggingface.co/collections/internlm/polar-68693f829d2e83ac5e6e124a) |
15 | [🤖 ModelScope](https://www.modelscope.cn/organization/Shanghai_AI_Laboratory) |
16 | [📜 Paper](https://arxiv.org/abs/2507.05197)
17 |
18 |
19 | [English](./README.md) |
20 | [简体中文](./README_zh-CN.md)
21 |
22 |
23 |
24 | # Latest News 🎉
25 |
26 | - **[2025/09]** Our POLAR paper has been accepted by NeurIPS 2025.
27 | - **[2025/09]** POLAR now supports RFT (Reinforcement Fine-tuning) training using VERL.
28 |
29 |
30 | # Introduction
31 |
32 | POLAR represents a significant breakthrough in scalar-based reward models achieved through large-scale pre-training. It leverages the innovative **POL**icy Discrimin**A**tive Lea**R**ning (**POLAR**) paradigm, a scalable, high-level optimization objective, to effectively discriminate between policies using large-scale synthetic corpora. Following pre-training, POLAR RMs are fine-tuned with minimal preference data, rapidly aligning with human preferences. Key features of POLAR include:
33 |
34 | * **Innovative Pre-training Paradigm:** POLAR trains a reward model to discern identical policies and discriminate different ones. Unlike traditional reward modeling methods relying on absolute preferences, POLAR captures the relative difference between two policies, which is a scalable, high-level optimization objective suitable for modeling generic ranking relationships.
35 |
36 | * **Tailored for Reinforcement Fine-tuning:** POLAR assigns rewards to LLM trajectories based on given references, perfectly aligning with the Reinforcement Fine-tuning (RFT) framework. POLAR provides a promising solution for applying RFT in generic scenarios.
37 |
38 | * **Superior Performance and Generalization:** POLAR achieves state-of-the-art results on downstream reinforcement learning tasks, consistently delivering accurate and reliable reward signals that generalize effectively to unseen scenarios and significantly reducing reward hacking.
39 |
40 | * **Easy to Customize:** Pre-trained checkpoints of POLAR are available, enabling researchers to conveniently fine-tune the RM for various customized scenarios, thus facilitating straightforward adaptation and expansion tailored to specific applications and experimental requirements.
41 |
42 |
43 | ![intro](./assets/intro.jpeg)
44 |
45 |
46 | # Model Zoo
47 |
48 | We release POLAR reward models in sizes of 1.8B and 7B parameters. The "base" models (POLAR-1.8B-Base and POLAR-7B-Base) are pre-trained-only checkpoints, ideal for customized fine-tuning according to specific preferences. The "ready-to-use" checkpoints (POLAR-1.8B and POLAR-7B) have already been fine-tuned on general preference data, making them suitable for immediate use in most scenarios.
49 |
50 | | Model | Transformers (HF) | ModelScope |
51 | | -------------------------- | ------------------------------------------ | ---------------------------------------- |
52 | | **POLAR-1.8B-Base** | [🤗 POLAR-1_8B-Base](https://huggingface.co/internlm/POLAR-1_8B-Base) | [🤖 POLAR-1_8B-Base](https://modelscope.cn/models/Shanghai_AI_Laboratory/POLAR-1_8B-Base/summary) |
53 | | **POLAR-1.8B** | [🤗 POLAR-1_8B](https://huggingface.co/internlm/POLAR-1_8B) | [🤖 POLAR-1_8B](https://modelscope.cn/models/Shanghai_AI_Laboratory/POLAR-1_8B/summary) |
54 | | **POLAR-7B-Base** | [🤗 POLAR-7B-Base](https://huggingface.co/internlm/POLAR-7B-Base) | [🤖 POLAR-7B-Base](https://modelscope.cn/models/Shanghai_AI_Laboratory/POLAR-7B-Base/summary) |
55 | | **POLAR-7B** | [🤗 POLAR-7B](https://huggingface.co/internlm/POLAR-7B) | [🤖 POLAR-7B](https://modelscope.cn/models/Shanghai_AI_Laboratory/POLAR-7B/summary) |
56 |
57 |
58 | # Performance
59 |
60 | We conducted a comprehensive evaluation of POLAR via the Proximal Policy Optimization (PPO) algorithm, measuring the downstream RL performance of four different policy models with [OpenCompass](https://github.com/internLM/OpenCompass/). More details are available in our [Paper](https://arxiv.org/abs/2507.05197).
61 |
61 |
62 | ![result](./assets/result.png)
63 |
64 | # Quick Start
65 |
66 | This repository provides a `RewardModelClient` class (`src/polar/reward_func.py`) for querying reward values from a remote POLAR server. It handles input encoding and communication with different backends (sglang, vllm, lmdeploy), and returns the reward scores.
67 |
68 | ```python
69 | from src.polar import RewardModelClient
70 | ```
71 |
72 | Optionally, you can use [XTuner](https://github.com/InternLM/xtuner)'s implementation by installing XTuner and importing the class from it.
73 |
74 | ```python
75 | from xtuner.utils import RewardModelClient
76 | ```
77 |
78 | For XTuner installation instructions, see the [Fine-tune](#fine-tune) section below.
79 |
80 | ## Inference
81 |
82 | We support reward inference through [lmdeploy](https://github.com/InternLM/lmdeploy/), [sglang](https://github.com/sgl-project/sglang/), and [vllm](https://github.com/vllm-project/vllm/). We recommend setting up a virtual environment with conda when using these inference engines to prevent potential dependency conflicts.
83 |
84 | ### Data format
85 |
86 | Unlike traditional reward models, POLAR requires an additional reference trajectory as a demonstration and evaluates candidate trajectories by measuring their consistency with the provided reference.
87 |
88 | ```python
89 | data = [
90 | {
91 | "prompt": [{"role": "user", "content": "What is the capital of China?"}],
92 | "reference": [{"role": "assistant", "content": "Beijing."}],
93 | "output": [{"role": "assistant", "content": "Beijing."}]
94 | },
95 | {
96 | "prompt": [{"role": "user", "content": "What is the capital of China?"}],
97 | "reference": [{"role": "assistant", "content": "Beijing."}],
98 | "output": [{"role": "assistant", "content": "Shanghai."}]
99 | }
100 | ]
101 | ```
102 |
103 | ### Inference with transformers
104 |
105 | #### Reward request
106 | To load the POLAR model using transformers, use the following code to get rewards:
107 |
108 | ```python
109 | from transformers import AutoModel, AutoTokenizer
110 | from src.polar import RewardModelClient
111 | # from xtuner.utils import RewardModelClient
112 |
113 | model_name = 'internlm/POLAR-7B'
114 |
115 | model = AutoModel.from_pretrained(
116 | model_name,
117 | device_map="cuda",
118 | trust_remote_code=True
119 | )
120 | tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
121 |
122 | client = RewardModelClient(model_name)
123 | encoded_data = client.encode(data)
124 | batch = tokenizer(encoded_data, return_tensors='pt', padding=True).to('cuda')
125 | outputs = model(**batch)
126 | rewards = outputs[0].squeeze(-1).cpu().tolist()
127 | print(rewards)
128 | # [-0.5702977776527405, -11.030370712280273] for previous example data
129 | ```
130 |
131 | ### Inference with lmdeploy
132 |
133 | [LMDeploy](https://github.com/InternLM/lmdeploy) is a toolkit for compressing, deploying, and serving LLMs.
134 |
135 | #### Requirements
136 |
137 | - lmdeploy >= 0.9.1
138 |
139 | #### Server Launch
140 |
141 | ```bash
142 | lmdeploy serve api_server internlm/POLAR-7B --backend pytorch --server-port 30000
143 | ```
144 | #### Client Request
145 |
146 | ```python
147 | from src.polar import RewardModelClient
148 | # from xtuner.utils import RewardModelClient
149 |
150 | client = RewardModelClient("internlm/POLAR-7B",
151 | server_type="lmdeploy",
152 | server_address="127.0.0.1:30000")
153 |
154 | # Request rewards directly
155 | rewards = client(data)
156 | print(rewards)
157 |
158 | # First encode data and then get rewards via the request function.
159 | encoded_data = client.encode(data)
160 | rewards = client.lmdeploy_request_reward(encoded_data)
161 | print(rewards)
162 | ```
163 |
164 | ### Inference with sglang
165 |
166 | #### Requirements
167 |
168 | - 0.4.3.post4 <= sglang <= 0.4.4.post1
169 |
170 | #### Server Launch
171 |
172 | ```bash
173 | python3 -m sglang.launch_server --model internlm/POLAR-7B --trust-remote-code --is-embedding --dp 4 --tp 2 --mem-fraction-static 0.9 --port 30000
174 | ```
175 |
176 | #### Client Request
177 |
178 | ```python
179 | from src.polar import RewardModelClient
180 | # from xtuner.utils import RewardModelClient
181 |
182 | client = RewardModelClient("internlm/POLAR-7B",
183 | server_type="sglang",
184 | server_address="127.0.0.1:30000")
185 |
186 | # Request rewards directly
187 | rewards = client(data)
188 | print(rewards)
189 |
190 | # First encode data and then get rewards via the request function.
191 | encoded_data = client.encode(data)
192 | rewards = client.sglang_request_reward(encoded_data)
193 | print(rewards)
194 | ```
195 |
196 | ### Inference with vllm
197 |
198 | #### Requirements
199 |
200 | - vllm >= 0.8.0
201 |
202 | #### Server Launch
203 |
204 | ```bash
205 | vllm serve internlm/POLAR-7B --task=reward --trust-remote-code --tensor-parallel-size=2 --port 30000
206 | ```
207 |
208 | #### Client Request
209 |
210 | ```python
211 | from src.polar import RewardModelClient
212 | # from xtuner.utils import RewardModelClient
213 |
214 | client = RewardModelClient("internlm/POLAR-7B",
215 | server_type="vllm",
216 | server_address="127.0.0.1:30000")
217 |
218 | # Request rewards directly
219 | rewards = client(data)
220 | print(rewards)
221 |
222 | # First encode data and then get rewards via the request function.
223 | encoded_data = client.encode(data)
224 | rewards = client.vllm_request_reward(encoded_data)
225 | print(rewards)
226 | ```
227 |
228 | ## RFT with VERL
229 |
230 | POLAR can be easily integrated into various reinforcement learning frameworks. This repository provides an example showing how to use [VERL](https://github.com/volcengine/verl) for reinforcement fine-tuning (RFT) with POLAR reward models.
231 |
232 | ### Environment Setup
233 |
234 | Please refer to the [VERL official installation guide](https://github.com/volcengine/verl) for detailed environment setup instructions.
235 |
236 | > **Note**: For training the Qwen2.5 series, we recommend the inference backend **vLLM 0.8.3** with **Transformers 4.50.3** for optimal performance. Higher Transformers versions may cause training instability for the Qwen2.5 series.
237 |
238 | ### Data Format
239 |
240 | Training data should be in Parquet format with the following structure:
241 | ```python
242 | {
243 | "data_source": "dataset_name",
244 | "prompt": [{"role": "user", "content": "..."}, ...],
245 | "ability": "ability_type",
246 | "reward_model": {
247 | "style": "polar",
248 | "ground_truth": [{"role": "assistant", "content": "..."}]
249 | },
250 | "extra_info": {
251 | # Same as "prompt"; kept for compatibility between verl and POLAR.
252 | "prompt": [{"role": "user", "content": "..."}, ...],
253 | }
254 | }
255 | ```
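
For reference, below is a minimal sketch of how records in this structure could be written to Parquet, assuming `pandas` with a `pyarrow` backend is available; the record contents are placeholders, and the scripts under `examples/data_preprocess/` remain the reference implementation.

```python
import pandas as pd  # assumes pandas + pyarrow are installed

# Hypothetical record following the structure above.
record = {
    "data_source": "full_hh_rlhf",
    "prompt": [{"role": "user", "content": "What is the capital of China?"}],
    "ability": "general",
    "reward_model": {
        "style": "polar",
        "ground_truth": [{"role": "assistant", "content": "Beijing."}],
    },
    "extra_info": {
        # Same as "prompt"; kept for compatibility between verl and POLAR.
        "prompt": [{"role": "user", "content": "What is the capital of China?"}],
    },
}

# Collect many such records, then write them to a Parquet file for VERL training.
pd.DataFrame([record]).to_parquet("train.parquet")
```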
256 |
257 | ### Training steps
258 |
259 | - **Step 1:** POLAR Deployment
260 |
261 | Deploy the POLAR reward model following the above [Inference](#inference) instructions. Update the server configuration in `src/polar/reward_func.py`:
262 |
263 | ```python
264 | # Config reward model server
265 | ADDRESS = "your_server_ip:port" # Modify according to your server address
266 | SERVER_TYPE = "sglang" # Options: "sglang", "vllm", "lmdeploy"
267 | MODEL_PATH = "internlm/POLAR-7B"
268 | ```
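
For orientation, the sketch below shows how these constants might feed a VERL-style reward function built on `RewardModelClient`. It is illustrative only and assumes VERL's custom reward function interface (a `compute_score(data_source, solution_str, ground_truth, extra_info)` hook); the actual logic shipped in `src/polar/reward_func.py` may differ in naming and batching.

```python
from src.polar import RewardModelClient

ADDRESS = "your_server_ip:port"   # reward model server address (placeholder)
SERVER_TYPE = "sglang"            # "sglang", "vllm", or "lmdeploy"
MODEL_PATH = "internlm/POLAR-7B"

# One client per process; it encodes inputs and queries the remote POLAR server.
client = RewardModelClient(MODEL_PATH, server_type=SERVER_TYPE, server_address=ADDRESS)

def compute_score(data_source, solution_str, ground_truth, extra_info=None):
    """Score a single rollout against its reference trajectory (illustrative sketch)."""
    data = [{
        "prompt": extra_info["prompt"] if extra_info else [],
        "reference": ground_truth,  # reward_model.ground_truth from the Parquet data
        "output": [{"role": "assistant", "content": solution_str}],
    }]
    # The client returns one reward per data item, as in the Inference examples above.
    return client(data)[0]
```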
269 |
270 | - **Step 2:** Data Preparation
271 |
272 | Prepare your training data in Parquet format. You can use the provided data preprocessing scripts:
273 |
274 | ```bash
275 | # Example: Process HH-RLHF dataset
276 | python examples/data_preprocess/full_hh_rlhf.py --local_dir ~/data/hh_rlhf
277 | ```
278 |
279 | - **Step 3:** Configure Training Script
280 |
281 | An example training script is provided at `examples/ppo/qwen2_5-7b_hh-rlhf.sh`.
282 |
283 | - **Step 4:** Run Training
284 |
285 | ```bash
286 | cd verl
287 | bash ../examples/ppo/qwen2_5-7b_hh-rlhf.sh
288 | ```
289 |
290 | ### Results
291 |
292 | Here we show the RFT results of Qwen3-8B trained with our [official config](https://github.com/InternLM/POLAR/blob/main/examples/ppo/qwen3-8b_general.sh) on the public [AM-DeepSeek-R1-0528-Distilled](https://huggingface.co/datasets/a-m-team/AM-DeepSeek-R1-0528-Distilled) dataset. We use [OpenCompass](https://github.com/internLM/OpenCompass/) for evaluation.
293 |
294 | | Benchmark | Qwen3-8B w. thinking | Qwen3-8B w. thinking (RFT) |
295 | | --- | ---- | ---- |
296 | | alignment_bench | 7.04 | 7.48 |
297 | | alpaca_eval | 87.20 | 95.40 |
298 | | arenahard | 83.15 | 89.45 |
299 | | followbench | 0.93 | 0.95 |
300 | | mtbench | 8.73 | 8.78 |
301 | | wildbench | 58.43 | 72.09 |
302 | | mmlu | 86.06 | 86.58 |
303 | | mmlu_pro | 73.66 | 75.19 |
304 | | cmmlu | 82.72 | 83.07 |
305 | | bbeh | 29.56 | 33.30 |
306 | | korbench | 73.16 | 75.00 |
307 | | gpqa | 61.05 | 63.07 |
308 | | supergpqa | 47.82 | 49.67 |
309 | | olympiadbench | 69.90 | 70.45 |
310 | | aime2024 | 75.52 | 75.83 |
311 | | aime2025 | 67.50 | 68.71 |
312 | | mbpp | 83.66 | 93.00 |
313 | | lcb-code | 46.86 | 48.57 |
314 |
315 |
316 | ## Fine-tune
317 |
318 | You can use the latest [xtuner](https://github.com/InternLM/xtuner) to fine-tune POLAR. XTuner is an efficient, flexible, and full-featured toolkit for fine-tuning LLMs.
319 |
320 | - It is recommended to create a Python 3.10 virtual environment with conda:
321 |
322 | ```bash
323 | conda create --name xtuner-env python=3.10 -y
324 | conda activate xtuner-env
325 | ```
326 |
327 | - Install xtuner via pip:
328 |
329 | ```shell
330 | pip install 'xtuner[deepspeed]'==0.2.0
331 | ```
332 |
333 | - Alternatively, install xtuner from the latest source code:
334 |
335 | ```shell
336 | pip install 'git+https://github.com/InternLM/xtuner.git@main#egg=xtuner[deepspeed]'
337 | ```
338 |
339 | ### Requirements
340 |
341 | - flash_attn
342 | - tensorboard
343 |
344 | ### Data format
345 |
346 | Unlike traditional reward models, POLAR requires an additional reference trajectory as a demonstration during fine-tuning, along with a chosen trajectory and a rejected trajectory. You can construct your fine-tuning data in a `train.jsonl` file, formatted as follows:
347 |
348 | ```json
349 | {
350 | "prompt": [{"role": "user", "content": "What is the capital of China?"}],
351 | "reference": [{"role": "assistant", "content": "Beijing."}],
352 | "chosen": [{"role": "assistant", "content": "Beijing."}],
353 | "rejected": [{"role": "assistant", "content": "Shanghai."}]
354 | }
355 | ```
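
A minimal sketch of how such a `train.jsonl` file could be produced with the standard library; the record reuses the example above, and the output path is an assumption.

```python
import json

# Hypothetical preference records in the format above; one JSON object per line.
records = [
    {
        "prompt": [{"role": "user", "content": "What is the capital of China?"}],
        "reference": [{"role": "assistant", "content": "Beijing."}],
        "chosen": [{"role": "assistant", "content": "Beijing."}],
        "rejected": [{"role": "assistant", "content": "Shanghai."}],
    },
]

# Write one JSON object per line (JSON Lines format expected by the config).
with open("train.jsonl", "w", encoding="utf-8") as f:
    for record in records:
        f.write(json.dumps(record, ensure_ascii=False) + "\n")
```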
356 |
357 | ### Training steps
358 |
359 | - **Step 0:** Prepare the config. We provide example ready-to-use configs [here](./examples/xtuner_configs/POLAR_7B_full_varlenattn_custom_dataset.py). If the provided configs do not meet your requirements, copy one of them and modify it following the [xtuner guideline](https://github.com/InternLM/xtuner/blob/main/docs/en/get_started/quickstart.md). For more details on reward model training settings, see the xtuner [reward model guideline](https://github.com/InternLM/xtuner/blob/main/docs/en/reward_model/modify_settings.md).
360 |
361 | - **Step 1:** Start fine-tuning.
362 |
363 | ```shell
364 | xtuner train ${CONFIG_FILE_PATH}
365 | ```
366 |
367 | For example, you can start fine-tuning POLAR-7B-Base by running:
368 |
369 | ```shell
370 | # On a single GPU
371 | xtuner train ./examples/xtuner_configs/POLAR_7B_full_varlenattn_custom_dataset.py --deepspeed deepspeed_zero2
372 |
373 | # On multiple GPUs
374 | NPROC_PER_NODE=${GPU_NUM} xtuner train ./examples/xtuner_configs/POLAR_7B_full_varlenattn_custom_dataset.py --deepspeed deepspeed_zero2
375 | ```
376 |
377 | Here, `--deepspeed` enables [DeepSpeed](https://github.com/microsoft/DeepSpeed) to optimize training. XTuner ships with several integrated strategies, including ZeRO-1, ZeRO-2, and ZeRO-3. If you wish to disable this feature, simply remove the argument.
378 |
379 | - **Step 2:** Convert the saved PTH model (a directory if you used DeepSpeed) to a Hugging Face model:
380 |
381 | ```shell
382 | xtuner convert pth_to_hf ${CONFIG_FILE_PATH} ${PTH} ${SAVE_PATH}
383 | ```
384 |
385 | # Examples
386 |
387 | ## Closed-ended questions
388 |
389 | ```python
390 | from src.polar import RewardModelClient
391 | # from xtuner.utils import RewardModelClient
392 |
393 | prompt = "How many 'r's are there in the word 'strawberry'?"
394 | reference = "There are 3 'r's in the word 'strawberry'. Here's how we can count them: 's', 't', 'r', 'a', 'w', 'b', 'e', 'r', 'r', 'y'. So, the answer is 3."
395 | outputs = [
396 | # Same as the reference response.
397 | "There are 3 'r's in the word 'strawberry'. Here's how we can count them: 's', 't', 'r', 'a', 'w', 'b', 'e', 'r', 'r', 'y'. So, the answer is 3.",
398 | # Correct answer with correct thoughts.
399 | "Let's count the 'r's in 'strawberry': 's', 't', 'r', 'a', 'w', 'b', 'e', 'r', 'r', 'y'. There are three 'r's, so the answer is three.",
400 | # Wrong answer with wrong thoughts.
401 | "Let's count the 'r's in 'strawberry': 's', 't', 'r', 'a', 'w', 'b', 'e', 'r', 'r', 'y'. There are two 'r's, so the answer is two.",
402 | # Wrong answer with correct thoughts.
403 | "Let's count the 'r's in 'strawberry': 's', 't', 'r', 'a', 'w', 'b', 'e', 'r', 'r', 'y'. There are three 'r's, so the answer is two.",
404 | # Correct answer with wrong thoughts.
405 | "Let's count the 'r's in 'strawberry': 's', 't', 'r', 'a', 'w', 'b', 'e', 'r', 'r', 'y'. There are two 'r's, so the answer is three.",
406 | # Correct answer without thoughts.
407 | "There are 3 'r's in the word 'strawberry'.",
408 | # Wrong answer without thoughts.
409 | "There are 2 'r's in the word 'strawberry'.",
410 | ]
411 | data = [{"prompt": prompt, "reference": reference, "output": output} for output in outputs]
412 |
413 | client = RewardModelClient("internlm/POLAR-7B", server_type="sglang", server_address="127.0.0.1:30000")
414 | rewards = client(data)
415 |
416 | sorted_res = sorted(zip(outputs, rewards), key=lambda x: x[1], reverse=True)
417 |
418 | for output, reward in sorted_res:
419 | print(f"Output: {output}\nReward: {reward}\n")
420 | ```
421 |
422 | ```txt
423 | Output: There are 3 'r's in the word 'strawberry'. Here's how we can count them: 's', 't', 'r', 'a', 'w', 'b', 'e', 'r', 'r', 'y'. So, the answer is 3.
424 | Reward: 0.054595947265625
425 |
426 | Output: Let's count the 'r's in 'strawberry': 's', 't', 'r', 'a', 'w', 'b', 'e', 'r', 'r', 'y'. There are three 'r's, so the answer is three.
427 | Reward: -2.005859375
428 |
429 | Output: There are 3 'r's in the word 'strawberry'.
430 | Reward: -6.70703125
431 |
432 | Output: Let's count the 'r's in 'strawberry': 's', 't', 'r', 'a', 'w', 'b', 'e', 'r', 'r', 'y'. There are two 'r's, so the answer is three.
433 | Reward: -7.10546875
434 |
435 | Output: Let's count the 'r's in 'strawberry': 's', 't', 'r', 'a', 'w', 'b', 'e', 'r', 'r', 'y'. There are three 'r's, so the answer is two.
436 | Reward: -7.1328125
437 |
438 | Output: Let's count the 'r's in 'strawberry': 's', 't', 'r', 'a', 'w', 'b', 'e', 'r', 'r', 'y'. There are two 'r's, so the answer is two.
439 | Reward: -8.46875
440 |
441 | Output: There are 2 'r's in the word 'strawberry'.
442 | Reward: -10.8203125
443 | ```
444 |
445 | ## Open-ended questions
446 | ```python
447 | from src.polar import RewardModelClient
448 | # from xtuner.utils import RewardModelClient
449 |
450 | prompt = "Summarize the first book of Frank Herbert’s Dune in one witty short sentence."
451 | reference = "Royal teen discovers that life’s a beach—minus the ocean, plus spice, giant sandworms and deadly politics."
452 | outputs = [
453 | # Same as the reference response.
454 | "Royal teen discovers that life’s a beach—minus the ocean, plus spice, giant sandworms and deadly politics.",
455 | # Closely resembles the reference response but includes factual errors.
456 | "Royal teen discovers that life’s a beach—minus the ocean, plus magic, dark wizards and deadly politics.",
457 | # A distinct yet concise and witty summary that draws analogies from other dramas—markedly different from the reference response.
458 | "Young noble’s move to desert planet turns into galactic Game of Thrones with fewer dragons, more worms.",
459 | # A concise summary, but lacking wit—fails to meet the requirement.
460 | "A noble family’s fall sparks a young heir’s rise as a leader on a harsh desert planet governed by prophecy and survival.",
461 | # A witty summary, but overly long—fails to meet the requirement.
462 | "Paul Atreides loses his father, gains prophetic powers, learns to ride a sandworm, leads a holy war, and discovers that being the chosen one comes with a lot of blood, sand, and questionable decisions.",
463 | # A concise and witty summary that draws from multiple Dune books rather than just the first—fails to follow the instruction.
464 | "Boy gets planet, becomes god, loses soul — family drama ensues across galaxies."
465 | ]
466 | data = [{"prompt": prompt, "reference": reference, "output": output} for output in outputs]
467 |
468 | client = RewardModelClient("internlm/POLAR-7B", server_type="sglang", server_address="127.0.0.1:30000")
469 | rewards = client(data)
470 |
471 | sorted_res = sorted(zip(outputs, rewards), key=lambda x: x[1], reverse=True)
472 |
473 | for output, reward in sorted_res:
474 | print(f"Output: {output}\nReward: {reward}\n")
475 | ```
476 |
477 | ```txt
478 | Output: Royal teen discovers that life’s a beach—minus the ocean, plus spice, giant sandworms and deadly politics.
479 | Reward: 0.466552734375
480 |
481 | Output: Young noble’s move to desert planet turns into galactic Game of Thrones with fewer dragons, more worms.
482 | Reward: -6.91796875
483 |
484 | Output: Royal teen discovers that life’s a beach—minus the ocean, plus magic, dark wizards and deadly politics.
485 | Reward: -7.70703125
486 |
487 | Output: Paul Atreides loses his father, gains prophetic powers, learns to ride a sandworm, leads a holy war, and discovers that being the chosen one comes with a lot of blood, sand, and questionable decisions.
488 | Reward: -8.4296875
489 |
490 | Output: A noble family’s fall sparks a young heir’s rise as a leader on a harsh desert planet governed by prophecy and survival.
491 | Reward: -8.6484375
492 |
493 | Output: Boy gets planet, becomes god, loses soul — family drama ensues across galaxies.
494 | Reward: -10.359375
495 | ```
496 |
497 | # License
498 |
499 | Code and model weights are licensed under Apache-2.0.
500 |
501 | # Citation
502 |
503 | ```
504 | @article{dou2025pretrained,
505 | title={Pre-Trained Policy Discriminators are General Reward Models},
506 | author={Dou, Shihan and Liu, Shichun and Yang, Yuming and Zou, Yicheng and Zhou, Yunhua and Xing, Shuhao and Huang, Chenhao and Ge, Qiming and Song, Demin and Lv, Haijun and others},
507 | journal={arXiv preprint arXiv:2507.05197},
508 | year={2025}
509 | }
510 | ```
511 |
--------------------------------------------------------------------------------
/README_zh-CN.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | ![logo](./assets/logo.png)
4 |
5 |
6 | [license](./LICENSE)
7 | [xtuner](https://github.com/InternLM/xtuner/)
8 | [lmdeploy](https://github.com/InternLM/lmdeploy/)
9 | [sglang](https://github.com/sgl-project/sglang/)
10 | [vllm](https://github.com/vllm-project/vllm/)
11 | [verl](https://github.com/volcengine/verl)
12 |
13 |
14 | [🤗 HuggingFace](https://huggingface.co/collections/internlm/polar-68693f829d2e83ac5e6e124a) |
15 | [🤖 ModelScope](https://www.modelscope.cn/organization/Shanghai_AI_Laboratory) |
16 | [📜 Paper](https://arxiv.org/abs/2507.05197)
17 |
18 |
19 | [English](./README.md) |
20 | [简体中文](./README_zh-CN.md)
21 |
22 |
23 |
24 | # 最新进展 🎉
25 |
26 | - **[2025/09]** POLAR 论文现已被 NeurIPS 2025 会议接收。
27 | - **[2025/09]** POLAR 现已支持使用 VERL 进行 RFT(强化微调)训练。
28 |
29 |
30 | # 简介
31 |
32 | POLAR 是一个经过大规模预训练的奖励模型,在训练范式和模型性能上取得了重大突破。我们利用全新的策略判别学习方法(Policy Discriminative Learning,POLAR),使用大规模合成语料进行高效扩展预训练,使奖励模型能够有效区分不同的语言模型和策略分布。经过预训练的 POLAR 可通过少量的偏好数据进行微调,以快速对齐人类偏好。POLAR 的主要特点包括:
33 |
34 | * **全新的预训练范式**:POLAR 让奖励模型学会识别相同的策略并区分不同的策略。与传统的依赖绝对偏好的奖励建模方法不同,POLAR 能够学习两个策略之间的相对差异,是一种可扩展的、高层次的优化目标。
35 |
36 | * **专为强化学习微调(RFT)设计:** POLAR 根据给定的参考答案为语言模型的输出打分,完美契合强化学习微调(RFT)框架,为强化学习微调在通用场景的应用提供了一种有效解决方案。
37 |
38 | * **卓越的性能与泛化能力:** POLAR 在下游强化学习任务中展现出领先的水平,可稳定地提供准确可靠的奖励信号。POLAR 具有极强的泛化能力,可有效泛化到分布外场景,并显著减少奖励黑客(Reward Hacking)的现象。
39 |
40 | * **易于定制化:** 我们提供了 POLAR 的预训练权重(POLAR-Base)。研究人员可以根据自身需求,便捷地对其进行微调以适配各种定制化场景。
41 |
42 | ![intro](./assets/intro.jpeg)
43 |
44 |
45 | # 模型库
46 |
47 | 此次发布的 POLAR 模型参数规模分别为 1.8B 和 7B。**POLAR-1.8B-Base** 和 **POLAR-7B-Base** 是仅经过预训练阶段的权重,适合根据特定需求进行微调。**POLAR-1.8B** 和 **POLAR-7B** 是经过偏好微调的奖励模型,可开箱即用,适用于大部分通用场景。
48 |
49 | | 模型 | Transformers (HF) | ModelScope |
50 | | -------------------------- | ------------------------------------------ | ---------------------------------------- |
51 | | **POLAR-1.8B-Base** | [🤗 POLAR-1_8B-Base](https://huggingface.co/internlm/POLAR-1_8B-Base) | [🤖 POLAR-1_8B-Base](https://modelscope.cn/models/Shanghai_AI_Laboratory/POLAR-1_8B-Base/summary) |
52 | | **POLAR-1.8B** | [🤗 POLAR-1_8B](https://huggingface.co/internlm/POLAR-1_8B) | [🤖 POLAR-1_8B](https://modelscope.cn/models/Shanghai_AI_Laboratory/POLAR-1_8B/summary) |
53 | | **POLAR-7B-Base** | [🤗 POLAR-7B-Base](https://huggingface.co/internlm/POLAR-7B-Base) | [🤖 POLAR-7B-Base](https://modelscope.cn/models/Shanghai_AI_Laboratory/POLAR-7B-Base/summary) |
54 | | **POLAR-7B** | [🤗 POLAR-7B](https://huggingface.co/internlm/POLAR-7B) | [🤖 POLAR-7B](https://modelscope.cn/models/Shanghai_AI_Laboratory/POLAR-7B/summary) |
55 |
56 |
57 | # 性能
58 |
59 | 我们通过 Proximal Policy Optimization(PPO)算法对 POLAR 的使用效果进行了验证,评测了四种语言模型的下游强化学习性能,评测工具是 [OpenCompass](https://github.com/internLM/OpenCompass/) 。详细信息请参阅[论文](https://arxiv.org/abs/2507.05197)。
60 |
61 | ![result](./assets/result.png)
62 |
63 | # 快速开始
64 |
65 | ## 安装
66 |
67 | 本仓库提供了一个`RewardModelClient`类(`src/polar/reward_func.py`),用于向远程 POLAR 服务请求奖励分数。该类负责对输入的文本进行编码,支持与多种推理后端(sglang、vllm、lmdeploy)进行通信,并返回奖励分数。
68 |
69 | ```python
70 | from src.polar import RewardModelClient
71 | ```
72 |
73 | 您也可以选择使用 [XTuner](https://github.com/InternLM/xtuner) 提供的实现,只需安装 XTuner 并从中导入该类:
74 |
75 | ```python
76 | from xtuner.utils import RewardModelClient
77 | ```
78 |
79 | 关于 XTuner 的安装方法,请参考下方的[偏好微调](#偏好微调)部分。
80 |
81 |
82 | ## 推理
83 |
84 | 我们支持通过 [lmdeploy](https://github.com/InternLM/lmdeploy/)、[sglang](https://github.com/sgl-project/sglang/)、[vllm](https://github.com/vllm-project/vllm/) 对 POLAR 进行推理并获取奖励信号。建议在使用这些推理引擎时,创建 conda 虚拟环境,以避免可能出现的依赖冲突问题。
85 |
86 | ### 数据格式
87 |
88 | 与传统奖励模型不同,POLAR 需要额外的参考答案。POLAR 对模型输出轨迹与参考答案的一致性进行评估,并给出奖励分数。
89 |
90 | ```python
91 | data = [
92 | {
93 | "prompt": [{"role": "user", "content": "What is the capital of China?"}],
94 | "reference": [{"role": "assistant", "content": "Beijing."}],
95 | "output": [{"role": "assistant", "content": "Beijing."}]
96 | },
97 | {
98 | "prompt": [{"role": "user", "content": "What is the capital of China?"}],
99 | "reference": [{"role": "assistant", "content": "Beijing."}],
100 | "output": [{"role": "assistant", "content": "Shanghai."}]
101 | }
102 | ]
103 | ```
104 |
105 | ### 使用 transformers 进行推理
106 |
107 | #### 示例代码
108 |
109 | ```python
110 | from transformers import AutoModel, AutoTokenizer
111 | from src.polar import RewardModelClient
112 | # from xtuner.utils import RewardModelClient
113 |
114 | model_name = 'internlm/POLAR-7B'
115 |
116 | model = AutoModel.from_pretrained(
117 | model_name,
118 | device_map="cuda",
119 | trust_remote_code=True
120 | )
121 | tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
122 |
123 | client = RewardModelClient(model_name)
124 | encoded_data = client.encode(data)
125 | batch = tokenizer(encoded_data, return_tensors='pt', padding=True).to('cuda')
126 | outputs = model(**batch)
127 | rewards = outputs[0].squeeze(-1).cpu().tolist()
128 | print(rewards)
129 | # [-0.5702977776527405, -11.030370712280273] for previous example data
130 | ```
131 |
132 | ### 使用 lmdeploy 进行推理
133 |
134 | [LMDeploy](https://github.com/InternLM/lmdeploy) 是一个高效压缩、部署语言模型的工具。
135 |
136 | #### 环境依赖
137 |
138 | - lmdeploy >= 0.9.1
139 |
140 | #### 启动服务端
141 |
142 | ```bash
143 | lmdeploy serve api_server internlm/POLAR-7B --backend pytorch --server-port 30000
144 | ```
145 | #### 客户端请求示例
146 |
147 | ```python
148 | from src.polar import RewardModelClient
149 | # from xtuner.utils import RewardModelClient
150 |
151 | client = RewardModelClient("internlm/POLAR-7B",
152 | server_type="lmdeploy",
153 | server_address="127.0.0.1:30000")
154 |
155 | # Request rewards directly
156 | rewards = client(data)
157 | print(rewards)
158 |
159 | # First encode data and then get rewards via the request function.
160 | encoded_data = client.encode(data)
161 | rewards = client.lmdeploy_request_reward(encoded_data)
162 | print(rewards)
163 | ```
164 |
165 | ### 使用 sglang 进行推理
166 |
167 | #### 环境依赖
168 |
169 | - 0.4.3.post4 <= sglang <= 0.4.4.post1
170 |
171 | #### 启动服务端
172 |
173 | ```bash
174 | python3 -m sglang.launch_server --model internlm/POLAR-7B --trust-remote-code --is-embedding --dp 4 --tp 2 --mem-fraction-static 0.9 --port 30000
175 | ```
176 |
177 | #### 客户端请求示例
178 |
179 | ```python
180 | from src.polar import RewardModelClient
181 | # from xtuner.utils import RewardModelClient
182 |
183 | client = RewardModelClient("internlm/POLAR-7B",
184 | server_type="sglang",
185 | server_address="127.0.0.1:30000")
186 |
187 | # Request rewards directly
188 | rewards = client(data)
189 | print(rewards)
190 |
191 | # First encode data and then get rewards via the request function.
192 | encoded_data = client.encode(data)
193 | rewards = client.sglang_request_reward(encoded_data)
194 | print(rewards)
195 | ```
196 |
197 | ### 使用 vllm 进行推理
198 |
199 | #### 环境依赖
200 |
201 | - vllm >= 0.8.0
202 |
203 | #### 启动服务端
204 |
205 | ```bash
206 | vllm serve internlm/POLAR-7B --task=reward --trust-remote-code --tensor-parallel-size=2 --port 30000
207 | ```
208 |
209 | #### 客户端请求示例
210 |
211 | ```python
212 | from src.polar import RewardModelClient
213 | # from xtuner.utils import RewardModelClient
214 |
215 | client = RewardModelClient("internlm/POLAR-7B",
216 | server_type="vllm",
217 | server_address="127.0.0.1:30000")
218 |
219 | # Request rewards directly
220 | rewards = client(data)
221 | print(rewards)
222 |
223 | # First encode data and then get rewards via the request function.
224 | encoded_data = client.encode(data)
225 | rewards = client.vllm_request_reward(encoded_data)
226 | print(rewards)
227 | ```
228 |
229 | ## 使用 VERL 进行强化微调(RFT)
230 |
231 | POLAR 可以方便地接入各类强化学习训练框架。本仓库提供了一个示例,演示如何结合 [VERL](https://github.com/volcengine/verl) 与 POLAR 奖励模型进行强化微调(RFT)。
232 |
233 | ### 环境配置
234 |
235 | 详细的环境配置方法请参考 [VERL 官方安装指南](https://github.com/volcengine/verl)。
236 |
237 | > **注意**: 在训练 Qwen2.5 系列模型时,推荐使用推理后端 **vLLM 0.8.3** 搭配 **Transformers 4.50.3**,以获得最佳性能。更高版本的 Transformers 可能会导致 Qwen2.5 系列训练不稳定。
238 |
239 | ### 数据格式
240 |
241 | 训练数据应为 Parquet 格式,结构如下:
242 | ```python
243 | {
244 | "data_source": "dataset_name",
245 | "prompt": [{"role": "user", "content": "..."}, ...],
246 | "ability": "ability_type",
247 | "reward_model": {
248 | "style": "polar",
249 | "ground_truth": [{"role": "assistant", "content": "..."}]
250 | },
251 | "extra_info": {
252 | # 与 prompt 相同,用于兼容 VERL 与 POLAR
253 | "prompt": [{"role": "user", "content": "..."}, ...],
254 | }
255 | }
256 | ```
257 |
258 | ### 训练步骤
259 |
260 | - **Step 1:** 部署 POLAR
261 |
262 | 按照上述[推理](#推理)部分的说明,启动 POLAR 奖励模型服务,并在 `src/polar/reward_func.py` 中更新服务配置:
263 |
264 | ```python
265 | # 配置奖励模型服务
266 | ADDRESS = "your_server_ip:port" # 修改为实际的服务器地址
267 | SERVER_TYPE = "sglang" # 可选:"sglang", "vllm", "lmdeploy"
268 | MODEL_PATH = "internlm/POLAR-7B"
269 | ```
270 |
271 | - **Step 2:** 数据准备
272 |
273 | 将训练数据准备为 Parquet 格式,可使用提供的预处理脚本:
274 |
275 | ```bash
276 | # 示例:处理 HH-RLHF 数据集
277 | python examples/data_preprocess/full_hh_rlhf.py --local_dir ~/data/hh_rlhf
278 | ```
279 |
280 | - **Step 3:** 配置训练脚本
281 |
282 | 示例训练脚本可参考:`examples/ppo/qwen2_5-7b_hh-rlhf.sh`。
283 |
284 | - **Step 4:** 启动训练
285 |
286 | ```bash
287 | cd verl
288 | bash ../examples/ppo/qwen2_5-7b_hh-rlhf.sh
289 | ```
290 |
291 | ### 参考结果
292 |
293 | 这里展示了使用 POLAR-7B 对 Qwen3-8B 进行强化微调的结果,使用了我们提供的[官方配置](https://github.com/InternLM/POLAR/blob/main/examples/ppo/qwen3-8b_general.sh)以及开源的 [AM-DeepSeek-R1-0528-Distilled](https://huggingface.co/datasets/a-m-team/AM-DeepSeek-R1-0528-Distilled) 数据集,评估过程由 [OpenCompass](https://github.com/internLM/OpenCompass/) 完成。
294 |
295 | | 评测集 | Qwen3-8B 思考模式 | Qwen3-8B 思考模式 (RFT) |
296 | | --- | ---- | ---- |
297 | | alignment_bench | 7.04 | 7.48 |
298 | | alpaca_eval | 87.20 | 95.40 |
299 | | arenahard | 83.15 | 89.45 |
300 | | followbench | 0.93 | 0.95 |
301 | | mtbench | 8.73 | 8.78 |
302 | | wildbench | 58.43 | 72.09 |
303 | | mmlu | 86.06 | 86.58 |
304 | | mmlu_pro | 73.66 | 75.19 |
305 | | cmmlu | 82.72 | 83.07 |
306 | | bbeh | 29.56 | 33.30 |
307 | | korbench | 73.16 | 75.00 |
308 | | gpqa | 61.05 | 63.07 |
309 | | supergpqa | 47.82 | 49.67 |
310 | | olympiadbench | 69.90 | 70.45 |
311 | | aime2024 | 75.52 | 75.83 |
312 | | aime2025 | 67.50 | 68.71 |
313 | | mbpp | 83.66 | 93.00 |
314 | | lcb-code | 46.86 | 48.57 |
315 |
316 | ## 偏好微调
317 |
318 | 推荐使用最新的 [xtuner](https://github.com/InternLM/xtuner) 来微调 POLAR。xtuner 是一个高效、灵活、具有多种使用特性的语言模型微调工具。
319 |
320 | - 建议使用 conda 创建 Python-3.10 虚拟环境:
321 |
322 | ```bash
323 | conda create --name xtuner-env python=3.10 -y
324 | conda activate xtuner-env
325 | ```
326 |
327 | - 通过 pip 安装 xtuner:
328 |
329 | ```shell
330 | pip install 'xtuner[deepspeed]'==0.2.0
331 | ```
332 |
333 | - 通过最新源码安装 xtuner:
334 |
335 | ```shell
336 | pip install 'git+https://github.com/InternLM/xtuner.git@main#egg=xtuner[deepspeed]'
337 | ```
338 |
339 | ### 环境依赖
340 |
341 | - flash_attn
342 | - tensorboard
343 |
344 | ### 数据格式
345 |
346 | 与传统的奖励模型不同,除了 chosen 轨迹和 rejected 轨迹,POLAR 在微调过程中还需要一个额外的参考答案作为示范。你可以通过构建一个 `train.jsonl` 的文件来准备微调数据,格式如下:
347 |
348 | ```json
349 | {
350 | "prompt": [{"role": "user", "content": "What is the capital of China?"}],
351 | "reference": [{"role": "assistant", "content": "Beijing."}],
352 | "chosen": [{"role": "assistant", "content": "Beijing."}],
353 | "rejected": [{"role": "assistant", "content": "Shanghai."}]
354 | }
355 | ```
356 |
357 | ### 训练步骤
358 |
359 | - **第一步:** 准备配置文件。我们提供了可直接使用的[示例配置](./examples/xtuner_configs/POLAR_7B_full_varlenattn_custom_dataset.py)。如果需要进一步对超参进行修改,请复制一份示例配置文件,并根据 [xtuner 使用指南](https://github.com/InternLM/xtuner/blob/main/docs/en/get_started/quickstart.md) 进行修改。有关奖励模型训练设置的更多信息,请参考 [xtuner 奖励模型](https://github.com/InternLM/xtuner/blob/main/docs/en/reward_model/modify_settings.md)。
360 |
361 | - **第二步:** 启动微调。
362 |
363 | ```shell
364 | xtuner train ${CONFIG_FILE_PATH}
365 | ```
366 |
367 | 例如,你可以按照如下的方式微调 POLAR-7B-Base:
368 | ```shell
369 | # On a single GPU
370 | xtuner train ./examples/xtuner_configs/POLAR_7B_full_varlenattn_custom_dataset.py --deepspeed deepspeed_zero2
371 |
372 | # On multiple GPUs
373 | NPROC_PER_NODE=${GPU_NUM} xtuner train ./examples/xtuner_configs/POLAR_7B_full_varlenattn_custom_dataset.py --deepspeed deepspeed_zero2
374 | ```
375 |
376 | 这里,`--deepspeed` 表示使用 [DeepSpeed](https://github.com/microsoft/DeepSpeed) 来加速训练。xtuner 内置了多种 DeepSpeed 策略,包括 ZeRO-1、ZeRO-2 和 ZeRO-3。如果您想禁用此功能,只需移除此参数即可。
377 |
378 | - **第三步:** 将保存的 PTH 模型(若使用 DeepSpeed,则保存结果会是一个目录)转换为 HuggingFace 模型,命令如下:
379 |
380 | ```shell
381 | xtuner convert pth_to_hf ${CONFIG_FILE_PATH} ${PTH} ${SAVE_PATH}
382 | ```
383 |
384 |
385 | # 效果示例
386 |
387 | ## 客观问答
388 |
389 | ```python
390 | from src.polar import RewardModelClient
391 | # from xtuner.utils import RewardModelClient
392 |
393 | prompt = "单词“strawberry”中有几个“r”?"
394 | reference = "单词“strawberry”中包含3个字母“r”。我们可以逐字母数一下:“s”、“t”、“r”、“a”、“w”、“b”、“e”、“r”、“r”、“y”。因此,答案是3。"
395 | outputs = [
396 | # 与参考完全一致
397 | "单词“strawberry”中包含3个字母“r”。我们可以逐字母数一下:“s”、“t”、“r”、“a”、“w”、“b”、“e”、“r”、“r”、“y”。因此,答案是3。",
398 | # 思路正确,答案正确
399 | "我们来数一数单词“strawberry”中有几个“r”:“s”、“t”、“r”、“a”、“w”、“b”、“e”、“r”、“r”、“y”。这里一共有三个“r”,因此答案是三。",
400 | # 思路错误,答案错误
401 | "我们来数一数单词“strawberry”中有几个“r”:“s”、“t”、“r”、“a”、“w”、“b”、“e”、“r”、“r”、“y”。这里一共有两个“r”,因此答案是二。",
402 | # 思路错误,答案正确
403 | "我们来数一数单词“strawberry”中有几个“r”:“s”、“t”、“r”、“a”、“w”、“b”、“e”、“r”、“r”、“y”。这里一共有两个“r”,因此答案是三。",
404 | # 思路正确,答案错误
405 | "我们来数一数单词“strawberry”中有几个“r”:“s”、“t”、“r”、“a”、“w”、“b”、“e”、“r”、“r”、“y”。这里一共有三个“r”,因此答案是二。",
406 | # 答案正确
407 | "单词“strawberry”中有3个“r”",
408 | # 答案错误
409 | "单词“strawberry”中有2个“r”"
410 | ]
411 | data = [{"prompt": prompt, "reference": reference, "output": output} for output in outputs]
412 |
413 | client = RewardModelClient("internlm/POLAR-7B", server_type="sglang", server_address="127.0.0.1:30000")
414 | rewards = client(data)
415 |
416 | sorted_res = sorted(zip(outputs, rewards), key=lambda x: x[1], reverse=True)
417 |
418 | for output, reward in sorted_res:
419 | print(f"Output: {output}\nReward: {reward}\n")
420 | ```
421 |
422 | ```txt
423 | Output: 单词“strawberry”中包含3个字母“r”。我们可以逐字母数一下:“s”、“t”、“r”、“a”、“w”、“b”、“e”、“r”、“r”、“y”。因此,答案是3。
424 | Reward: -1.5380859375
425 |
426 | Output: 我们来数一数单词“strawberry”中有几个“r”:“s”、“t”、“r”、“a”、“w”、“b”、“e”、“r”、“r”、“y”。这里一共有三个“r”,因此答案是三。
427 | Reward: -2.767578125
428 |
429 | Output: 单词“strawberry”中有3个“r”
430 | Reward: -7.45703125
431 |
432 | Output: 我们来数一数单词“strawberry”中有几个“r”:“s”、“t”、“r”、“a”、“w”、“b”、“e”、“r”、“r”、“y”。这里一共有三个“r”,因此答案是二。
433 | Reward: -7.6328125
434 |
435 | Output: 我们来数一数单词“strawberry”中有几个“r”:“s”、“t”、“r”、“a”、“w”、“b”、“e”、“r”、“r”、“y”。这里一共有两个“r”,因此答案是三。
436 | Reward: -8.65625
437 |
438 | Output: 我们来数一数单词“strawberry”中有几个“r”:“s”、“t”、“r”、“a”、“w”、“b”、“e”、“r”、“r”、“y”。这里一共有两个“r”,因此答案是二。
439 | Reward: -9.2890625
440 |
441 | Output: 单词“strawberry”中有2个“r”
442 | Reward: -11.921875
443 | ```
444 |
445 | ## Subjective Q&A
446 | ```python
447 | from src.polar import RewardModelClient
448 | # from xtuner.utils import RewardModelClient
449 |
450 | prompt = "帮我想3个形容雨很大的成语,要求不能重复。"
451 | reference = "1. 倾盆大雨 2. 暴雨如注 3. 瓢泼大雨"
452 | outputs = [
453 |     # Same as the reference
454 | "1. 倾盆大雨 2. 暴雨如注 3. 瓢泼大雨",
455 |     # Correct answer
456 | "1. 大雨滂沱 2. 狂风骤雨 3. 大雨如注",
457 |     # Not real idioms
458 | "1. 急雨如瀑 2. 豪雨倾天 3. 雨势磅礴",
459 |     # Similar to the reference, with one extra
460 | "1. 倾盆大雨 2. 暴雨如注 3. 瓢泼大雨 4. 大雨滂沱",
461 |     # Similar to the reference, with one repeated
462 | "1. 倾盆大雨 2. 暴雨如注 3. 暴雨如注",
463 |     # Similar to the reference, with one missing
464 | "1. 倾盆大雨 2. 暴雨如注",
465 |     # Correct idioms, with one extra
466 | "1. 大雨滂沱 2. 狂风骤雨 3. 大雨如注 4. 倾盆大雨",
467 |     # Correct idioms, with one repeated
468 | "1. 大雨滂沱 2. 狂风骤雨 3. 狂风骤雨",
469 |     # Correct idioms, with one missing
470 | "1. 大雨滂沱 2. 狂风骤雨"
471 | ]
472 | data = [{"prompt": prompt, "reference": reference, "output": output} for output in outputs]
473 |
474 | client = RewardModelClient("internlm/POLAR-7B", server_type="sglang", server_address="127.0.0.1:30000")
475 | rewards = client(data)
476 |
477 | sorted_res = sorted(zip(outputs, rewards), key=lambda x: x[1], reverse=True)
478 |
479 | for output, reward in sorted_res:
480 | print(f"Output: {output}\nReward: {reward}\n")
481 | ```
482 |
483 | ```txt
484 | Output: 1. 倾盆大雨 2. 暴雨如注 3. 瓢泼大雨
485 | Reward: -1.42578125
486 |
487 | Output: 1. 大雨滂沱 2. 狂风骤雨 3. 大雨如注
488 | Reward: -5.234375
489 |
490 | Output: 1. 倾盆大雨 2. 暴雨如注 3. 瓢泼大雨 4. 大雨滂沱
491 | Reward: -5.62890625
492 |
493 | Output: 1. 急雨如瀑 2. 豪雨倾天 3. 雨势磅礴
494 | Reward: -5.7109375
495 |
496 | Output: 1. 倾盆大雨 2. 暴雨如注
497 | Reward: -6.61328125
498 |
499 | Output: 1. 倾盆大雨 2. 暴雨如注 3. 暴雨如注
500 | Reward: -6.65234375
501 |
502 | Output: 1. 大雨滂沱 2. 狂风骤雨
503 | Reward: -6.828125
504 |
505 | Output: 1. 大雨滂沱 2. 狂风骤雨 3. 大雨如注 4. 倾盆大雨
506 | Reward: -7.0234375
507 |
508 | Output: 1. 大雨滂沱 2. 狂风骤雨 3. 狂风骤雨
509 | Reward: -7.23046875
510 | ```
511 |
512 | # License
513 |
514 | Both the code and the model weights are licensed under Apache-2.0.
515 |
516 | # Citation
517 |
518 | ```bibtex
519 | @article{dou2025pretrained,
520 | title={Pre-Trained Policy Discriminators are General Reward Models},
521 | author={Dou, Shihan and Liu, Shichun and Yang, Yuming and Zou, Yicheng and Zhou, Yunhua and Xing, Shuhao and Huang, Chenhao and Ge, Qiming and Song, Demin and Lv, Haijun and others},
522 | journal={arXiv preprint arXiv:2507.05197},
523 | year={2025}
524 | }
525 | ```
526 |
--------------------------------------------------------------------------------
/assets/intro.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/InternLM/POLAR/80626f39de581ac56d7bf1ca36a5e4f83d42d5c5/assets/intro.jpeg
--------------------------------------------------------------------------------
/assets/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/InternLM/POLAR/80626f39de581ac56d7bf1ca36a5e4f83d42d5c5/assets/logo.png
--------------------------------------------------------------------------------
/assets/result.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/InternLM/POLAR/80626f39de581ac56d7bf1ca36a5e4f83d42d5c5/assets/result.png
--------------------------------------------------------------------------------
/examples/data_preprocess/am_general.py:
--------------------------------------------------------------------------------
1 | # Copyright 2025 POLAR Team and/or its affiliates
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import argparse
16 | import os
17 | import json
18 |
19 | from datasets import Dataset
20 | from huggingface_hub import snapshot_download
21 |
22 | filtered_num = 0
23 | data_source = "a-m-team/AM-DeepSeek-R1-0528-Distilled"
24 |
25 |
26 | def read_dataset(local_dir):
27 | global filtered_num
28 | idx = 0
29 | for file in os.listdir(local_dir):
30 | if file.endswith(".jsonl"):
31 | with open(os.path.join(local_dir, file), "r", encoding="utf-8") as f:
32 | for line in f:
33 | example = json.loads(line)
34 | try:
35 | conversations = example.pop("conversations")
36 |
37 | dialogs = []
38 | for item in conversations:
39 | if item["from"] == "human":
40 | dialogs.append({"role": "user", "content": item["value"]})
41 | else:
42 | if "info" in item:
43 | dialogs.append({"role": "assistant", "content": item["info"]["answer_content"]})
44 | else:
45 |                             content = item["value"].split("<answer>")[1].split("</answer>")[0].strip()  # assumes the answer is wrapped in <answer>...</answer> tags
46 |                             dialogs.append({"role": "assistant", "content": content})
47 |
48 | assert dialogs[-1]["role"] == "assistant"
49 | data = {
50 | "data_source": data_source,
51 | "prompt": dialogs[:-1],
52 | "ability": "general",
53 | "reward_model": {
54 | "style": "polar",
55 | "ground_truth": dialogs[-1:],
56 | },
57 | "extra_info": {
58 | "split": "train",
59 | "index": idx,
60 | "ability": "general",
61 | "prompt": dialogs[:-1],
62 | },
63 | }
64 | yield data
65 | idx += 1
66 | except Exception as e:
67 | print(f"Error processing example {idx}: {e}")
68 | filtered_num += 1
69 |
70 |
71 | def generate_dataset(local_dir="~/data/general"):
72 |
73 | data_dir = snapshot_download(
74 | repo_id="a-m-team/AM-DeepSeek-R1-0528-Distilled",
75 | repo_type="dataset",
76 | revision="main",
77 | local_dir="~/data/AM-DeepSeek-R1-0528-Distilled",
78 | local_dir_use_symlinks=False,
79 | allow_patterns=["*.jsonl"],
80 | ignore_patterns=["*.png"]
81 | )
82 |
83 | final_dataset = Dataset.from_generator(lambda: read_dataset(data_dir))
84 | local_dir = os.path.expanduser(local_dir)
85 | local_path = os.path.join(local_dir, "train.parquet")
86 | final_dataset.shuffle(seed=42).to_parquet(local_path)
87 | print(f"Filtered {filtered_num} examples due to errors.")
88 |
89 |
90 | if __name__ == "__main__":
91 | parser = argparse.ArgumentParser()
92 | parser.add_argument("--local_dir", type=str, default="~/data/general")
93 | args = parser.parse_args()
94 |
95 | generate_dataset(args.local_dir)
96 |
--------------------------------------------------------------------------------
/examples/data_preprocess/full_hh_rlhf.py:
--------------------------------------------------------------------------------
1 | # Copyright 2025 POLAR Team and/or its affiliates
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import argparse
16 | import os
17 | import re
18 |
19 | from datasets import load_dataset
20 |
21 |
22 | def parse_dialogue(text: str):
23 | # Fixed pattern to correctly handle content with newlines
24 | # Uses \Z instead of $ for proper end-of-string matching
25 | pattern = r'^(Human|Assistant):\s*(.*?)(?=\n\n(?:Human|Assistant):|\Z)'
26 | matches = re.finditer(pattern, text, flags=re.MULTILINE | re.DOTALL)
27 | return [{"role": m.group(1).lower(), "content": m.group(2).strip()} for m in matches]
28 |
29 |
30 | def generate_dataset(local_dir="~/data/full_hh_rlhf"):
31 | dataset = load_dataset("Anthropic/hh-rlhf")
32 | train_dataset = dataset["train"]
33 |
34 | data_source = "Anthropic/hh-rlhf"
35 |
36 | # add a row to each data item that represents a unique id
37 | def make_map_fn(split):
38 | def process_fn(example, idx):
39 | chosen = example.pop("chosen")
40 | example.pop("rejected")
41 |
42 | dialogs = parse_dialogue(chosen)
43 | dialogs = [{"role": "user", "content": d["content"]} if d["role"] == "human" else d for d in dialogs]
44 |
45 | data = {
46 | "data_source": data_source,
47 | "prompt": dialogs[:-1],
48 | "ability": "alignment",
49 | "reward_model": {
50 | "style": "polar",
51 | "ground_truth": dialogs[-1:],
52 | },
53 | "extra_info": {
54 | "split": split,
55 | "index": idx,
56 | "ability": "alignment",
57 | "prompt": dialogs[:-1],
58 | },
59 | }
60 | return data
61 |
62 | return process_fn
63 |
64 | train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True)
65 | local_dir = os.path.expanduser(local_dir)
66 | local_path = os.path.join(local_dir, "train.parquet")
67 | train_dataset.to_parquet(local_path)
68 |
69 |
70 | if __name__ == "__main__":
71 | parser = argparse.ArgumentParser()
72 | parser.add_argument("--local_dir", type=str, default="~/data/full_hh_rlhf")
73 | args = parser.parse_args()
74 |
75 | generate_dataset(args.local_dir)
76 |
--------------------------------------------------------------------------------
/examples/data_preprocess/math.py:
--------------------------------------------------------------------------------
1 | # Copyright 2025 POLAR Team and/or its affiliates
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """
15 | Preprocess the MATH-lighteval dataset to parquet format.
16 | Ground truths in the train split are kept as raw trajectories.
17 | Ground truths in the test split are extracted from \\boxed{}
18 | """
19 |
20 | import argparse
21 | import os
22 |
23 | import datasets
24 |
25 |
26 | def remove_boxed(s):
27 | if "\\boxed " in s:
28 | left = "\\boxed "
29 | assert s[: len(left)] == left
30 | return s[len(left):]
31 |
32 | left = "\\boxed{"
33 |
34 | assert s[: len(left)] == left
35 | assert s[-1] == "}"
36 |
37 | return s[len(left): -1]
38 |
39 |
40 | def last_boxed_only_string(string):
41 | idx = string.rfind("\\boxed")
42 | if "\\boxed " in string:
43 | return "\\boxed " + string.split("\\boxed ")[-1].split("$")[0]
44 | if idx < 0:
45 | idx = string.rfind("\\fbox")
46 | if idx < 0:
47 | return None
48 |
49 | i = idx
50 | right_brace_idx = None
51 | num_left_braces_open = 0
52 | while i < len(string):
53 | if string[i] == "{":
54 | num_left_braces_open += 1
55 | if string[i] == "}":
56 | num_left_braces_open -= 1
57 | if num_left_braces_open == 0:
58 | right_brace_idx = i
59 | break
60 | i += 1
61 |
62 | retval = None if right_brace_idx is None else string[idx: right_brace_idx + 1]
63 |
64 | return retval
65 |
66 |
67 | def extract_solution(solution_str):
68 | return remove_boxed(last_boxed_only_string(solution_str))
69 |
70 |
71 | if __name__ == "__main__":
72 | parser = argparse.ArgumentParser()
73 | parser.add_argument("--local_dir", default="~/data/math")
74 |
75 | args = parser.parse_args()
76 |
77 | # 'lighteval/MATH' is no longer available on huggingface.
78 | # Use mirror repo: DigitalLearningGmbH/MATH-lighteval
79 | data_source = "DigitalLearningGmbH/MATH-lighteval"
80 | print(f"Loading the {data_source} dataset from huggingface...", flush=True)
81 | dataset = datasets.load_dataset(data_source, trust_remote_code=True)
82 |
83 | train_dataset = dataset["train"]
84 | test_dataset = dataset["test"]
85 |
86 | instruction_following = "Let's think step by step and output the final answer within \\boxed{}."
87 |
88 | # add a row to each data item that represents a unique id
89 | def make_map_fn(split):
90 | def process_fn(example, idx):
91 | example.pop("level")
92 | example.pop("type")
93 | question = example.pop("problem")
94 |
95 | question = question + " " + instruction_following
96 |
97 | answer = example.pop("solution")
98 | if split == "train":
99 | solution = answer
100 | else:
101 | solution = extract_solution(answer)
102 |
103 | data = {
104 | "data_source": data_source,
105 | "prompt": [{"role": "user", "content": question}],
106 | "ability": "math",
107 | "reward_model": {"style": "polar", "ground_truth": solution},
108 | "extra_info": {"split": split,
109 | "index": idx,
110 | "ability": "math",
111 | "prompt": [{"role": "user", "content": question}]
112 | },
113 | }
114 | return data
115 |
116 | return process_fn
117 |
118 | train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True)
119 | test_dataset = test_dataset.map(function=make_map_fn("test"), with_indices=True)
120 |
121 | local_dir = args.local_dir
122 |
123 | train_dataset.to_parquet(os.path.join(local_dir, "train.parquet"))
124 | test_dataset.to_parquet(os.path.join(local_dir, "test.parquet"))
125 |
--------------------------------------------------------------------------------
/examples/ppo/llama3-8b_general.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Verl PPO training script for llama3.1-8B
3 | set -x
4 |
5 | # Parameters from original script
6 | nodes=1
7 | train_batch_size=1024
8 | actor_lr=1e-6
9 | critic_lr=1e-5
10 | data_name=General
11 | policy_model_name=LLaMa3.1-8B-Instruct
12 | reward_model_name=POLAR-7B
13 |
14 | # Model paths
15 | actor_path=meta-llama/Llama-3.1-8B-Instruct
16 | critic_path=meta-llama/Llama-3.1-8B-Instruct
17 |
18 | # Data paths
19 | train_data_path=$HOME/data/general/train.parquet
20 | test_data_path=$HOME/data/general/train.parquet # no use
21 |
22 | # Reward Configuration
23 | reward_func_path="../src/polar/reward_func.py"
24 |
25 | # Experiment name
26 | name="verl_ppo_policy_${policy_model_name}_reward_${reward_model_name}_data_${data_name}"
27 | output_dir="../outputs/${name}"
28 |
29 | # Create output directory if it doesn't exist
30 | mkdir -p $output_dir
31 |
32 | # Set wandb to offline mode to prevent online sync
33 | # export WANDB_MODE=offline
34 | export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:False
35 |
36 | TARGET_FILE="$output_dir/addr_${name}.txt"
37 | RANK=${RANK:-${NODE_RANK:-0}}
38 | MASTER_PORT=6379
39 | MASTER_ADDR=${MASTER_ADDR}
40 | echo "MASTER_ADDR: $MASTER_ADDR"
41 | echo "Rank $RANK is running on $MASTER_ADDR"
42 |
43 | if [ "$RANK" -eq 0 ]; then
44 | echo "Starting head node (RANK=${RANK}) on port $MASTER_PORT..."
45 |
46 | MASTER_ADDR=${MASTER_ADDR}
47 | echo "$MASTER_ADDR" > "$TARGET_FILE"
48 |
49 | ray start --head --num-gpus 8 --dashboard-host=0.0.0.0 --dashboard-port=8265 --disable-usage-stats --block &
50 | sleep 30
51 |
52 | echo "Executing main program on head node..."
53 |
54 | python3 -m verl.trainer.main_ppo \
55 | algorithm.adv_estimator=gae \
56 | algorithm.gamma=1.0 \
57 | algorithm.lam=1.0 \
58 | algorithm.use_kl_in_reward=False \
59 | algorithm.kl_ctrl.kl_coef=0 \
60 | algorithm.kl_ctrl.type='adaptive' \
61 | \
62 | data.train_files="$train_data_path" \
63 | data.val_files="$test_data_path" \
64 | data.train_batch_size=$train_batch_size \
65 | data.max_prompt_length=1024 \
66 | data.max_response_length=1024 \
67 | data.filter_overlong_prompts=True \
68 | data.truncation='error' \
69 | data.prompt_key='prompt' \
70 | \
71 | actor_rollout_ref.model.path="$actor_path" \
72 | actor_rollout_ref.model.enable_gradient_checkpointing=True \
73 | actor_rollout_ref.model.use_remove_padding=True \
74 | actor_rollout_ref.model.use_shm=False \
75 | \
76 | actor_rollout_ref.actor.optim.lr=$actor_lr \
77 | actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.03 \
78 | actor_rollout_ref.actor.ppo_mini_batch_size=$train_batch_size \
79 | actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \
80 | actor_rollout_ref.actor.clip_ratio=0.2 \
81 | actor_rollout_ref.actor.use_kl_loss=False \
82 | \
83 | actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
84 | actor_rollout_ref.rollout.n=1 \
85 | actor_rollout_ref.rollout.name=vllm \
86 | actor_rollout_ref.rollout.gpu_memory_utilization=0.7 \
87 | actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \
88 | \
89 | critic.model.path="$critic_path" \
90 | critic.model.enable_gradient_checkpointing=True \
91 | critic.model.use_remove_padding=True \
92 | critic.model.fsdp_config.param_offload=False \
93 | critic.model.fsdp_config.optimizer_offload=False \
94 | critic.optim.lr=$critic_lr \
95 | critic.optim.lr_warmup_steps_ratio=0 \
96 | critic.optim.warmup_style=cosine \
97 | critic.optim.min_lr_ratio=0.1 \
98 | critic.use_dynamic_bsz=False \
99 | critic.ppo_micro_batch_size_per_gpu=2 \
100 | \
101 | reward_model.enable=False \
102 | reward_model.reward_manager=batch \
103 | custom_reward_function.path=$reward_func_path \
104 | custom_reward_function.name=compute_score_batch \
105 | \
106 | trainer.n_gpus_per_node=8 \
107 | trainer.nnodes=$nodes \
108 | trainer.critic_warmup=0 \
109 | trainer.logger='["console","wandb"]' \
110 | trainer.project_name='verl_ppo_general' \
111 | trainer.val_before_train=False \
112 | trainer.experiment_name="$name" \
113 | trainer.save_freq=100 \
114 | trainer.total_epochs=1 \
115 | trainer.default_local_dir=$output_dir \
116 | \
117 | trainer.rollout_data_dir="${output_dir}/trajectory_data/rollout" \
118 | $@
119 |
120 | else
121 | sleep 10
122 | MASTER_ADDR=$(cat "$TARGET_FILE")
123 |
124 | echo "Starting worker node (RANK=${RANK}), connecting to ${MASTER_ADDR}:${MASTER_PORT}..."
125 | ray start --address ${MASTER_ADDR}:${MASTER_PORT} --num-gpus 8 --block &
126 |
127 | sleep 60
128 | while true; do
129 | status=$(ray status 2>&1)
130 |
131 | if echo "$status" | grep -q "Active:"; then
132 | echo "Active nodes found. Sleeping for 10 min..."
133 | sleep 600
134 | else
135 | echo "No active nodes found. Exiting..."
136 | exit 0
137 | fi
138 | done
139 |
140 | fi
--------------------------------------------------------------------------------
/examples/ppo/llama3-8b_hh-rlhf.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Verl PPO training script for llama3.1-8B
3 | set -x
4 |
5 | # Parameters from original script
6 | nodes=1
7 | train_batch_size=512
8 | actor_lr=1e-6
9 | critic_lr=1e-5
10 | data_name=HH-RLHF
11 | policy_model_name=LLaMa3.1-8B-Instruct
12 | reward_model_name=POLAR-7B
13 |
14 | # Model paths
15 | actor_path=meta-llama/Llama-3.1-8B-Instruct
16 | critic_path=meta-llama/Llama-3.1-8B-Instruct
17 |
18 | # Data paths
19 | train_data_path=$HOME/data/full_hh_rlhf/train.parquet
20 | test_data_path=$HOME/data/full_hh_rlhf/train.parquet # no use
21 |
22 | # Reward Configuration
23 | reward_func_path="../src/polar/reward_func.py"
24 |
25 | # Experiment name
26 | name="verl_ppo_policy_${policy_model_name}_reward_${reward_model_name}_data_${data_name}"
27 | output_dir="../outputs/${name}"
28 |
29 | # Create output directory if it doesn't exist
30 | mkdir -p $output_dir
31 |
32 | # Set wandb to offline mode to prevent online sync
33 | # export WANDB_MODE=offline
34 | export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:False
35 |
36 | TARGET_FILE="$output_dir/addr_${name}.txt"
37 | RANK=${RANK:-${NODE_RANK:-0}}
38 | MASTER_PORT=6379
39 | MASTER_ADDR=${MASTER_ADDR}
40 | echo "MASTER_ADDR: $MASTER_ADDR"
41 | echo "Rank $RANK is running on $MASTER_ADDR"
42 |
43 | if [ "$RANK" -eq 0 ]; then
44 | echo "Starting head node (RANK=${RANK}) on port $MASTER_PORT..."
45 |
46 | MASTER_ADDR=${MASTER_ADDR}
47 | echo "$MASTER_ADDR" > "$TARGET_FILE"
48 |
49 | ray start --head --num-gpus 8 --dashboard-host=0.0.0.0 --dashboard-port=8265 --disable-usage-stats --block &
50 | sleep 30
51 |
52 | echo "Executing main program on head node..."
53 |
54 | python3 -m verl.trainer.main_ppo \
55 | algorithm.adv_estimator=gae \
56 | algorithm.gamma=1.0 \
57 | algorithm.lam=1.0 \
58 | algorithm.use_kl_in_reward=False \
59 | algorithm.kl_ctrl.kl_coef=0 \
60 | algorithm.kl_ctrl.type='adaptive' \
61 | \
62 | data.train_files="$train_data_path" \
63 | data.val_files="$test_data_path" \
64 | data.train_batch_size=$train_batch_size \
65 | data.max_prompt_length=128 \
66 | data.max_response_length=512 \
67 | data.filter_overlong_prompts=True \
68 | data.truncation='error' \
69 | data.prompt_key='prompt' \
70 | \
71 | actor_rollout_ref.model.path="$actor_path" \
72 | actor_rollout_ref.model.enable_gradient_checkpointing=True \
73 | actor_rollout_ref.model.use_remove_padding=True \
74 | actor_rollout_ref.model.use_shm=False \
75 | \
76 | actor_rollout_ref.actor.optim.lr=$actor_lr \
77 | actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.03 \
78 | actor_rollout_ref.actor.ppo_mini_batch_size=$train_batch_size \
79 | actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \
80 | actor_rollout_ref.actor.clip_ratio=0.2 \
81 | actor_rollout_ref.actor.use_kl_loss=False \
82 | \
83 | actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
84 | actor_rollout_ref.rollout.n=1 \
85 | actor_rollout_ref.rollout.name=vllm \
86 | actor_rollout_ref.rollout.gpu_memory_utilization=0.7 \
87 | actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \
88 | \
89 | critic.model.path="$critic_path" \
90 | critic.model.enable_gradient_checkpointing=True \
91 | critic.model.use_remove_padding=True \
92 | critic.model.fsdp_config.param_offload=False \
93 | critic.model.fsdp_config.optimizer_offload=False \
94 | critic.optim.lr=$critic_lr \
95 | critic.optim.lr_warmup_steps_ratio=0 \
96 | critic.optim.warmup_style=cosine \
97 | critic.optim.min_lr_ratio=0.1 \
98 | critic.use_dynamic_bsz=False \
99 | critic.ppo_micro_batch_size_per_gpu=2 \
100 | \
101 | reward_model.enable=False \
102 | reward_model.reward_manager=batch \
103 | custom_reward_function.path=$reward_func_path \
104 | custom_reward_function.name=compute_score_batch \
105 | \
106 | trainer.n_gpus_per_node=8 \
107 | trainer.nnodes=$nodes \
108 | trainer.critic_warmup=0 \
109 | trainer.logger='["console","wandb"]' \
110 | trainer.project_name='verl_ppo_hh-rlhf' \
111 | trainer.val_before_train=False \
112 | trainer.experiment_name="$name" \
113 | trainer.save_freq=100 \
114 | trainer.total_epochs=5 \
115 | trainer.default_local_dir=$output_dir \
116 | \
117 | trainer.rollout_data_dir="${output_dir}/trajectory_data/rollout" \
118 | $@
119 |
120 | else
121 | sleep 10
122 | MASTER_ADDR=$(cat "$TARGET_FILE")
123 |
124 | echo "Starting worker node (RANK=${RANK}), connecting to ${MASTER_ADDR}:${MASTER_PORT}..."
125 | ray start --address ${MASTER_ADDR}:${MASTER_PORT} --num-gpus 8 --block &
126 |
127 | sleep 60
128 | while true; do
129 | status=$(ray status 2>&1)
130 |
131 | if echo "$status" | grep -q "Active:"; then
132 | echo "Active nodes found. Sleeping for 10 min..."
133 | sleep 600
134 | else
135 | echo "No active nodes found. Exiting..."
136 | exit 0
137 | fi
138 | done
139 |
140 | fi
--------------------------------------------------------------------------------
/examples/ppo/llama3-8b_math.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Verl PPO training script for llama3.1-8B
3 | set -x
4 |
5 | # Parameters from original script
6 | nodes=1
7 | train_batch_size=1024
8 | actor_lr=1e-6
9 | critic_lr=1e-5
10 | data_name=MATH
11 | policy_model_name=LLaMa3.1-8B-Instruct
12 | reward_model_name=POLAR-7B
13 |
14 | # Model paths
15 | actor_path=meta-llama/Llama-3.1-8B-Instruct
16 | critic_path=meta-llama/Llama-3.1-8B-Instruct
17 |
18 | # Data paths
19 | train_data_path=$HOME/data/math/train.parquet
20 | test_data_path=$HOME/data/math/test.parquet
21 |
22 | # Reward Configuration
23 | reward_func_path="../src/polar/reward_func.py"
24 |
25 | # Experiment name
26 | name="verl_ppo_policy_${policy_model_name}_reward_${reward_model_name}_data_${data_name}"
27 | output_dir="../outputs/${name}"
28 |
29 | # Create output directory if it doesn't exist
30 | mkdir -p $output_dir
31 |
32 | # Set wandb to offline mode to prevent online sync
33 | # export WANDB_MODE=offline
34 | export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:False
35 |
36 | TARGET_FILE="$output_dir/addr_${name}.txt"
37 | RANK=${RANK:-${NODE_RANK:-0}}
38 | MASTER_PORT=6379
39 | MASTER_ADDR=${MASTER_ADDR}
40 | echo "MASTER_ADDR: $MASTER_ADDR"
41 | echo "Rank $RANK is running on $MASTER_ADDR"
42 |
43 | if [ "$RANK" -eq 0 ]; then
44 | echo "Starting head node (RANK=${RANK}) on port $MASTER_PORT..."
45 |
46 | MASTER_ADDR=${MASTER_ADDR}
47 | echo "$MASTER_ADDR" > "$TARGET_FILE"
48 |
49 | ray start --head --num-gpus 8 --dashboard-host=0.0.0.0 --dashboard-port=8265 --disable-usage-stats --block &
50 | sleep 30
51 |
52 | echo "Executing main program on head node..."
53 |
54 | python3 -m verl.trainer.main_ppo \
55 | algorithm.adv_estimator=gae \
56 | algorithm.gamma=1.0 \
57 | algorithm.lam=1.0 \
58 | algorithm.use_kl_in_reward=False \
59 | algorithm.kl_ctrl.kl_coef=0 \
60 | algorithm.kl_ctrl.type='adaptive' \
61 | \
62 | data.train_files="$train_data_path" \
63 | data.val_files="$test_data_path" \
64 | data.train_batch_size=$train_batch_size \
65 | data.max_prompt_length=1024 \
66 | data.max_response_length=1024 \
67 | data.filter_overlong_prompts=True \
68 | data.truncation='error' \
69 | data.prompt_key='prompt' \
70 | \
71 | actor_rollout_ref.model.path="$actor_path" \
72 | actor_rollout_ref.model.enable_gradient_checkpointing=True \
73 | actor_rollout_ref.model.use_remove_padding=True \
74 | actor_rollout_ref.model.use_shm=False \
75 | \
76 | actor_rollout_ref.actor.optim.lr=$actor_lr \
77 | actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.03 \
78 | actor_rollout_ref.actor.ppo_mini_batch_size=$train_batch_size \
79 | actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \
80 | actor_rollout_ref.actor.clip_ratio=0.2 \
81 | actor_rollout_ref.actor.use_kl_loss=False \
82 | \
83 | actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
84 | actor_rollout_ref.rollout.n=1 \
85 | actor_rollout_ref.rollout.name=vllm \
86 | actor_rollout_ref.rollout.gpu_memory_utilization=0.7 \
87 | actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \
88 | \
89 | critic.model.path="$critic_path" \
90 | critic.model.enable_gradient_checkpointing=True \
91 | critic.model.use_remove_padding=True \
92 | critic.model.fsdp_config.param_offload=False \
93 | critic.model.fsdp_config.optimizer_offload=False \
94 | critic.optim.lr=$critic_lr \
95 | critic.optim.lr_warmup_steps_ratio=0 \
96 | critic.optim.warmup_style=cosine \
97 | critic.optim.min_lr_ratio=0.1 \
98 | critic.use_dynamic_bsz=False \
99 | critic.ppo_micro_batch_size_per_gpu=2 \
100 | \
101 | reward_model.enable=False \
102 | reward_model.reward_manager=batch \
103 | custom_reward_function.path=$reward_func_path \
104 | custom_reward_function.name=compute_score_batch \
105 | \
106 | trainer.n_gpus_per_node=8 \
107 | trainer.nnodes=$nodes \
108 | trainer.critic_warmup=0 \
109 | trainer.logger='["console","wandb"]' \
110 | trainer.project_name='verl_ppo_math' \
111 | trainer.val_before_train=True \
112 | trainer.experiment_name="$name" \
113 | trainer.save_freq=100 \
114 | trainer.test_freq=5 \
115 | trainer.total_epochs=100 \
116 | trainer.default_local_dir=$output_dir \
117 | \
118 | trainer.rollout_data_dir="${output_dir}/trajectory_data/rollout" \
119 | $@
120 |
121 | else
122 | sleep 10
123 | MASTER_ADDR=$(cat "$TARGET_FILE")
124 |
125 | echo "Starting worker node (RANK=${RANK}), connecting to ${MASTER_ADDR}:${MASTER_PORT}..."
126 | ray start --address ${MASTER_ADDR}:${MASTER_PORT} --num-gpus 8 --block &
127 |
128 | sleep 60
129 | while true; do
130 | status=$(ray status 2>&1)
131 |
132 | if echo "$status" | grep -q "Active:"; then
133 | echo "Active nodes found. Sleeping for 10 min..."
134 | sleep 600
135 | else
136 | echo "No active nodes found. Exiting..."
137 | exit 0
138 | fi
139 | done
140 |
141 | fi
--------------------------------------------------------------------------------
/examples/ppo/qwen2_5-7b_general.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Verl PPO training script for Qwen2.5-7B
3 | set -x
4 |
5 | # Parameters from original script
6 | nodes=1
7 | train_batch_size=1024
8 | actor_lr=1e-6
9 | critic_lr=1e-5
10 | data_name=General
11 | policy_model_name=Qwen2.5-7B-Instruct
12 | reward_model_name=POLAR-7B
13 |
14 | # Model paths
15 | actor_path=Qwen/Qwen2.5-7B-Instruct
16 | critic_path=Qwen/Qwen2.5-7B-Instruct
17 |
18 | # Data paths
19 | train_data_path=$HOME/data/general/train.parquet
20 | test_data_path=$HOME/data/general/train.parquet # no use
21 |
22 | # Reward Configuration
23 | reward_func_path="../src/polar/reward_func.py"
24 |
25 | # Experiment name
26 | name="verl_ppo_policy_${policy_model_name}_reward_${reward_model_name}_data_${data_name}"
27 | output_dir="../outputs/${name}"
28 |
29 | # Create output directory if it doesn't exist
30 | mkdir -p $output_dir
31 |
32 | # Set wandb to offline mode to prevent online sync
33 | # export WANDB_MODE=offline
34 | export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:False
35 |
36 | TARGET_FILE="$output_dir/addr_${name}.txt"
37 | RANK=${RANK:-${NODE_RANK:-0}}
38 | MASTER_PORT=6379
39 | MASTER_ADDR=${MASTER_ADDR}
40 | echo "MASTER_ADDR: $MASTER_ADDR"
41 | echo "Rank $RANK is running on $MASTER_ADDR"
42 |
43 | if [ "$RANK" -eq 0 ]; then
44 | echo "Starting head node (RANK=${RANK}) on port $MASTER_PORT..."
45 |
46 | MASTER_ADDR=${MASTER_ADDR}
47 | echo "$MASTER_ADDR" > "$TARGET_FILE"
48 |
49 | ray start --head --num-gpus 8 --dashboard-host=0.0.0.0 --dashboard-port=8265 --disable-usage-stats --block &
50 | sleep 30
51 |
52 | echo "Executing main program on head node..."
53 |
54 | python3 -m verl.trainer.main_ppo \
55 | algorithm.adv_estimator=gae \
56 | algorithm.gamma=1.0 \
57 | algorithm.lam=1.0 \
58 | algorithm.use_kl_in_reward=False \
59 | algorithm.kl_ctrl.kl_coef=0 \
60 | algorithm.kl_ctrl.type='adaptive' \
61 | \
62 | data.train_files="$train_data_path" \
63 | data.val_files="$test_data_path" \
64 | data.train_batch_size=$train_batch_size \
65 | data.max_prompt_length=1024 \
66 | data.max_response_length=1024 \
67 | data.filter_overlong_prompts=True \
68 | data.truncation='error' \
69 | data.prompt_key='prompt' \
70 | \
71 | actor_rollout_ref.model.path="$actor_path" \
72 | actor_rollout_ref.model.enable_gradient_checkpointing=True \
73 | actor_rollout_ref.model.use_remove_padding=True \
74 | actor_rollout_ref.model.use_shm=False \
75 | \
76 | actor_rollout_ref.actor.optim.lr=$actor_lr \
77 | actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.03 \
78 | actor_rollout_ref.actor.ppo_mini_batch_size=$train_batch_size \
79 | actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \
80 | actor_rollout_ref.actor.clip_ratio=0.2 \
81 | actor_rollout_ref.actor.use_kl_loss=False \
82 | \
83 | actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
84 | actor_rollout_ref.rollout.n=1 \
85 | actor_rollout_ref.rollout.name=vllm \
86 | actor_rollout_ref.rollout.gpu_memory_utilization=0.7 \
87 | actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \
88 | \
89 | critic.model.path="$critic_path" \
90 | critic.model.enable_gradient_checkpointing=True \
91 | critic.model.use_remove_padding=True \
92 | critic.model.fsdp_config.param_offload=False \
93 | critic.model.fsdp_config.optimizer_offload=False \
94 | critic.optim.lr=$critic_lr \
95 | critic.optim.lr_warmup_steps_ratio=0 \
96 | critic.optim.warmup_style=cosine \
97 | critic.optim.min_lr_ratio=0.1 \
98 | critic.use_dynamic_bsz=False \
99 | critic.ppo_micro_batch_size_per_gpu=2 \
100 | \
101 | reward_model.enable=False \
102 | reward_model.reward_manager=batch \
103 | custom_reward_function.path=$reward_func_path \
104 | custom_reward_function.name=compute_score_batch \
105 | \
106 | trainer.n_gpus_per_node=8 \
107 | trainer.nnodes=$nodes \
108 | trainer.critic_warmup=0 \
109 | trainer.logger='["console","wandb"]' \
110 | trainer.project_name='verl_ppo_general' \
111 | trainer.val_before_train=False \
112 | trainer.experiment_name="$name" \
113 | trainer.save_freq=100 \
114 | trainer.total_epochs=1 \
115 | trainer.default_local_dir=$output_dir \
116 | \
117 | trainer.rollout_data_dir="${output_dir}/trajectory_data/rollout" \
118 | $@
119 |
120 | else
121 | sleep 10
122 | MASTER_ADDR=$(cat "$TARGET_FILE")
123 |
124 | echo "Starting worker node (RANK=${RANK}), connecting to ${MASTER_ADDR}:${MASTER_PORT}..."
125 | ray start --address ${MASTER_ADDR}:${MASTER_PORT} --num-gpus 8 --block &
126 |
127 | sleep 60
128 | while true; do
129 | status=$(ray status 2>&1)
130 |
131 | if echo "$status" | grep -q "Active:"; then
132 | echo "Active nodes found. Sleeping for 10 min..."
133 | sleep 600
134 | else
135 | echo "No active nodes found. Exiting..."
136 | exit 0
137 | fi
138 | done
139 |
140 | fi
--------------------------------------------------------------------------------
/examples/ppo/qwen2_5-7b_hh-rlhf.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Verl PPO training script for Qwen2.5-7B
3 | set -x
4 |
5 | # Parameters from original script
6 | nodes=1
7 | train_batch_size=512
8 | actor_lr=1e-6
9 | critic_lr=1e-5
10 | data_name=HH-RLHF
11 | policy_model_name=Qwen2.5-7B-Instruct
12 | reward_model_name=POLAR-7B
13 |
14 | # Model paths
15 | actor_path=Qwen/Qwen2.5-7B-Instruct
16 | critic_path=Qwen/Qwen2.5-7B-Instruct
17 |
18 | # Data paths
19 | train_data_path=$HOME/data/full_hh_rlhf/train.parquet
20 | test_data_path=$HOME/data/full_hh_rlhf/train.parquet # no use
21 |
22 | # Reward Configuration
23 | reward_func_path="../src/polar/reward_func.py"
24 |
25 | # Experiment name
26 | name="verl_ppo_policy_${policy_model_name}_reward_${reward_model_name}_data_${data_name}"
27 | output_dir="../outputs/${name}"
28 |
29 | # Create output directory if it doesn't exist
30 | mkdir -p $output_dir
31 |
32 | # Set wandb to offline mode to prevent online sync
33 | # export WANDB_MODE=offline
34 | export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:False
35 |
36 | TARGET_FILE="$output_dir/addr_${name}.txt"
37 | RANK=${RANK:-${NODE_RANK:-0}}
38 | MASTER_PORT=6379
39 | MASTER_ADDR=${MASTER_ADDR}
40 | echo "MASTER_ADDR: $MASTER_ADDR"
41 | echo "Rank $RANK is running on $MASTER_ADDR"
42 |
43 | if [ "$RANK" -eq 0 ]; then
44 | echo "Starting head node (RANK=${RANK}) on port $MASTER_PORT..."
45 |
46 | MASTER_ADDR=${MASTER_ADDR}
47 | echo "$MASTER_ADDR" > "$TARGET_FILE"
48 |
49 | ray start --head --num-gpus 8 --dashboard-host=0.0.0.0 --dashboard-port=8265 --disable-usage-stats --block &
50 | sleep 30
51 |
52 | echo "Executing main program on head node..."
53 |
54 | python3 -m verl.trainer.main_ppo \
55 | algorithm.adv_estimator=gae \
56 | algorithm.gamma=1.0 \
57 | algorithm.lam=1.0 \
58 | algorithm.use_kl_in_reward=False \
59 | algorithm.kl_ctrl.kl_coef=0 \
60 | algorithm.kl_ctrl.type='adaptive' \
61 | \
62 | data.train_files="$train_data_path" \
63 | data.val_files="$test_data_path" \
64 | data.train_batch_size=$train_batch_size \
65 | data.max_prompt_length=128 \
66 | data.max_response_length=512 \
67 | data.filter_overlong_prompts=True \
68 | data.truncation='error' \
69 | data.prompt_key='prompt' \
70 | \
71 | actor_rollout_ref.model.path="$actor_path" \
72 | actor_rollout_ref.model.enable_gradient_checkpointing=True \
73 | actor_rollout_ref.model.use_remove_padding=True \
74 | actor_rollout_ref.model.use_shm=False \
75 | \
76 | actor_rollout_ref.actor.optim.lr=$actor_lr \
77 | actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.03 \
78 | actor_rollout_ref.actor.ppo_mini_batch_size=$train_batch_size \
79 | actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \
80 | actor_rollout_ref.actor.clip_ratio=0.2 \
81 | actor_rollout_ref.actor.use_kl_loss=False \
82 | \
83 | actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
84 | actor_rollout_ref.rollout.n=1 \
85 | actor_rollout_ref.rollout.name=vllm \
86 | actor_rollout_ref.rollout.gpu_memory_utilization=0.7 \
87 | actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \
88 | \
89 | critic.model.path="$critic_path" \
90 | critic.model.enable_gradient_checkpointing=True \
91 | critic.model.use_remove_padding=True \
92 | critic.model.fsdp_config.param_offload=False \
93 | critic.model.fsdp_config.optimizer_offload=False \
94 | critic.optim.lr=$critic_lr \
95 | critic.optim.lr_warmup_steps_ratio=0 \
96 | critic.optim.warmup_style=cosine \
97 | critic.optim.min_lr_ratio=0.1 \
98 | critic.use_dynamic_bsz=False \
99 | critic.ppo_micro_batch_size_per_gpu=2 \
100 | \
101 | reward_model.enable=False \
102 | reward_model.reward_manager=batch \
103 | custom_reward_function.path=$reward_func_path \
104 | custom_reward_function.name=compute_score_batch \
105 | \
106 | trainer.n_gpus_per_node=8 \
107 | trainer.nnodes=$nodes \
108 | trainer.critic_warmup=0 \
109 | trainer.logger='["console","wandb"]' \
110 | trainer.project_name='verl_ppo_hh-rlhf' \
111 | trainer.val_before_train=False \
112 | trainer.experiment_name="$name" \
113 | trainer.save_freq=100 \
114 | trainer.total_epochs=5 \
115 | trainer.default_local_dir=$output_dir \
116 | \
117 | trainer.rollout_data_dir="${output_dir}/trajectory_data/rollout" \
118 | $@
119 |
120 | else
121 | sleep 10
122 | MASTER_ADDR=$(cat "$TARGET_FILE")
123 |
124 | echo "Starting worker node (RANK=${RANK}), connecting to ${MASTER_ADDR}:${MASTER_PORT}..."
125 | ray start --address ${MASTER_ADDR}:${MASTER_PORT} --num-gpus 8 --block &
126 |
127 | sleep 60
128 | while true; do
129 | status=$(ray status 2>&1)
130 |
131 | if echo "$status" | grep -q "Active:"; then
132 | echo "Active nodes found. Sleeping for 10 min..."
133 | sleep 600
134 | else
135 | echo "No active nodes found. Exiting..."
136 | exit 0
137 | fi
138 | done
139 |
140 | fi
--------------------------------------------------------------------------------
/examples/ppo/qwen2_5-7b_math.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Verl PPO training script for Qwen2.5-7B
3 | set -x
4 |
5 | # Parameters from original script
6 | nodes=1
7 | train_batch_size=1024
8 | actor_lr=1e-6
9 | critic_lr=1e-5
10 | data_name=MATH
11 | policy_model_name=Qwen2.5-7B-Instruct
12 | reward_model_name=POLAR-7B
13 |
14 | # Model paths
15 | actor_path=Qwen/Qwen2.5-7B-Instruct
16 | critic_path=Qwen/Qwen2.5-7B-Instruct
17 |
18 | # Data paths
19 | train_data_path=$HOME/data/math/train.parquet
20 | test_data_path=$HOME/data/math/test.parquet
21 |
22 | # Reward Configuration
23 | reward_func_path="../src/polar/reward_func.py"
24 |
25 | # Experiment name
26 | name="verl_ppo_policy_${policy_model_name}_reward_${reward_model_name}_data_${data_name}"
27 | output_dir="../outputs/${name}"
28 |
29 | # Create output directory if it doesn't exist
30 | mkdir -p $output_dir
31 |
32 | # Set wandb to offline mode to prevent online sync
33 | # export WANDB_MODE=offline
34 | export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:False
35 |
36 | TARGET_FILE="$output_dir/addr_${name}.txt"
37 | RANK=${RANK:-${NODE_RANK:-0}}
38 | MASTER_PORT=6379
39 | MASTER_ADDR=${MASTER_ADDR}
40 | echo "MASTER_ADDR: $MASTER_ADDR"
41 | echo "Rank $RANK is running on $MASTER_ADDR"
42 |
43 | if [ "$RANK" -eq 0 ]; then
44 | echo "Starting head node (RANK=${RANK}) on port $MASTER_PORT..."
45 |
46 | MASTER_ADDR=${MASTER_ADDR}
47 | echo "$MASTER_ADDR" > "$TARGET_FILE"
48 |
49 | ray start --head --num-gpus 8 --dashboard-host=0.0.0.0 --dashboard-port=8265 --disable-usage-stats --block &
50 | sleep 30
51 |
52 | echo "Executing main program on head node..."
53 |
54 | python3 -m verl.trainer.main_ppo \
55 | algorithm.adv_estimator=gae \
56 | algorithm.gamma=1.0 \
57 | algorithm.lam=1.0 \
58 | algorithm.use_kl_in_reward=False \
59 | algorithm.kl_ctrl.kl_coef=0 \
60 | algorithm.kl_ctrl.type='adaptive' \
61 | \
62 | data.train_files="$train_data_path" \
63 | data.val_files="$test_data_path" \
64 | data.train_batch_size=$train_batch_size \
65 | data.max_prompt_length=1024 \
66 | data.max_response_length=1024 \
67 | data.filter_overlong_prompts=True \
68 | data.truncation='error' \
69 | data.prompt_key='prompt' \
70 | \
71 | actor_rollout_ref.model.path="$actor_path" \
72 | actor_rollout_ref.model.enable_gradient_checkpointing=True \
73 | actor_rollout_ref.model.use_remove_padding=True \
74 | actor_rollout_ref.model.use_shm=False \
75 | \
76 | actor_rollout_ref.actor.optim.lr=$actor_lr \
77 | actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.03 \
78 | actor_rollout_ref.actor.ppo_mini_batch_size=$train_batch_size \
79 | actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \
80 | actor_rollout_ref.actor.clip_ratio=0.2 \
81 | actor_rollout_ref.actor.use_kl_loss=False \
82 | \
83 | actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
84 | actor_rollout_ref.rollout.n=1 \
85 | actor_rollout_ref.rollout.name=vllm \
86 | actor_rollout_ref.rollout.gpu_memory_utilization=0.7 \
87 | actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \
88 | \
89 | critic.model.path="$critic_path" \
90 | critic.model.enable_gradient_checkpointing=True \
91 | critic.model.use_remove_padding=True \
92 | critic.model.fsdp_config.param_offload=False \
93 | critic.model.fsdp_config.optimizer_offload=False \
94 | critic.optim.lr=$critic_lr \
95 | critic.optim.lr_warmup_steps_ratio=0 \
96 | critic.optim.warmup_style=cosine \
97 | critic.optim.min_lr_ratio=0.1 \
98 | critic.use_dynamic_bsz=False \
99 | critic.ppo_micro_batch_size_per_gpu=2 \
100 | \
101 | reward_model.enable=False \
102 | reward_model.reward_manager=batch \
103 | custom_reward_function.path=$reward_func_path \
104 | custom_reward_function.name=compute_score_batch \
105 | \
106 | trainer.n_gpus_per_node=8 \
107 | trainer.nnodes=$nodes \
108 | trainer.critic_warmup=0 \
109 | trainer.logger='["console","wandb"]' \
110 | trainer.project_name='verl_ppo_math' \
111 | trainer.val_before_train=True \
112 | trainer.experiment_name="$name" \
113 | trainer.save_freq=100 \
114 | trainer.test_freq=5 \
115 | trainer.total_epochs=100 \
116 | trainer.default_local_dir=$output_dir \
117 | \
118 | trainer.rollout_data_dir="${output_dir}/trajectory_data/rollout" \
119 | $@
120 |
121 | else
122 | sleep 10
123 | MASTER_ADDR=$(cat "$TARGET_FILE")
124 |
125 | echo "Starting worker node (RANK=${RANK}), connecting to ${MASTER_ADDR}:${MASTER_PORT}..."
126 | ray start --address ${MASTER_ADDR}:${MASTER_PORT} --num-gpus 8 --block &
127 |
128 | sleep 60
129 | while true; do
130 | status=$(ray status 2>&1)
131 |
132 | if echo "$status" | grep -q "Active:"; then
133 | echo "Active nodes found. Sleeping for 10 min..."
134 | sleep 600
135 | else
136 | echo "No active nodes found. Exiting..."
137 | exit 0
138 | fi
139 | done
140 |
141 | fi
--------------------------------------------------------------------------------
/examples/ppo/qwen3-8b_general.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Verl PPO training script for Qwen3-8B
3 | set -x
4 |
5 | # Parameters from original script
6 | nodes=4
7 | train_batch_size=1024
8 | actor_lr=1e-6
9 | critic_lr=1e-5
10 | data_name=General
11 | policy_model_name=Qwen3-8B-Instruct
12 | reward_model_name=POLAR-7B
13 |
14 | # Model paths
15 | actor_path=Qwen/Qwen3-8B
16 | critic_path=Qwen/Qwen3-8B
17 |
18 | # Data paths
19 | train_data_path=$HOME/data/general/train.parquet
20 | test_data_path=$HOME/data/general/train.parquet # no use
21 |
22 | # Reward Configuration
23 | reward_func_path="../src/polar/reward_func.py"
24 |
25 | # Experiment name
26 | name="verl_ppo_policy_${policy_model_name}_reward_${reward_model_name}_data_${data_name}"
27 | output_dir="../outputs/${name}"
28 |
29 | # Create output directory if it doesn't exist
30 | mkdir -p $output_dir
31 |
32 | # Set wandb to offline mode to prevent online sync
33 | # export WANDB_MODE=offline
34 | export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:False
35 |
36 | TARGET_FILE="$output_dir/addr_${name}.txt"
37 | RANK=${RANK:-${NODE_RANK:-0}}
38 | MASTER_PORT=6379
39 | MASTER_ADDR=${MASTER_ADDR}
40 | echo "MASTER_ADDR: $MASTER_ADDR"
41 | echo "Rank $RANK is running on $MASTER_ADDR"
42 |
43 | if [ "$RANK" -eq 0 ]; then
44 | echo "Starting head node (RANK=${RANK}) on port $MASTER_PORT..."
45 |
46 | MASTER_ADDR=${MASTER_ADDR}
47 | echo "$MASTER_ADDR" > "$TARGET_FILE"
48 |
49 | ray start --head --num-gpus 8 --dashboard-host=0.0.0.0 --dashboard-port=8265 --disable-usage-stats --block &
50 | sleep 30
51 |
52 | echo "Executing main program on head node..."
53 |
54 | python3 -m verl.trainer.main_ppo \
55 | algorithm.adv_estimator=gae \
56 | algorithm.gamma=1.0 \
57 | algorithm.lam=1.0 \
58 | algorithm.use_kl_in_reward=False \
59 | algorithm.kl_ctrl.kl_coef=0 \
60 | algorithm.kl_ctrl.type='adaptive' \
61 | \
62 | data.train_files="$train_data_path" \
63 | data.val_files="$test_data_path" \
64 | data.train_batch_size=$train_batch_size \
65 | data.max_prompt_length=1024 \
66 | data.max_response_length=15000 \
67 | data.filter_overlong_prompts=True \
68 | data.truncation='error' \
69 | data.prompt_key='prompt' \
70 | \
71 | actor_rollout_ref.model.path="$actor_path" \
72 | actor_rollout_ref.model.enable_gradient_checkpointing=True \
73 | actor_rollout_ref.model.use_remove_padding=True \
74 | actor_rollout_ref.model.use_shm=False \
75 | \
76 | actor_rollout_ref.actor.optim.lr=$actor_lr \
77 | actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.03 \
78 | actor_rollout_ref.actor.ppo_mini_batch_size=$train_batch_size \
79 | actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \
80 | actor_rollout_ref.actor.clip_ratio=0.2 \
81 | actor_rollout_ref.actor.use_kl_loss=False \
82 | \
83 | actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
84 | actor_rollout_ref.rollout.n=1 \
85 | actor_rollout_ref.rollout.name=vllm \
86 | actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \
87 | actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \
88 | actor_rollout_ref.rollout.max_num_batched_tokens=16384 \
89 | \
90 | critic.model.path="$critic_path" \
91 | critic.model.enable_gradient_checkpointing=True \
92 | critic.model.use_remove_padding=True \
93 | critic.model.fsdp_config.param_offload=False \
94 | critic.model.fsdp_config.optimizer_offload=False \
95 | critic.optim.lr=$critic_lr \
96 | critic.optim.lr_warmup_steps_ratio=0 \
97 | critic.optim.warmup_style=cosine \
98 | critic.optim.min_lr_ratio=0.1 \
99 | critic.use_dynamic_bsz=False \
100 | critic.ppo_micro_batch_size_per_gpu=1 \
101 | \
102 | reward_model.enable=False \
103 | reward_model.reward_manager=batch \
104 | custom_reward_function.path=$reward_func_path \
105 | custom_reward_function.name=compute_score_batch \
106 | \
107 | trainer.n_gpus_per_node=8 \
108 | trainer.nnodes=$nodes \
109 | trainer.critic_warmup=0 \
110 | trainer.logger='["console","wandb"]' \
111 | trainer.project_name='verl_ppo_general' \
112 | trainer.val_before_train=False \
113 | trainer.experiment_name="$name" \
114 | trainer.save_freq=100 \
115 | trainer.total_epochs=1 \
116 | trainer.default_local_dir=$output_dir \
117 | \
118 | trainer.rollout_data_dir="${output_dir}/trajectory_data/rollout" \
119 | $@
120 |
121 | else
122 | sleep 10
123 | MASTER_ADDR=$(cat "$TARGET_FILE")
124 |
125 | echo "Starting worker node (RANK=${RANK}), connecting to ${MASTER_ADDR}:${MASTER_PORT}..."
126 | ray start --address ${MASTER_ADDR}:${MASTER_PORT} --num-gpus 8 --block &
127 |
128 | sleep 60
129 | while true; do
130 | status=$(ray status 2>&1)
131 |
132 | if echo "$status" | grep -q "Active:"; then
133 | echo "Active nodes found. Sleeping for 10 min..."
134 | sleep 600
135 | else
136 | echo "No active nodes found. Exiting..."
137 | exit 0
138 | fi
139 | done
140 |
141 | fi
--------------------------------------------------------------------------------
/examples/ppo/qwen3-8b_hh-rlhf.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Verl PPO training script for Qwen3-8B
3 | set -x
4 |
5 | # Parameters from original script
6 | nodes=2
7 | train_batch_size=512
8 | actor_lr=1e-6
9 | critic_lr=1e-5
10 | data_name=HH-RLHF
11 | policy_model_name=Qwen3-8B-Instruct
12 | reward_model_name=POLAR-7B
13 |
14 | # Model paths
15 | actor_path=Qwen/Qwen3-8B
16 | critic_path=Qwen/Qwen3-8B
17 |
18 | # Data paths
19 | train_data_path=$HOME/data/full_hh_rlhf/train.parquet
20 | test_data_path=$HOME/data/full_hh_rlhf/train.parquet # no use
21 |
22 | # Reward Configuration
23 | reward_func_path="../src/polar/reward_func.py"
24 |
25 | # Experiment name
26 | name="verl_ppo_policy_${policy_model_name}_reward_${reward_model_name}_data_${data_name}"
27 | output_dir="../outputs/${name}"
28 |
29 | # Create output directory if it doesn't exist
30 | mkdir -p $output_dir
31 |
32 | # Set wandb to offline mode to prevent online sync
33 | # export WANDB_MODE=offline
34 | export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:False
35 |
36 | TARGET_FILE="$output_dir/addr_${name}.txt"
37 | RANK=${RANK:-${NODE_RANK:-0}}
38 | MASTER_PORT=6379
39 | MASTER_ADDR=${MASTER_ADDR}
40 | echo "MASTER_ADDR: $MASTER_ADDR"
41 | echo "Rank $RANK is running on $MASTER_ADDR"
42 |
43 | if [ "$RANK" -eq 0 ]; then
44 | echo "Starting head node (RANK=${RANK}) on port $MASTER_PORT..."
45 |
46 | MASTER_ADDR=${MASTER_ADDR}
47 | echo "$MASTER_ADDR" > "$TARGET_FILE"
48 |
49 | ray start --head --num-gpus 8 --dashboard-host=0.0.0.0 --dashboard-port=8265 --disable-usage-stats --block &
50 | sleep 30
51 |
52 | echo "Executing main program on head node..."
53 |
54 | python3 -m verl.trainer.main_ppo \
55 | algorithm.adv_estimator=gae \
56 | algorithm.gamma=1.0 \
57 | algorithm.lam=1.0 \
58 | algorithm.use_kl_in_reward=False \
59 | algorithm.kl_ctrl.kl_coef=0 \
60 | algorithm.kl_ctrl.type='adaptive' \
61 | \
62 | data.train_files="$train_data_path" \
63 | data.val_files="$test_data_path" \
64 | data.train_batch_size=$train_batch_size \
65 | data.max_prompt_length=128 \
66 | data.max_response_length=16000 \
67 | data.filter_overlong_prompts=True \
68 | data.truncation='error' \
69 | data.prompt_key='prompt' \
70 | \
71 | actor_rollout_ref.model.path="$actor_path" \
72 | actor_rollout_ref.model.enable_gradient_checkpointing=True \
73 | actor_rollout_ref.model.use_remove_padding=True \
74 | actor_rollout_ref.model.use_shm=False \
75 | \
76 | actor_rollout_ref.actor.optim.lr=$actor_lr \
77 | actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.03 \
78 | actor_rollout_ref.actor.ppo_mini_batch_size=$train_batch_size \
79 | actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \
80 | actor_rollout_ref.actor.clip_ratio=0.2 \
81 | actor_rollout_ref.actor.use_kl_loss=False \
82 | \
83 | actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
84 | actor_rollout_ref.rollout.n=1 \
85 | actor_rollout_ref.rollout.name=vllm \
86 | actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \
87 | actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \
88 | actor_rollout_ref.rollout.max_num_batched_tokens=16384 \
89 | \
90 | critic.model.path="$critic_path" \
91 | critic.model.enable_gradient_checkpointing=True \
92 | critic.model.use_remove_padding=True \
93 | critic.model.fsdp_config.param_offload=False \
94 | critic.model.fsdp_config.optimizer_offload=False \
95 | critic.optim.lr=$critic_lr \
96 | critic.optim.lr_warmup_steps_ratio=0 \
97 | critic.optim.warmup_style=cosine \
98 | critic.optim.min_lr_ratio=0.1 \
99 | critic.use_dynamic_bsz=False \
100 | critic.ppo_micro_batch_size_per_gpu=1 \
101 | \
102 | reward_model.enable=False \
103 | reward_model.reward_manager=batch \
104 | custom_reward_function.path=$reward_func_path \
105 | custom_reward_function.name=compute_score_batch \
106 | \
107 | trainer.n_gpus_per_node=8 \
108 | trainer.nnodes=$nodes \
109 | trainer.critic_warmup=0 \
110 | trainer.logger='["console","wandb"]' \
111 | trainer.project_name='verl_ppo_hh-rlhf' \
112 | trainer.val_before_train=False \
113 | trainer.experiment_name="$name" \
114 | trainer.save_freq=100 \
115 | trainer.total_epochs=5 \
116 | trainer.default_local_dir=$output_dir \
117 | \
118 | trainer.rollout_data_dir="${output_dir}/trajectory_data/rollout" \
119 | $@
120 |
121 | else
122 | sleep 10
123 | MASTER_ADDR=$(cat "$TARGET_FILE")
124 |
125 | echo "Starting worker node (RANK=${RANK}), connecting to ${MASTER_ADDR}:${MASTER_PORT}..."
126 | ray start --address ${MASTER_ADDR}:${MASTER_PORT} --num-gpus 8 --block &
127 |
128 | sleep 60
129 | while true; do
130 | status=$(ray status 2>&1)
131 |
132 | if echo "$status" | grep -q "Active:"; then
133 | echo "Active nodes found. Sleeping for 10 min..."
134 | sleep 600
135 | else
136 | echo "No active nodes found. Exiting..."
137 | exit 0
138 | fi
139 | done
140 |
141 | fi
--------------------------------------------------------------------------------
/examples/ppo/qwen3-8b_math.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Verl PPO training script for Qwen3-8B
3 | set -x
4 |
5 | # Parameters from original script
6 | nodes=4
7 | train_batch_size=1024
8 | actor_lr=1e-6
9 | critic_lr=1e-5
10 | data_name=MATH
11 | policy_model_name=Qwen3-8B-Instruct
12 | reward_model_name=POLAR-7B
13 |
14 | # Model paths
15 | actor_path=Qwen/Qwen3-8B
16 | critic_path=Qwen/Qwen3-8B
17 |
18 | # Data paths
19 | train_data_path=$HOME/data/math/train.parquet
20 | test_data_path=$HOME/data/math/test.parquet
21 |
22 | # Reward Configuration
23 | reward_func_path="../src/polar/reward_func.py"
24 |
25 | # Experiment name
26 | name="verl_ppo_policy_${policy_model_name}_reward_${reward_model_name}_data_${data_name}"
27 | output_dir="../outputs/${name}"
28 |
29 | # Create output directory if it doesn't exist
30 | mkdir -p $output_dir
31 |
32 | # Set wandb to offline mode to prevent online sync
33 | # export WANDB_MODE=offline
34 | export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:False
35 |
36 | TARGET_FILE="$output_dir/addr_${name}.txt"
37 | RANK=${RANK:-${NODE_RANK:-0}}
38 | MASTER_PORT=6379
39 | MASTER_ADDR=${MASTER_ADDR}
40 | echo "MASTER_ADDR: $MASTER_ADDR"
41 | echo "Rank $RANK is running on $MASTER_ADDR"
42 |
43 | if [ "$RANK" -eq 0 ]; then
44 | echo "Starting head node (RANK=${RANK}) on port $MASTER_PORT..."
45 |
46 | MASTER_ADDR=${MASTER_ADDR}
47 | echo "$MASTER_ADDR" > "$TARGET_FILE"
48 |
49 | ray start --head --num-gpus 8 --dashboard-host=0.0.0.0 --dashboard-port=8265 --disable-usage-stats --block &
50 | sleep 30
51 |
52 | echo "Executing main program on head node..."
53 |
54 | python3 -m verl.trainer.main_ppo \
55 | algorithm.adv_estimator=gae \
56 | algorithm.gamma=1.0 \
57 | algorithm.lam=1.0 \
58 | algorithm.use_kl_in_reward=False \
59 | algorithm.kl_ctrl.kl_coef=0 \
60 | algorithm.kl_ctrl.type='adaptive' \
61 | \
62 | data.train_files="$train_data_path" \
63 | data.val_files="$test_data_path" \
64 | data.train_batch_size=$train_batch_size \
65 | data.max_prompt_length=1024 \
66 | data.max_response_length=15000 \
67 | data.filter_overlong_prompts=True \
68 | data.truncation='error' \
69 | data.prompt_key='prompt' \
70 | \
71 | actor_rollout_ref.model.path="$actor_path" \
72 | actor_rollout_ref.model.enable_gradient_checkpointing=True \
73 | actor_rollout_ref.model.use_remove_padding=True \
74 | actor_rollout_ref.model.use_shm=False \
75 | \
76 | actor_rollout_ref.actor.optim.lr=$actor_lr \
77 | actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.03 \
78 | actor_rollout_ref.actor.ppo_mini_batch_size=$train_batch_size \
79 | actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \
80 | actor_rollout_ref.actor.clip_ratio=0.2 \
81 | actor_rollout_ref.actor.use_kl_loss=False \
82 | \
83 | actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
84 | actor_rollout_ref.rollout.n=1 \
85 | actor_rollout_ref.rollout.name=vllm \
86 | actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \
87 | actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \
88 | actor_rollout_ref.rollout.max_num_batched_tokens=16384 \
89 | \
90 | critic.model.path="$critic_path" \
91 | critic.model.enable_gradient_checkpointing=True \
92 | critic.model.use_remove_padding=True \
93 | critic.model.fsdp_config.param_offload=False \
94 | critic.model.fsdp_config.optimizer_offload=False \
95 | critic.optim.lr=$critic_lr \
96 | critic.optim.lr_warmup_steps_ratio=0 \
97 | critic.optim.warmup_style=cosine \
98 | critic.optim.min_lr_ratio=0.1 \
99 | critic.use_dynamic_bsz=False \
100 | critic.ppo_micro_batch_size_per_gpu=1 \
101 | \
102 | reward_model.enable=False \
103 | reward_model.reward_manager=batch \
104 | custom_reward_function.path=$reward_func_path \
105 | custom_reward_function.name=compute_score_batch \
106 | \
107 | trainer.n_gpus_per_node=8 \
108 | trainer.nnodes=$nodes \
109 | trainer.critic_warmup=0 \
110 | trainer.logger='["console","wandb"]' \
111 | trainer.project_name='verl_ppo_math' \
112 | trainer.val_before_train=True \
113 | trainer.experiment_name="$name" \
114 | trainer.save_freq=100 \
115 | trainer.test_freq=5 \
116 | trainer.total_epochs=100 \
117 | trainer.default_local_dir=$output_dir \
118 | \
119 | trainer.rollout_data_dir="${output_dir}/trajectory_data/rollout" \
120 |     "$@"
121 |
122 | else
123 | sleep 10
124 | MASTER_ADDR=$(cat "$TARGET_FILE")
125 |
126 | echo "Starting worker node (RANK=${RANK}), connecting to ${MASTER_ADDR}:${MASTER_PORT}..."
127 | ray start --address ${MASTER_ADDR}:${MASTER_PORT} --num-gpus 8 --block &
128 |
129 | sleep 60
130 | while true; do
131 | status=$(ray status 2>&1)
132 |
133 | if echo "$status" | grep -q "Active:"; then
134 | echo "Active nodes found. Sleeping for 10 min..."
135 | sleep 600
136 | else
137 | echo "No active nodes found. Exiting..."
138 | exit 0
139 | fi
140 | done
141 |
142 | fi
--------------------------------------------------------------------------------
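A quick way to sanity-check the parquet files the script above consumes is sketched below. This is a hypothetical check, not part of the repository: it assumes pandas and pyarrow are installed, and only the 'prompt' column name is taken from the script (data.prompt_key='prompt'); any other columns depend on how examples/data_preprocess/math.py built the files.

import os
import pandas as pd

# Path mirrors train_data_path=$HOME/data/math/train.parquet in the script above.
train_path = os.path.expanduser("~/data/math/train.parquet")
df = pd.read_parquet(train_path)

print(df.columns.tolist())    # should include at least the 'prompt' column
print(df.iloc[0]["prompt"])   # chat-style messages, e.g. [{"role": "user", "content": ...}]
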
/examples/xtuner_configs/POLAR_1_8B_full_varlenattn_custom_dataset.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from mmengine.dataset import DefaultSampler
3 | from mmengine.hooks import (
4 | CheckpointHook,
5 | DistSamplerSeedHook,
6 | IterTimerHook,
7 | LoggerHook,
8 | ParamSchedulerHook,
9 | )
10 | from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR
11 | from mmengine.visualization import Visualizer, TensorboardVisBackend
12 | from torch.optim import AdamW
13 | from transformers import AutoModel, AutoTokenizer
14 |
15 | from datasets import load_dataset
16 | from xtuner.dataset.collate_fns.preference_collate_fn import preference_collate_fn
17 | from xtuner.dataset.preference_dataset import build_preference_dataset
18 | from xtuner.engine.hooks import VarlenAttnArgsToMessageHubHook
19 | from xtuner.engine.runner import TrainLoop
20 | from xtuner.model.reward import RewardModel
21 | from xtuner.parallel.sequence import SequenceParallelSampler
22 |
23 | #######################################################################
24 | # PART 1 Settings #
25 | #######################################################################
26 | # Model
27 | pretrained_model_name_or_path = "internlm/POLAR-1_8B-Base"
28 | use_varlen_attn = True
29 | reward_token_id = 92527 # use [UNUSED_TOKEN_130] as reward token
30 | loss_type = "ranking"
31 | penalty_type = "none"
32 |
33 | # Data
34 | max_length = 16384
35 | max_response_length = 4096
36 | max_packed_length = max_length * 2
37 |
38 | # parallel
39 | sequence_parallel_size = 1
40 |
41 | # Scheduler & Optimizer
42 | batch_size = 1 # per_device
43 | accumulative_counts = 2
44 | accumulative_counts *= sequence_parallel_size
45 | dataloader_num_workers = 0
46 | max_epochs = 1 # reward model should not be trained for more than 1 epoch to avoid overfitting # noqa: E501
47 | optim_type = AdamW
48 | lr = 1e-5
49 | betas = (0.9, 0.95)
50 | weight_decay = 0
51 | max_norm = 1 # grad clip
52 | warmup_ratio = 0.03
53 |
54 | # Save
55 | save_steps = 500
56 | save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited)
57 |
58 | # Evaluate the generation performance during the training
59 | # TODO: eval
60 | # evaluation_freq = 500
61 |
62 | #######################################################################
63 | # PART 2 Model & Tokenizer #
64 | #######################################################################
65 | tokenizer = dict(
66 | type=AutoTokenizer.from_pretrained,
67 | pretrained_model_name_or_path=pretrained_model_name_or_path,
68 | trust_remote_code=True,
69 | padding_side="left",
70 | )
71 |
72 | model = dict(
73 | type=RewardModel,
74 | use_varlen_attn=use_varlen_attn,
75 | loss_type=loss_type,
76 | penalty_type=penalty_type,
77 | llm=dict(
78 | type=AutoModel.from_pretrained,
79 | pretrained_model_name_or_path=pretrained_model_name_or_path,
80 | trust_remote_code=True,
81 | ),
82 | )
83 |
84 | #######################################################################
85 | # PART 3 Dataset & Dataloader #
86 | #######################################################################
87 | sampler = SequenceParallelSampler if sequence_parallel_size > 1 else DefaultSampler
88 |
89 | # preference data format example:
90 | # {
91 | # "prompt": [{"role": "user", "content": "What is the capital of France?"}],
92 | # "reference": [{"role": "assistant", "content": "The capital of France is Paris."}],
93 | # "chosen": [{"role": "assistant", "content": "Paris."}],
94 | # "rejected": [{"role": "assistant", "content": "I don't know."}],
95 | # }
96 |
97 | train_dataset = dict(
98 | type=build_preference_dataset,
99 | dataset=dict(
100 | type=load_dataset,
101 | # Replace with your custom dataset path
102 | # For example, if you have a local /path/to/file/train.jsonl, you can use:
103 | # path="/path/to/file",
104 | path="/your/custom/path/here",
105 | ),
106 | tokenizer=tokenizer,
107 | max_length=max_length,
108 | dataset_map_fn=None,
109 | is_dpo=False,
110 | is_reward=True,
111 | reward_token_id=reward_token_id,
112 | num_proc=32,
113 | use_varlen_attn=use_varlen_attn,
114 | max_packed_length=max_packed_length,
115 | shuffle_before_pack=True,
116 | max_response_length=max_response_length,
117 | is_reference=True
118 | )
119 |
120 | train_dataloader = dict(
121 | batch_size=batch_size,
122 | num_workers=dataloader_num_workers,
123 | dataset=train_dataset,
124 | sampler=dict(type=sampler, shuffle=True),
125 | collate_fn=dict(type=preference_collate_fn, use_varlen_attn=use_varlen_attn),
126 | )
127 |
128 | #######################################################################
129 | # PART 4 Scheduler & Optimizer #
130 | #######################################################################
131 | # optimizer
132 | optim_wrapper = dict(
133 | type=AmpOptimWrapper,
134 | optimizer=dict(type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay),
135 | clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False),
136 | accumulative_counts=accumulative_counts,
137 | loss_scale="dynamic",
138 | dtype="float16",
139 | )
140 |
141 | # learning policy
142 | # More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501
143 | param_scheduler = [
144 | dict(
145 | type=LinearLR,
146 | start_factor=lr * 0.1,
147 | by_epoch=True,
148 | begin=0,
149 | end=warmup_ratio * max_epochs,
150 | convert_to_iter_based=True,
151 | ),
152 | dict(
153 | type=CosineAnnealingLR,
154 | eta_min=lr * 0.1,
155 | by_epoch=True,
156 | begin=warmup_ratio * max_epochs,
157 | end=max_epochs,
158 | convert_to_iter_based=True,
159 | ),
160 | ]
161 |
162 | # train, val, test setting
163 | train_cfg = dict(type=TrainLoop, max_epochs=max_epochs)
164 |
165 | #######################################################################
166 | # PART 5 Runtime #
167 | #######################################################################
168 | # Custom hooks (the varlen-attention args hook is appended below when enabled)
169 | custom_hooks = []
170 |
171 | if use_varlen_attn:
172 | custom_hooks += [dict(type=VarlenAttnArgsToMessageHubHook)]
173 |
174 | # configure default hooks
175 | default_hooks = dict(
176 | # record the time of every iteration.
177 | timer=dict(type=IterTimerHook),
178 | # print log every 10 iterations.
179 | logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10),
180 | # enable the parameter scheduler.
181 | param_scheduler=dict(type=ParamSchedulerHook),
182 | # save checkpoint per `save_steps`.
183 | checkpoint=dict(
184 | type=CheckpointHook,
185 | by_epoch=False,
186 | interval=save_steps,
187 | max_keep_ckpts=save_total_limit,
188 | ),
189 |     # set sampler seed in distributed environment.
190 | sampler_seed=dict(type=DistSamplerSeedHook),
191 | )
192 |
193 | # configure environment
194 | env_cfg = dict(
195 | # whether to enable cudnn benchmark
196 | cudnn_benchmark=False,
197 | # set multi process parameters
198 | mp_cfg=dict(mp_start_method="fork", opencv_num_threads=0),
199 | # set distributed parameters
200 | dist_cfg=dict(backend="nccl"),
201 | )
202 |
203 | # set visualizer
204 | visualizer = dict(
205 | type=Visualizer,
206 | vis_backends=[dict(type=TensorboardVisBackend)]
207 | )
208 |
209 | # set log level
210 | log_level = "INFO"
211 |
212 | # load from which checkpoint
213 | load_from = None
214 |
215 | # whether to resume training from the loaded checkpoint
216 | resume = False
217 |
218 | # Defaults to use random seed and disable `deterministic`
219 | randomness = dict(seed=None, deterministic=False)
220 |
221 | # set log processor
222 | log_processor = dict(by_epoch=False)
223 |
--------------------------------------------------------------------------------
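The config above reads a custom preference dataset through load_dataset(path=...), in the format shown in its PART 3 comment. A minimal, hypothetical way to produce such a file is sketched below; the /path/to/file location and the record contents are placeholders taken from that comment, and load_dataset would then be pointed at the containing directory (path="/path/to/file").

import json

# One preference record in the format expected by build_preference_dataset.
record = {
    "prompt": [{"role": "user", "content": "What is the capital of France?"}],
    "reference": [{"role": "assistant", "content": "The capital of France is Paris."}],
    "chosen": [{"role": "assistant", "content": "Paris."}],
    "rejected": [{"role": "assistant", "content": "I don't know."}],
}

# Write one JSON object per line, e.g. /path/to/file/train.jsonl.
with open("/path/to/file/train.jsonl", "w", encoding="utf-8") as f:
    f.write(json.dumps(record, ensure_ascii=False) + "\n")
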
/examples/xtuner_configs/POLAR_7B_full_varlenattn_custom_dataset.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from mmengine.dataset import DefaultSampler
3 | from mmengine.hooks import (
4 | CheckpointHook,
5 | DistSamplerSeedHook,
6 | IterTimerHook,
7 | LoggerHook,
8 | ParamSchedulerHook,
9 | )
10 | from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR
11 | from mmengine.visualization import Visualizer, TensorboardVisBackend
12 | from torch.optim import AdamW
13 | from transformers import AutoModel, AutoTokenizer
14 |
15 | from datasets import load_dataset
16 | from xtuner.dataset.collate_fns.preference_collate_fn import preference_collate_fn
17 | from xtuner.dataset.preference_dataset import build_preference_dataset
18 | from xtuner.engine.hooks import VarlenAttnArgsToMessageHubHook
19 | from xtuner.engine.runner import TrainLoop
20 | from xtuner.model.reward import RewardModel
21 | from xtuner.parallel.sequence import SequenceParallelSampler
22 |
23 | #######################################################################
24 | # PART 1 Settings #
25 | #######################################################################
26 | # Model
27 | pretrained_model_name_or_path = "internlm/POLAR-7B-Base"
28 | use_varlen_attn = True
29 | reward_token_id = 92527 # use [UNUSED_TOKEN_130] as reward token
30 | loss_type = "ranking"
31 | penalty_type = "none"
32 |
33 | # Data
34 | max_length = 16384
35 | max_response_length = 4096
36 | max_packed_length = max_length * 2
37 |
38 | # parallel
39 | sequence_parallel_size = 1
40 |
41 | # Scheduler & Optimizer
42 | batch_size = 1 # per_device
43 | accumulative_counts = 2
44 | accumulative_counts *= sequence_parallel_size
45 | dataloader_num_workers = 0
46 | max_epochs = 1 # reward model should not be trained for more than 1 epoch to avoid overfitting # noqa: E501
47 | optim_type = AdamW
48 | lr = 2e-5
49 | betas = (0.9, 0.95)
50 | weight_decay = 0
51 | max_norm = 1 # grad clip
52 | warmup_ratio = 0.03
53 |
54 | # Save
55 | save_steps = 500
56 | save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited)
57 |
58 | # Evaluate the generation performance during the training
59 | # TODO: eval
60 | # evaluation_freq = 500
61 |
62 | #######################################################################
63 | # PART 2 Model & Tokenizer #
64 | #######################################################################
65 | tokenizer = dict(
66 | type=AutoTokenizer.from_pretrained,
67 | pretrained_model_name_or_path=pretrained_model_name_or_path,
68 | trust_remote_code=True,
69 | padding_side="left",
70 | )
71 |
72 | model = dict(
73 | type=RewardModel,
74 | use_varlen_attn=use_varlen_attn,
75 | loss_type=loss_type,
76 | penalty_type=penalty_type,
77 | llm=dict(
78 | type=AutoModel.from_pretrained,
79 | pretrained_model_name_or_path=pretrained_model_name_or_path,
80 | trust_remote_code=True,
81 | ),
82 | )
83 |
84 | #######################################################################
85 | # PART 3 Dataset & Dataloader #
86 | #######################################################################
87 | sampler = SequenceParallelSampler if sequence_parallel_size > 1 else DefaultSampler
88 |
89 | # preference data format example:
90 | # {
91 | # "prompt": [{"role": "user", "content": "What is the capital of France?"}],
92 | # "reference": [{"role": "assistant", "content": "The capital of France is Paris."}],
93 | # "chosen": [{"role": "assistant", "content": "Paris."}],
94 | # "rejected": [{"role": "assistant", "content": "I don't know."}],
95 | # }
96 |
97 | train_dataset = dict(
98 | type=build_preference_dataset,
99 | dataset=dict(
100 | type=load_dataset,
101 | # Replace with your custom dataset path
102 | # For example, if you have a local /path/to/file/train.jsonl, you can use:
103 | # path="/path/to/file",
104 | path="/your/custom/path/here",
105 | ),
106 | tokenizer=tokenizer,
107 | max_length=max_length,
108 | dataset_map_fn=None,
109 | is_dpo=False,
110 | is_reward=True,
111 | reward_token_id=reward_token_id,
112 | num_proc=32,
113 | use_varlen_attn=use_varlen_attn,
114 | max_packed_length=max_packed_length,
115 | shuffle_before_pack=True,
116 | max_response_length=max_response_length,
117 | is_reference=True
118 | )
119 |
120 | train_dataloader = dict(
121 | batch_size=batch_size,
122 | num_workers=dataloader_num_workers,
123 | dataset=train_dataset,
124 | sampler=dict(type=sampler, shuffle=True),
125 | collate_fn=dict(type=preference_collate_fn, use_varlen_attn=use_varlen_attn),
126 | )
127 |
128 | #######################################################################
129 | # PART 4 Scheduler & Optimizer #
130 | #######################################################################
131 | # optimizer
132 | optim_wrapper = dict(
133 | type=AmpOptimWrapper,
134 | optimizer=dict(type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay),
135 | clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False),
136 | accumulative_counts=accumulative_counts,
137 | loss_scale="dynamic",
138 | dtype="float16",
139 | )
140 |
141 | # learning policy
142 | # More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501
143 | param_scheduler = [
144 | dict(
145 | type=LinearLR,
146 | start_factor=lr * 0.1,
147 | by_epoch=True,
148 | begin=0,
149 | end=warmup_ratio * max_epochs,
150 | convert_to_iter_based=True,
151 | ),
152 | dict(
153 | type=CosineAnnealingLR,
154 | eta_min=lr * 0.1,
155 | by_epoch=True,
156 | begin=warmup_ratio * max_epochs,
157 | end=max_epochs,
158 | convert_to_iter_based=True,
159 | ),
160 | ]
161 |
162 | # train, val, test setting
163 | train_cfg = dict(type=TrainLoop, max_epochs=max_epochs)
164 |
165 | #######################################################################
166 | # PART 5 Runtime #
167 | #######################################################################
168 | # Custom hooks (the varlen-attention args hook is appended below when enabled)
169 | custom_hooks = []
170 |
171 | if use_varlen_attn:
172 | custom_hooks += [dict(type=VarlenAttnArgsToMessageHubHook)]
173 |
174 | # configure default hooks
175 | default_hooks = dict(
176 | # record the time of every iteration.
177 | timer=dict(type=IterTimerHook),
178 | # print log every 10 iterations.
179 | logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10),
180 | # enable the parameter scheduler.
181 | param_scheduler=dict(type=ParamSchedulerHook),
182 | # save checkpoint per `save_steps`.
183 | checkpoint=dict(
184 | type=CheckpointHook,
185 | by_epoch=False,
186 | interval=save_steps,
187 | max_keep_ckpts=save_total_limit,
188 | ),
189 |     # set sampler seed in distributed environment.
190 | sampler_seed=dict(type=DistSamplerSeedHook),
191 | )
192 |
193 | # configure environment
194 | env_cfg = dict(
195 | # whether to enable cudnn benchmark
196 | cudnn_benchmark=False,
197 | # set multi process parameters
198 | mp_cfg=dict(mp_start_method="fork", opencv_num_threads=0),
199 | # set distributed parameters
200 | dist_cfg=dict(backend="nccl"),
201 | )
202 |
203 | # set visualizer
204 | visualizer = dict(
205 | type=Visualizer,
206 | vis_backends=[dict(type=TensorboardVisBackend)]
207 | )
208 |
209 | # set log level
210 | log_level = "INFO"
211 |
212 | # load from which checkpoint
213 | load_from = None
214 |
215 | # whether to resume training from the loaded checkpoint
216 | resume = False
217 |
218 | # Defaults to use random seed and disable `deterministic`
219 | randomness = dict(seed=None, deterministic=False)
220 |
221 | # set log processor
222 | log_processor = dict(by_epoch=False)
223 |
--------------------------------------------------------------------------------
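As an optional sanity check for the config above (not part of the repository), the snippet below confirms that reward_token_id = 92527 maps to the [UNUSED_TOKEN_130] token, as the inline comment states; it assumes transformers is installed and the POLAR-7B-Base tokenizer can be downloaded.

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("internlm/POLAR-7B-Base", trust_remote_code=True)

print(tok.convert_ids_to_tokens(92527))                  # expected: [UNUSED_TOKEN_130]
print(tok.convert_tokens_to_ids("[UNUSED_TOKEN_130]"))   # expected: 92527
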
/src/polar/__init__.py:
--------------------------------------------------------------------------------
1 | from .reward_func import RewardModelClient
2 |
--------------------------------------------------------------------------------
/src/polar/reward_func.py:
--------------------------------------------------------------------------------
1 | # Copyright 2025 POLAR Team and/or its affiliates
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from typing import List, Union
16 | from time import sleep
17 | import requests
18 | import re
19 | from transformers import AutoTokenizer
20 |
21 | # Config reward model server
22 | ADDRESS = "127.0.0.1:30000" # Modify according to your server address
23 | SERVER_TYPE = "sglang" # Options: "sglang", "vllm", "lmdeploy"
24 | MODEL_PATH = "internlm/POLAR-7B"
25 |
26 |
27 | class RewardModelClient:
28 | """This class is used to process the input sequences for the reward
29 | model."""
30 |
31 | def __init__(
32 | self,
33 | path,
34 | max_length=16384,
35 | max_response_length=4096,
36 | response_cut_side="right",
37 | server_type="sglang",
38 | server_address="127.0.0.1:30000",
39 | ):
40 | """
41 | Args:
42 | path: Path to the reward model.
43 | max_length: Maximum length of the input sequence.
44 | max_response_length: Maximum length of the response sequence.
45 | response_cut_side: Side to cut the response sequence if it exceeds the maximum length.
46 | server_type: Type of the server, can be "sglang", "vllm", or "lmdeploy".
47 |             server_address: Address of the reward model server.
48 | """
49 | self.rm_name = path.split("/")[-1]
50 | self.tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
51 |         # reserve 4 tokens: the final reward token, one <|reward|> token, and two '\n' tokens
52 | self.max_length = max_length - 4
53 | self.max_response_length = max_response_length
54 | self.response_cut_side = response_cut_side
55 | self.server_type = server_type
56 | self.server_address = server_address
57 |
58 | def _encode(self, prompt, reference, output, wrapper="sft") -> str:
59 | """Construct the input string for the reward model.
60 |
61 | Args:
62 | prompt: Prompt.
63 | reference: Reference trajectory.
64 | output: Candidate trajectory.
65 | wrapper: The wrapper type. Can be "sft" or "pretrain".
66 | Returns:
67 | The constructed input string for RM.
68 | """
69 | p = (
70 | "\n".join([e["content"] for e in prompt])
71 | if isinstance(prompt, list)
72 | else prompt
73 | )
74 | r1 = (
75 | "\n".join([e["content"] for e in reference])
76 | if isinstance(reference, list)
77 | else reference
78 | )
79 | r2 = (
80 | "\n".join([e["content"] for e in output])
81 | if isinstance(output, list)
82 | else output
83 | )
84 |
85 | p_ids = self.tokenizer.encode(p, add_special_tokens=True)
86 | r1_ids = self.tokenizer.encode(r1, add_special_tokens=True)
87 | r2_ids = self.tokenizer.encode(r2, add_special_tokens=True)
88 |
89 | if len(r1_ids) > self.max_response_length:
90 | print(
91 | f"Reference sequence length {len(r1_ids)} is "
92 | f"larger than max_response_length {self.max_response_length}",
93 | )
94 | if self.response_cut_side == "right":
95 | r1_ids = r1_ids[: self.max_response_length]
96 | else:
97 | r1_ids = r1_ids[-self.max_response_length:]
98 | if len(r2_ids) > self.max_response_length:
99 | print(
100 | f"Output sequence length {len(r2_ids)} is "
101 | f"larger than max_response_length {self.max_response_length}",
102 | )
103 | if self.response_cut_side == "right":
104 | r2_ids = r2_ids[: self.max_response_length]
105 | else:
106 | r2_ids = r2_ids[-self.max_response_length:]
107 |
108 | max_prompt_length = (self.max_length - len(r1_ids) - len(r2_ids)) // 2
109 |
110 | if len(p_ids) > max_prompt_length:
111 | print(
112 | f"Prompt sequence length {len(p_ids)} is "
113 | f"larger than max_prompt_length {max_prompt_length}",
114 | )
115 | p_ids = p_ids[-max_prompt_length:]
116 |
117 | p = self.tokenizer.decode(p_ids, skip_special_tokens=True)
118 | r1 = self.tokenizer.decode(r1_ids, skip_special_tokens=True)
119 | r2 = self.tokenizer.decode(r2_ids, skip_special_tokens=True)
120 |
121 | # Fit the template of RM
122 | _reference_cat = (
123 |             p + r1 if wrapper == "pretrain" or len(r1) == 0 else p + "\n" + r1
124 | )
125 | _output_cat = (
126 |             p + r2 if wrapper == "pretrain" or len(r2) == 0 else p + "\n" + r2
127 | )
128 |
129 | final_txt = _reference_cat + "<|reward|>" + _output_cat + "[UNUSED_TOKEN_130]"
130 |
131 | return final_txt
132 |
133 | def encode(self, data) -> Union[str, List[str]]:
134 | """Encode the input data into a format suitable for RM.
135 |
136 | Args:
137 |             data: A dictionary or a list of dictionaries containing the keys
138 | 'prompt', 'reference', 'output', and optionally 'wrapper'.
139 | Returns:
140 |             The encoded input string for RM, or a list of encoded strings.
141 | """
142 | if isinstance(data, dict):
143 | return self._encode(**data)
144 | elif isinstance(data, list):
145 | return [
146 | self._encode(**item) if isinstance(item, dict) else item
147 | for item in data
148 | ]
149 | else:
150 | raise ValueError(
151 | "Input data must be a dictionary or a list of dictionaries."
152 | )
153 |
154 | def sglang_request_reward(
155 | self, data, retry_delay=0.2, max_retries=8
156 | ) -> List[float]:
157 | # Disable proxy for internal cluster communication
158 | for i in range(max_retries):
159 | try:
160 | res = requests.post(
161 | f"http://{self.server_address}/classify",
162 | json={
163 | "model": self.rm_name,
164 | "text": data,
165 | },
166 | proxies={"http": None, "https": None}, # Explicitly disable proxy
167 | timeout=30, # Add timeout
168 | )
169 | rewards = [e["embedding"][0] for e in res.json()]
170 | return rewards
171 | except Exception as e:
172 | print(f"Error requesting reward: {e}")
173 | print(f"Raw response: {data}")
174 | sleep(retry_delay)
175 | continue
176 | print(f"Failed to request reward after {max_retries} retries")
177 | return None
178 |
179 | def vllm_request_reward(self, data, retry_delay=0.2, max_retries=8) -> List[float]:
180 | # Disable proxy for internal cluster communication
181 | for i in range(max_retries):
182 | try:
183 | res = requests.post(
184 | f"http://{self.server_address}/pooling",
185 | json={
186 | "input": data,
187 | },
188 | proxies={"http": None, "https": None}, # Explicitly disable proxy
189 | timeout=30, # Add timeout
190 | )
191 | rewards = [e["data"][-1][0] for e in res.json()["data"]]
192 | return rewards
193 | except Exception as e:
194 | print(f"Error requesting reward: {e}")
195 | print(f"Raw response: {data}")
196 | sleep(retry_delay)
197 | continue
198 | print(f"Failed to request reward after {max_retries} retries")
199 | return None
200 |
201 | def lmdeploy_request_reward(
202 | self, data, retry_delay=0.2, max_retries=8
203 | ) -> List[float]:
204 | # Disable proxy for internal cluster communication
205 | for i in range(max_retries):
206 | try:
207 | res = requests.post(
208 | f"http://{self.server_address}/pooling",
209 | json={
210 | "input": data,
211 | },
212 | proxies={"http": None, "https": None}, # Explicitly disable proxy
213 | timeout=30, # Add timeout
214 | )
215 | rewards = [e["data"] for e in res.json()["data"]]
216 | return rewards
217 | except Exception as e:
218 | print(f"Error requesting reward: {e}")
219 | print(f"Raw response: {data}")
220 | sleep(retry_delay)
221 | continue
222 | print(f"Failed to request reward after {max_retries} retries")
223 | return None
224 |
225 | def __call__(self, data) -> List[float]:
226 | """Call the input wrapper to construct the input string for RM.
227 |
228 | Args:
229 | data: A list of dictionaries containing the keys
230 | 'prompt', 'reference', 'output', and optionally 'wrapper'.
231 | retry_delay: Delay in seconds before retrying the request.
232 | max_retries: Maximum number of retries for the request.
233 | Returns:
234 | scores: The list of reward scores returned by the RM server.
235 | If the request fails, it returns None.
236 | """
237 | data = self.encode(data)
238 | if self.server_type == "sglang":
239 | scores = self.sglang_request_reward(data)
240 | elif self.server_type == "vllm":
241 | scores = self.vllm_request_reward(data)
242 | elif self.server_type == "lmdeploy":
243 | scores = self.lmdeploy_request_reward(data)
244 | else:
245 | raise ValueError(f"Unsupported server type: {self.server_type}")
246 |
247 | return scores
248 |
249 |
250 | # Global variable to hold the RewardModelClient instance
251 | _reward_client = None
252 |
253 |
254 | def get_reward_client():
255 | """Get or create a RewardModelClient instance."""
256 | global _reward_client
257 | if _reward_client is None:
258 |
259 | _reward_client = RewardModelClient(
260 | path=MODEL_PATH,
261 | server_type=SERVER_TYPE,
262 | server_address=ADDRESS,
263 | )
264 |
265 | return _reward_client
266 |
267 |
268 | def extract_thinking_content(text: str) -> tuple[str, str]:
269 |     pattern = r'<think>(.*?)</think>(.*)'
270 | match = re.search(pattern, text, re.DOTALL)
271 | if match:
272 | thinking_content = match.group(1).strip()
273 | remaining_content = match.group(2).strip()
274 | return thinking_content, remaining_content
275 | return "", text
276 |
277 |
278 | def compute_score_batch(data_sources, solution_strs, ground_truths, extra_infos, prompt_key="prompt"):
279 | """Compute scores for a batch of data using the POLAR reward model for VERL.
280 |
281 | Args:
282 | data_sources: List of data sources.
283 | solution_strs: List of solution strings.
284 | ground_truths: List of ground truth strings or {"role": xxx, "content": xxx} messages.
285 |         extra_infos: List of extra-information dicts; each holds prompt_key (the chat-message
286 |             prompt shared by the policy model and POLAR) plus 'ability' and 'split' fields.
287 |
288 | Returns:
289 | scores: A list of computed scores for each data source.
290 | """
291 |
292 | legacy_eval = False
293 |
294 | batch_data = []
295 | for data_source, solution_str, ground_truth, extra_info in zip(
296 | data_sources, solution_strs, ground_truths, extra_infos, strict=True
297 | ):
298 |
299 | _, solution_str = extract_thinking_content(solution_str)
300 |
301 | if extra_info["ability"] == "math" and extra_info["split"] == "test":
302 | legacy_eval = True
303 |
304 | data = {
305 | "prompt": extra_info[prompt_key],
306 | "reference": ground_truth,
307 | "output": solution_str,
308 | "wrapper": "sft"
309 | }
310 | batch_data.append(data)
311 |
312 | if legacy_eval:
313 | # If the task is math, use rule-based rewards as test evaluation.
314 | from verl.utils.reward_score.math_verify import compute_score
315 |
316 | # For enhanced accuracy, we utilize Math-Verify (https://github.com/huggingface/Math-Verify).
317 | # Note: Math-Verify needs to be manually installed via pip: `pip install math-verify`.
318 | return [
319 | compute_score(item["output"], item["reference"])
320 | for item in batch_data
321 | ]
322 |
323 | client = get_reward_client()
324 |
325 | scores = client(batch_data)
326 |
327 | return scores
328 |
329 |
330 | if __name__ == "__main__":
331 | client = get_reward_client()
332 | data = [
333 | {
334 | "prompt": [{"role": "user", "content": "What is the capital of China?"}],
335 | "reference": [{"role": "assistant", "content": "Beijing."}],
336 | "output": [{"role": "assistant", "content": "Beijing."}]
337 | },
338 | {
339 | "prompt": [{"role": "user", "content": "What is the capital of China?"}],
340 | "reference": [{"role": "assistant", "content": "Beijing."}],
341 | "output": [{"role": "assistant", "content": "Shanghai."}]
342 | }
343 | ]
344 | scores = client(data)
345 | print(scores)
346 |
--------------------------------------------------------------------------------
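compute_score_batch above is the function the PPO scripts register via custom_reward_function.name. The sketch below calls it directly on a tiny batch; it is illustrative only and assumes src/polar is importable (installed or on PYTHONPATH) and that a reward server is reachable at the ADDRESS configured at the top of reward_func.py. The "demo"/"general"/"train" field values are made up, but the keys mirror what the function reads.

from polar.reward_func import compute_score_batch

prompt = [{"role": "user", "content": "What is the capital of China?"}]
extra_info = {"prompt": prompt, "ability": "general", "split": "train"}

scores = compute_score_batch(
    data_sources=["demo"],
    solution_strs=["Beijing."],
    ground_truths=[[{"role": "assistant", "content": "Beijing."}]],
    extra_infos=[extra_info],
)
print(scores)  # one reward score per sample, or None if the server request fails
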