├── .gitignore ├── .gitmodules ├── LICENSE ├── README.md ├── README_zh-CN.md ├── assets ├── intro.jpeg ├── logo.png └── result.png ├── examples ├── data_preprocess │ ├── am_general.py │ ├── full_hh_rlhf.py │ └── math.py ├── ppo │ ├── llama3-8b_general.sh │ ├── llama3-8b_hh-rlhf.sh │ ├── llama3-8b_math.sh │ ├── qwen2_5-7b_general.sh │ ├── qwen2_5-7b_hh-rlhf.sh │ ├── qwen2_5-7b_math.sh │ ├── qwen3-8b_general.sh │ ├── qwen3-8b_hh-rlhf.sh │ └── qwen3-8b_math.sh └── xtuner_configs │ ├── POLAR_1_8B_full_varlenattn_custom_dataset.py │ └── POLAR_7B_full_varlenattn_custom_dataset.py └── src └── polar ├── __init__.py └── reward_func.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[codz] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # UV 98 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | #uv.lock 102 | 103 | # poetry 104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 105 | # This is especially recommended for binary packages to ensure reproducibility, and is more 106 | # commonly ignored for libraries. 107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 108 | #poetry.lock 109 | #poetry.toml 110 | 111 | # pdm 112 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
113 | # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python. 114 | # https://pdm-project.org/en/latest/usage/project/#working-with-version-control 115 | #pdm.lock 116 | #pdm.toml 117 | .pdm-python 118 | .pdm-build/ 119 | 120 | # pixi 121 | # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control. 122 | #pixi.lock 123 | # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one 124 | # in the .venv directory. It is recommended not to include this directory in version control. 125 | .pixi 126 | 127 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 128 | __pypackages__/ 129 | 130 | # Celery stuff 131 | celerybeat-schedule 132 | celerybeat.pid 133 | 134 | # SageMath parsed files 135 | *.sage.py 136 | 137 | # Environments 138 | .env 139 | .envrc 140 | .venv 141 | env/ 142 | venv/ 143 | ENV/ 144 | env.bak/ 145 | venv.bak/ 146 | 147 | # Spyder project settings 148 | .spyderproject 149 | .spyproject 150 | 151 | # Rope project settings 152 | .ropeproject 153 | 154 | # mkdocs documentation 155 | /site 156 | 157 | # mypy 158 | .mypy_cache/ 159 | .dmypy.json 160 | dmypy.json 161 | 162 | # Pyre type checker 163 | .pyre/ 164 | 165 | # pytype static type analyzer 166 | .pytype/ 167 | 168 | # Cython debug symbols 169 | cython_debug/ 170 | 171 | # PyCharm 172 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 173 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 174 | # and can be added to the global gitignore or merged into this file. For a more nuclear 175 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 176 | #.idea/ 177 | 178 | # Abstra 179 | # Abstra is an AI-powered process automation framework. 180 | # Ignore directories containing user credentials, local state, and settings. 181 | # Learn more at https://abstra.io/docs 182 | .abstra/ 183 | 184 | # Visual Studio Code 185 | # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore 186 | # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore 187 | # and can be added to the global gitignore or merged into this file. However, if you prefer, 188 | # you could uncomment the following to ignore the entire vscode folder 189 | # .vscode/ 190 | 191 | # Ruff stuff: 192 | .ruff_cache/ 193 | 194 | # PyPI configuration file 195 | .pypirc 196 | 197 | # Cursor 198 | # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to 199 | # exclude from AI features like autocomplete and code analysis. 
Recommended for sensitive data 200 | # refer to https://docs.cursor.com/context/ignore-files 201 | .cursorignore 202 | .cursorindexingignore 203 | 204 | # Marimo 205 | marimo/_static/ 206 | marimo/_lsp/ 207 | __marimo__/ 208 | 209 | # outputs 210 | outputs/ 211 | 212 | # vscode 213 | .vscode/ 214 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "verl"] 2 | path = verl 3 | url = https://github.com/volcengine/verl 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2023-2025 Shanghai AI Laboratory 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | 203 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 204 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 205 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 206 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 207 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 208 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 209 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 210 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 211 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 212 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 213 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | 3 |
4 | 5 | 6 | [![license](https://img.shields.io/github/license/InternLM/xtuner.svg)](./LICENSE) 7 | [![xtuner](https://img.shields.io/badge/support-xtuner-blue)](https://github.com/InternLM/xtuner/) 8 | [![lmdeploy](https://img.shields.io/badge/lmdeploy-blue)](https://github.com/InternLM/lmdeploy/) 9 | [![sglang](https://img.shields.io/badge/sglang-blue)](https://github.com/sgl-project/sglang/) 10 | [![vllm](https://img.shields.io/badge/vllm-blue)](https://github.com/vllm-project/vllm/) 11 | [![verl](https://img.shields.io/badge/verl-blue)](https://github.com/volcengine/verl) 12 | 13 | 14 | [🤗 HuggingFace](https://huggingface.co/collections/internlm/polar-68693f829d2e83ac5e6e124a) | 15 | [🤖 ModelScope](https://www.modelscope.cn/organization/Shanghai_AI_Laboratory) | 16 | [📜 Paper](https://arxiv.org/abs/2507.05197)
17 | 18 | 19 | [English](./README.md) | 20 | [简体中文](./README_zh-CN.md) 21 | 22 |
 23 | 
 24 | # Latest News 🎉
 25 | 
 26 | - **[2025/09]** Our POLAR paper has been accepted by NeurIPS 2025.
 27 | - **[2025/09]** POLAR now supports RFT (Reinforcement Fine-tuning) training using VERL.
 28 | 
 29 | 
 30 | # Introduction
 31 | 
 32 | POLAR represents a significant breakthrough in scalar-based reward models, achieved through large-scale pre-training. It leverages the innovative **POL**icy Discrimin**A**tive Lea**R**ning (**POLAR**) paradigm, a scalable, high-level optimization objective, to effectively discriminate between policies using large-scale synthetic corpora. After pre-training, POLAR RMs are fine-tuned with minimal preference data and rapidly align with human preferences. Key features of POLAR include:
 33 | 
 34 | * **Innovative Pre-training Paradigm:** POLAR trains a reward model to recognize identical policies and discriminate between different ones. Unlike traditional reward modeling methods that rely on absolute preferences, POLAR captures the relative difference between two policies, a scalable, high-level optimization objective suitable for modeling generic ranking relationships.
 35 | 
 36 | * **Tailored for Reinforcement Fine-tuning:** POLAR assigns rewards to LLM trajectories based on given references, perfectly aligning with the Reinforcement Fine-tuning (RFT) framework. POLAR provides a promising solution for applying RFT in generic scenarios.
 37 | 
 38 | * **Superior Performance and Generalization:** POLAR achieves state-of-the-art results on downstream reinforcement learning tasks, consistently delivering accurate and reliable reward signals that generalize effectively to unseen scenarios and significantly reduce reward hacking.
 39 | 
 40 | * **Easy to Customize:** Pre-trained checkpoints of POLAR are available, enabling researchers to conveniently fine-tune the RM for various customized scenarios, thus facilitating straightforward adaptation and expansion tailored to specific applications and experimental requirements.
 41 | 
 42 | 
44 | 45 | 46 | # Model Zoo 47 | 48 | We release POLAR reward models in sizes of 1.8B and 7B parameters. The "base" models (POLAR-1.8B-Base and POLAR-7B-Base) refer to pre-trained-only checkpoints, ideal for customized fine-tuning according to specific preferences. The "ready-to-use" checkpoints (POLAR-1.8B and POLAR-7B) have been already fine-tuned on general preference data, making them suitable for immediate use in most scenarios. 49 | 50 | | Model | Transformers(HF) | ModelScope(HF) | 51 | | -------------------------- | ------------------------------------------ | ---------------------------------------- | 52 | | **POLAR-1.8B-Base** | [🤗 POLAR-1_8B-Base](https://huggingface.co/internlm/POLAR-1_8B-Base) | [🤖 POLAR-1_8B-Base](https://modelscope.cn/models/Shanghai_AI_Laboratory/POLAR-1_8B-Base/summary) | 53 | | **POLAR-1.8B** | [🤗 POLAR-1_8B](https://huggingface.co/internlm/POLAR-1_8B) | [🤖 POLAR-1_8B](https://modelscope.cn/models/Shanghai_AI_Laboratory/POLAR-1_8B/summary) | 54 | | **POLAR-7B-Base** | [🤗 POLAR-7B-Base](https://huggingface.co/internlm/POLAR-7B-Base) | [🤖 POLAR-7B-Base](https://modelscope.cn/models/Shanghai_AI_Laboratory/POLAR-7B-Base/summary) | 55 | | **POLAR-7B** | [🤗 POLAR-7B](https://huggingface.co/internlm/POLAR-7B) | [🤖 POLAR-7B](https://modelscope.cn/models/Shanghai_AI_Laboratory/POLAR-7B/summary) | 56 | 57 | 58 | # Performance 59 | 60 | We conducted a comprehensive evaluation of POLAR via the Proximal Policy Optimization (PPO) algorithm. We evaluate the downstream RL performances of four different policy models using [OpenCompass](https://github.com/internLM/OpenCompass/). More details are available in our [Paper](https://arxiv.org/abs/2507.05197). 61 | 62 |
 63 | 
 64 | # Quick Start
 65 | 
 66 | This repository provides a `RewardModelClient` class (`src/polar/reward_func.py`) for querying reward values from a remote POLAR server. It handles input encoding, communicates with different backends (sglang, vllm, lmdeploy), and returns the reward scores.
 67 | 
 68 | ```python
 69 | from src.polar import RewardModelClient
 70 | ```
 71 | 
 72 | Alternatively, you can use [XTuner](https://github.com/InternLM/xtuner)’s implementation by installing XTuner and importing the class from it.
 73 | 
 74 | ```python
 75 | from xtuner.utils import RewardModelClient
 76 | ```
 77 | 
 78 | For XTuner installation instructions, see the [Fine-tune](#fine-tune) section below.
 79 | 
 80 | ## Inference
 81 | 
 82 | We support reward inference through [lmdeploy](https://github.com/InternLM/lmdeploy/), [sglang](https://github.com/sgl-project/sglang/), and [vllm](https://github.com/vllm-project/vllm/). We recommend setting up a virtual environment with conda when using these inference engines to prevent potential dependency conflicts.
 83 | 
 84 | ### Data format
 85 | 
 86 | Unlike traditional reward models, POLAR requires an additional reference trajectory as a demonstration and evaluates candidate trajectories by measuring their consistency with the provided reference.
 87 | 
 88 | ```python
 89 | data = [
 90 |     {
 91 |         "prompt": [{"role": "user", "content": "What is the capital of China?"}],
 92 |         "reference": [{"role": "assistant", "content": "Beijing."}],
 93 |         "output": [{"role": "assistant", "content": "Beijing."}]
 94 |     },
 95 |     {
 96 |         "prompt": [{"role": "user", "content": "What is the capital of China?"}],
 97 |         "reference": [{"role": "assistant", "content": "Beijing."}],
 98 |         "output": [{"role": "assistant", "content": "Shanghai."}]
 99 |     }
100 | ]
101 | ```
102 | 
103 | ### Inference with transformers
104 | 
105 | #### Reward request
106 | To load the POLAR model using transformers, use the following code to get rewards:
107 | 
108 | ```python
109 | from transformers import AutoModel, AutoTokenizer
110 | from src.polar import RewardModelClient
111 | # from xtuner.utils import RewardModelClient
112 | 
113 | model_name = 'internlm/POLAR-7B'
114 | 
115 | model = AutoModel.from_pretrained(
116 |     model_name,
117 |     device_map="cuda",
118 |     trust_remote_code=True
119 | )
120 | tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
121 | 
122 | client = RewardModelClient(model_name)
123 | encoded_data = client.encode(data)
124 | batch = tokenizer(encoded_data, return_tensors='pt', padding=True).to('cuda')
125 | outputs = model(**batch)
126 | rewards = outputs[0].squeeze(-1).cpu().tolist()
127 | print(rewards)
128 | # [-0.5702977776527405, -11.030370712280273] for previous example data
129 | ```
130 | 
131 | ### Inference with lmdeploy
132 | 
133 | [LMDeploy](https://github.com/InternLM/lmdeploy) is a toolkit for compressing, deploying, and serving LLMs.
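If lmdeploy is not already installed, a minimal setup might look like the following. This is only a sketch: the environment name is an arbitrary placeholder, and the version bound comes from the requirement listed below.

```bash
# Hypothetical environment name; any name works.
conda create --name polar-lmdeploy python=3.10 -y
conda activate polar-lmdeploy
# Matches the lmdeploy >= 0.9.1 requirement below.
pip install "lmdeploy>=0.9.1"
```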
134 | 135 | #### Requirements 136 | 137 | - lmdeploy >= 0.9.1 138 | 139 | #### Server Launch 140 | 141 | ```bash 142 | lmdeploy serve api_server internlm/POLAR-7B --backend pytorch --server-port 30000 143 | ``` 144 | #### Client Request 145 | 146 | ```python 147 | from src.polar import RewardModelClient 148 | # from xtuner.utils import RewardModelClient 149 | 150 | client = RewardModelClient("internlm/POLAR-7B", 151 | server_type="lmdeploy", 152 | server_address="127.0.0.1:30000") 153 | 154 | # Request rewards directly 155 | rewards = client(data) 156 | print(rewards) 157 | 158 | # First encode data and then get rewards via the request function. 159 | encoded_data = client.encode(data) 160 | rewards = client.lmdeploy_request_reward(encoded_data) 161 | print(rewards) 162 | ``` 163 | 164 | ### Inference with sglang 165 | 166 | #### Requirements 167 | 168 | - 0.4.3.post4 <= sglang <= 0.4.4.post1 169 | 170 | #### Server Launch 171 | 172 | ```bash 173 | python3 -m sglang.launch_server --model internlm/POLAR-7B --trust-remote-code --is-embedding --dp 4 --tp 2 --mem-fraction-static 0.9 --port 30000 174 | ``` 175 | 176 | #### Client Request 177 | 178 | ```python 179 | from src.polar import RewardModelClient 180 | # from xtuner.utils import RewardModelClient 181 | 182 | client = RewardModelClient("internlm/POLAR-7B", 183 | server_type="sglang", 184 | server_address="127.0.0.1:30000") 185 | 186 | # Request rewards directly 187 | rewards = client(data) 188 | print(rewards) 189 | 190 | # First encode data and then get rewards via the request function. 191 | encoded_data = client.encode(data) 192 | rewards = client.sglang_request_reward(encoded_data) 193 | print(rewards) 194 | ``` 195 | 196 | ### Inference with vllm 197 | 198 | #### Requirements 199 | 200 | - vllm >= 0.8.0 201 | 202 | #### Server Launch 203 | 204 | ```bash 205 | vllm serve internlm/POLAR-7B --task=reward --trust-remote-code --tensor-parallel-size=2 --port 30000 206 | ``` 207 | 208 | #### Client Request 209 | 210 | ```python 211 | from src.polar import RewardModelClient 212 | # from xtuner.utils import RewardModelClient 213 | 214 | client = RewardModelClient("internlm/POLAR-7B", 215 | server_type="vllm", 216 | server_address="127.0.0.1:30000") 217 | 218 | # Request rewards directly 219 | rewards = client(data) 220 | print(rewards) 221 | 222 | # First encode data and then get rewards via the request function. 223 | encoded_data = client.encode(data) 224 | rewards = client.vllm_request_reward(encoded_data) 225 | print(rewards) 226 | ``` 227 | 228 | ## RFT with VERL 229 | 230 | POLAR can be easily integrated into various reinforcement learning frameworks. This repository provides an example showing how to use [VERL](https://github.com/volcengine/verl) for reinforcement fine-tuning (RFT) with POLAR reward models. 231 | 232 | ### Environment Setup 233 | 234 | Please refer to the [VERL official installation guide](https://github.com/volcengine/verl) for detailed environment setup instructions. 235 | 236 | > **Note**: For training Qwen2.5 series, we recommend using the inference backend **vLLM 0.8.3** and **Transformers 4.50.3** for optimal performance. A higher version of transformers may cause training instability of Qwen2.5 series. 
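If you hit such instability, one way to pin the recommended versions is shown below. This is a sketch for a pip-managed environment; adapt it to however you installed VERL and its inference backend.

```bash
# Pin the inference backend and transformers versions recommended above.
pip install "vllm==0.8.3" "transformers==4.50.3"
```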
237 | 
238 | ### Data Format
239 | 
240 | Training data should be in Parquet format with the following structure:
241 | ```python
242 | {
243 |     "data_source": "dataset_name",
244 |     "prompt": [{"role": "user", "content": "..."}, ...],
245 |     "ability": "ability_type",
246 |     "reward_model": {
247 |         "style": "polar",
248 |         "ground_truth": [{"role": "assistant", "content": "..."}]
249 |     },
250 |     "extra_info": {
251 |         # The same as prompt, kept for compatibility between verl and POLAR.
252 |         "prompt": [{"role": "user", "content": "..."}, ...],
253 |     }
254 | }
255 | ```
256 | 
257 | ### Training steps
258 | 
259 | - **Step 1:** POLAR Deployment
260 | 
261 |   Deploy the POLAR reward model following the above [Inference](#inference) instructions. Update the server configuration in `src/polar/reward_func.py`:
262 | 
263 |   ```python
264 |   # Config reward model server
265 |   ADDRESS = "your_server_ip:port"  # Modify according to your server address
266 |   SERVER_TYPE = "sglang"  # Options: "sglang", "vllm", "lmdeploy"
267 |   MODEL_PATH = "internlm/POLAR-7B"
268 |   ```
269 | 
270 | - **Step 2:** Data Preparation
271 | 
272 |   Prepare your training data in Parquet format. You can use the provided data preprocessing scripts:
273 | 
274 |   ```bash
275 |   # Example: Process HH-RLHF dataset
276 |   python examples/data_preprocess/full_hh_rlhf.py --local_dir ~/data/hh_rlhf
277 |   ```
278 | 
279 | - **Step 3:** Configure Training Script
280 | 
281 |   An example training script is provided at `examples/ppo/qwen2_5-7b_hh-rlhf.sh`.
282 | 
283 | - **Step 4:** Run Training
284 | 
285 |   ```bash
286 |   cd verl
287 |   bash ../examples/ppo/qwen2_5-7b_hh-rlhf.sh
288 |   ```
289 | 
290 | ### Results
291 | 
292 | Here we show the RFT results of Qwen3-8B trained with our [official config](https://github.com/InternLM/POLAR/blob/main/examples/ppo/qwen3-8b_general.sh) on the public [AM-DeepSeek-R1-0528-Distilled](https://huggingface.co/datasets/a-m-team/AM-DeepSeek-R1-0528-Distilled) dataset. We use [OpenCompass](https://github.com/internLM/OpenCompass/) for evaluation.
293 | 
294 | | Benchmark | Qwen3-8B w. thinking | Qwen3-8B w. thinking (RFT) |
295 | | --- | ---- | ---- |
296 | | alignment_bench | 7.04 | 7.48 |
297 | | alpaca_eval | 87.20 | 95.40 |
298 | | arenahard | 83.15 | 89.45 |
299 | | followbench | 0.93 | 0.95 |
300 | | mtbench | 8.73 | 8.78 |
301 | | wildbench | 58.43 | 72.09 |
302 | | mmlu | 86.06 | 86.58 |
303 | | mmlu_pro | 73.66 | 75.19 |
304 | | cmmlu | 82.72 | 83.07 |
305 | | bbeh | 29.56 | 33.30 |
306 | | korbench | 73.16 | 75.00 |
307 | | gpqa | 61.05 | 63.07 |
308 | | supergpqa | 47.82 | 49.67 |
309 | | olympiadbench | 69.90 | 70.45 |
310 | | aime2024 | 75.52 | 75.83 |
311 | | aime2025 | 67.50 | 68.71 |
312 | | mbpp | 83.66 | 93.00 |
313 | | lcb-code | 46.86 | 48.57 |
314 | 
315 | 
316 | ## Fine-tune
317 | 
318 | You can use the latest [xtuner](https://github.com/InternLM/xtuner) to fine-tune POLAR. XTuner is an efficient, flexible, and full-featured toolkit for fine-tuning LLMs.
319 | 
320 | - It is recommended to build a Python 3.10 virtual environment using conda
321 | 
322 | ```bash
323 | conda create --name xtuner-env python=3.10 -y
324 | conda activate xtuner-env
325 | ```
326 | 
327 | - Install xtuner via pip
328 | 
329 | ```shell
330 | pip install 'xtuner[deepspeed]'==0.2.0
331 | ```
332 | 
333 | - Alternatively, install xtuner from the latest source code
334 | 
335 | ```shell
336 | pip install 'git+https://github.com/InternLM/xtuner.git@main#egg=xtuner[deepspeed]'
337 | ```
338 | 
339 | ### Requirements
340 | 
341 | - flash_attn
342 | - tensorboard
343 | 
344 | ### Data format
345 | 
346 | Unlike traditional reward models, POLAR requires an additional reference trajectory as a demonstration during fine-tuning, along with a chosen trajectory and a rejected trajectory. You can construct your fine-tuning data in a `train.jsonl` file, formatted as follows:
347 | 
348 | ```json
349 | {
350 |   "prompt": [{"role": "user", "content": "What is the capital of China?"}],
351 |   "reference": [{"role": "assistant", "content": "Beijing."}],
352 |   "chosen": [{"role": "assistant", "content": "Beijing."}],
353 |   "rejected": [{"role": "assistant", "content": "Shanghai."}]
354 | }
355 | ```
356 | 
357 | ### Training steps
358 | 
359 | - **Step 0:** Prepare the config. We provide example ready-to-use configs [here](./examples/xtuner_configs/POLAR_7B_full_varlenattn_custom_dataset.py). If the provided configs do not meet your requirements, copy one of them and modify it following the [xtuner guideline](https://github.com/InternLM/xtuner/blob/main/docs/en/get_started/quickstart.md). For more details on reward model training settings, please see the xtuner [reward model guideline](https://github.com/InternLM/xtuner/blob/main/docs/en/reward_model/modify_settings.md).
360 | 
361 | - **Step 1:** Start fine-tuning.
362 | 
363 |   ```shell
364 |   xtuner train ${CONFIG_FILE_PATH}
365 |   ```
366 | 
367 |   For example, you can start the fine-tuning of POLAR-7B-Base by
368 | 
369 |   ```shell
370 |   # On a single GPU
371 |   xtuner train ./examples/xtuner_configs/POLAR_7B_full_varlenattn_custom_dataset.py --deepspeed deepspeed_zero2
372 | 
373 |   # On multiple GPUs
374 |   NPROC_PER_NODE=${GPU_NUM} xtuner train ./examples/xtuner_configs/POLAR_7B_full_varlenattn_custom_dataset.py --deepspeed deepspeed_zero2
375 |   ```
376 | 
377 |   Here, `--deepspeed` means using [DeepSpeed](https://github.com/microsoft/DeepSpeed) to optimize the training. XTuner comes with several integrated strategies, including ZeRO-1, ZeRO-2, and ZeRO-3. If you wish to disable this feature, simply remove this argument.
378 | 
379 | - **Step 2:** Convert the saved PTH model (if using DeepSpeed, it will be a directory) to a Hugging Face model by
380 | 
381 |   ```shell
382 |   xtuner convert pth_to_hf ${CONFIG_FILE_PATH} ${PTH} ${SAVE_PATH}
383 |   ```
384 | 
385 | # Examples
386 | 
387 | ## Closed-ended questions
388 | 
389 | ```python
390 | from src.polar import RewardModelClient
391 | # from xtuner.utils import RewardModelClient
392 | 
393 | prompt = "How many 'r's are there in the word 'strawberry'?"
394 | reference = "There are 3 'r's in the word 'strawberry'. Here's how we can count them: 's', 't', 'r', 'a', 'w', 'b', 'e', 'r', 'r', 'y'. So, the answer is 3."
395 | outputs = [
396 |     # Same as the reference response.
397 |     "There are 3 'r's in the word 'strawberry'. Here's how we can count them: 's', 't', 'r', 'a', 'w', 'b', 'e', 'r', 'r', 'y'. So, the answer is 3.",
398 |     # Correct answer with correct thoughts.
399 | "Let's count the 'r's in 'strawberry': 's', 't', 'r', 'a', 'w', 'b', 'e', 'r', 'r', 'y'. There are three 'r's, so the answer is three.", 400 | # Wrong answer with wrong thoughts. 401 | "Let's count the 'r's in 'strawberry': 's', 't', 'r', 'a', 'w', 'b', 'e', 'r', 'r', 'y'. There are two 'r's, so the answer is two.", 402 | # Wrong answer with correct thoughts. 403 | "Let's count the 'r's in 'strawberry': 's', 't', 'r', 'a', 'w', 'b', 'e', 'r', 'r', 'y'. There are three 'r's, so the answer is two.", 404 | # Correct answer with wrong thoughts. 405 | "Let's count the 'r's in 'strawberry': 's', 't', 'r', 'a', 'w', 'b', 'e', 'r', 'r', 'y'. There are two 'r's, so the answer is three.", 406 | # Correct answer without thoughts. 407 | "There are 3 'r's in the word 'strawberry'.", 408 | # Wrong answer without thoughts. 409 | "There are 2 'r's in the word 'strawberry'.", 410 | ] 411 | data = [{"prompt": prompt, "reference": reference, "output": output} for output in outputs] 412 | 413 | client = RewardModelClient("internlm/POLAR-7B", server_type="sglang", server_address="127.0.0.1:30000") 414 | rewards = client(data) 415 | 416 | sorted_res = sorted(zip(outputs, rewards), key=lambda x: x[1], reverse=True) 417 | 418 | for output, reward in sorted_res: 419 | print(f"Output: {output}\nReward: {reward}\n") 420 | ``` 421 | 422 | ```txt 423 | Output: There are 3 'r's in the word 'strawberry'. Here's how we can count them: 's', 't', 'r', 'a', 'w', 'b', 'e', 'r', 'r', 'y'. So, the answer is 3. 424 | Reward: 0.054595947265625 425 | 426 | Output: Let's count the 'r's in 'strawberry': 's', 't', 'r', 'a', 'w', 'b', 'e', 'r', 'r', 'y'. There are three 'r's, so the answer is three. 427 | Reward: -2.005859375 428 | 429 | Output: There are 3 'r's in the word 'strawberry'. 430 | Reward: -6.70703125 431 | 432 | Output: Let's count the 'r's in 'strawberry': 's', 't', 'r', 'a', 'w', 'b', 'e', 'r', 'r', 'y'. There are two 'r's, so the answer is three. 433 | Reward: -7.10546875 434 | 435 | Output: Let's count the 'r's in 'strawberry': 's', 't', 'r', 'a', 'w', 'b', 'e', 'r', 'r', 'y'. There are three 'r's, so the answer is two. 436 | Reward: -7.1328125 437 | 438 | Output: Let's count the 'r's in 'strawberry': 's', 't', 'r', 'a', 'w', 'b', 'e', 'r', 'r', 'y'. There are two 'r's, so the answer is two. 439 | Reward: -8.46875 440 | 441 | Output: There are 2 'r's in the word 'strawberry'. 442 | Reward: -10.8203125 443 | ``` 444 | 445 | ## Open-ended questions 446 | ```python 447 | from src.polar import RewardModelClient 448 | # from xtuner.utils import RewardModelClient 449 | 450 | prompt = "Summarize the first book of Frank Herbert’s Dune in one witty short sentence." 451 | reference = "Royal teen discovers that life’s a beach—minus the ocean, plus spice, giant sandworms and deadly politics." 452 | outputs = [ 453 | # Same as the reference response. 454 | "Royal teen discovers that life’s a beach—minus the ocean, plus spice, giant sandworms and deadly politics.", 455 | # Closely resembles the reference response but includes factual errors. 456 | "Royal teen discovers that life’s a beach—minus the ocean, plus magic, dark wizards and deadly politics.", 457 | # A distinct yet concise and witty summary that draws analogies from other dramas—markedly different from the reference response. 458 | "Young noble’s move to desert planet turns into galactic Game of Thrones with fewer dragons, more worms.", 459 | # A concise summary, but lacking wit—fails to meet the requirement. 
460 | "A noble family’s fall sparks a young heir’s rise as a leader on a harsh desert planet governed by prophecy and survival.", 461 | # A witty summary, but overly long—fails to meet the requirement. 462 | "Paul Atreides loses his father, gains prophetic powers, learns to ride a sandworm, leads a holy war, and discovers that being the chosen one comes with a lot of blood, sand, and questionable decisions.", 463 | # A concise and witty summary that draws from multiple Dune books rather than just the first—fails to follow the instruction. 464 | "Boy gets planet, becomes god, loses soul — family drama ensues across galaxies." 465 | ] 466 | data = [{"prompt": prompt, "reference": reference, "output": output} for output in outputs] 467 | 468 | client = RewardModelClient("internlm/POLAR-7B", server_type="sglang", server_address="127.0.0.1:30000") 469 | rewards = client(data) 470 | 471 | sorted_res = sorted(zip(outputs, rewards), key=lambda x: x[1], reverse=True) 472 | 473 | for output, reward in sorted_res: 474 | print(f"Output: {output}\nReward: {reward}\n") 475 | ``` 476 | 477 | ```txt 478 | Output: Royal teen discovers that life’s a beach—minus the ocean, plus spice, giant sandworms and deadly politics. 479 | Reward: 0.466552734375 480 | 481 | Output: Young noble’s move to desert planet turns into galactic Game of Thrones with fewer dragons, more worms. 482 | Reward: -6.91796875 483 | 484 | Output: Royal teen discovers that life’s a beach—minus the ocean, plus magic, dark wizards and deadly politics. 485 | Reward: -7.70703125 486 | 487 | Output: Paul Atreides loses his father, gains prophetic powers, learns to ride a sandworm, leads a holy war, and discovers that being the chosen one comes with a lot of blood, sand, and questionable decisions. 488 | Reward: -8.4296875 489 | 490 | Output: A noble family’s fall sparks a young heir’s rise as a leader on a harsh desert planet governed by prophecy and survival. 491 | Reward: -8.6484375 492 | 493 | Output: Boy gets planet, becomes god, loses soul — family drama ensues across galaxies. 494 | Reward: -10.359375 495 | ``` 496 | 497 | # License 498 | 499 | Code and model weights are licensed under Apache-2.0. 500 | 501 | # Citation 502 | 503 | ``` 504 | @article{dou2025pretrained, 505 | title={Pre-Trained Policy Discriminators are General Reward Models}, 506 | author={Dou, Shihan and Liu, Shichun and Yang, Yuming and Zou, Yicheng and Zhou, Yunhua and Xing, Shuhao and Huang, Chenhao and Ge, Qiming and Song, Demin and Lv, Haijun and others}, 507 | journal={arXiv preprint arXiv:2507.05197}, 508 | year={2025} 509 | } 510 | ``` 511 | -------------------------------------------------------------------------------- /README_zh-CN.md: -------------------------------------------------------------------------------- 1 |
2 | 3 |
4 | 5 | 6 | [![license](https://img.shields.io/github/license/InternLM/xtuner.svg)](./LICENSE) 7 | [![xtuner](https://img.shields.io/badge/support-xtuner-blue)](https://github.com/InternLM/xtuner/) 8 | [![lmdeploy](https://img.shields.io/badge/lmdeploy-blue)](https://github.com/InternLM/lmdeploy/) 9 | [![sglang](https://img.shields.io/badge/sglang-blue)](https://github.com/sgl-project/sglang/) 10 | [![vllm](https://img.shields.io/badge/vllm-blue)](https://github.com/vllm-project/vllm/) 11 | [![verl](https://img.shields.io/badge/verl-blue)](https://github.com/volcengine/verl) 12 | 13 | 14 | [🤗 HuggingFace](https://huggingface.co/collections/internlm/polar-68693f829d2e83ac5e6e124a) | 15 | [🤖 ModelScope](https://www.modelscope.cn/organization/Shanghai_AI_Laboratory) | 16 | [📜 Paper](https://arxiv.org/abs/2507.05197)
17 | 18 | 19 | [English](./README.md) | 20 | [简体中文](./README_zh-CN.md) 21 | 22 |
23 | 24 | # 最新进展 🎉 25 | 26 | - **[2025/09]** POLAR 论文现已被 Neurips 2025 会议接收。 27 | - **[2025/09]** POLAR 现已支持使用 VERL 进行 RFT(强化微调)训练。 28 | 29 | 30 | # 简介 31 | 32 | POLAR 是一个经过大规模预训练的奖励模型,在训练范式和模型性能上取得了重大突破。我们利用全新的策略判别学习方法(Policy Discriminative Learning,POLAR),使用大规模合成语料进行高效扩展预训练,使奖励模型能够有效区分不同的语言模型和策略分布。经过预训练的 POLAR 可通过少量的偏好数据进行微调,以快速对齐人类偏好。POLAR 的主要特点包括: 33 | 34 | * **全新的预训练范式**:POLAR 让奖励模型学会识别相同的策略并区分不同的策略。与传统的依赖绝对偏好的奖励建模方法不同,POLAR 能够学习两个策略之间的相对差异,是一种可扩展的、高层次的优化目标。 35 | 36 | * **专为强化学习微调(RFT)设计:** POLAR 根据给定的参考答案为语言模型的输出打分,完美契合强化学习微调(RFT)框架,为强化学习微调在通用场景的应用提供了一种有效解决方案。 37 | 38 | * **卓越的性能与泛化能力:** POLAR 在下游强化学习任务中展现出领先的水平,可稳定地提供准确可靠的奖励信号。POLAR 具有极强的泛化能力,可有效泛化到分布外场景,并显著减少奖励黑客(Reward Hacking)的现象。 39 | 40 | * **易于定制化:** 我们提供了 POLAR 的预训练权重(POLAR-Base)。研究人员可以根据自身需求,便捷地对其进行微调以适配各种定制化场景。 41 | 42 |

43 | 44 | 45 | # 模型库 46 | 47 | 此次发布的 POLAR 模型参数规模分别为 1.8B 和 7B。**POLAR-1.8B-Base** 和 **POLAR-7B-Base** 是仅经过预训练阶段的权重,适合根据特定需求进行微调。**POLAR-1.8B** 和 **POLAR-7B** 是经过偏好微调的奖励模型,可开箱即用,适用于大部分通用场景。 48 | 49 | | 模型 | Transformers(HF) | ModelScope(HF) | 50 | | -------------------------- | ------------------------------------------ | ---------------------------------------- | 51 | | **POLAR-1.8B-Base** | [🤗 POLAR-1_8B-Base](https://huggingface.co/internlm/POLAR-1_8B-Base) | [🤖 POLAR-1_8B-Base](https://modelscope.cn/models/Shanghai_AI_Laboratory/POLAR-1_8B-Base/summary) | 52 | | **POLAR-1.8B** | [🤗 POLAR-1_8B](https://huggingface.co/internlm/POLAR-1_8B) | [🤖 POLAR-1_8B](https://modelscope.cn/models/Shanghai_AI_Laboratory/POLAR-1_8B/summary) | 53 | | **POLAR-7B-Base** | [🤗 POLAR-7B-Base](https://huggingface.co/internlm/POLAR-7B-Base) | [🤖 POLAR-7B-Base](https://modelscope.cn/models/Shanghai_AI_Laboratory/POLAR-7B-Base/summary) | 54 | | **POLAR-7B** | [🤗 POLAR-7B](https://huggingface.co/internlm/POLAR-7B) | [🤖 POLAR-7B](https://modelscope.cn/models/Shanghai_AI_Laboratory/POLAR-7B/summary) | 55 | 56 | 57 | # 性能 58 | 59 | 我们通过 Proximal Policy Optimization(PPO)算法对 POLAR 的使用效果进行了验证,评测了四种语言模型的下游强化学习性能,评测工具是 [OpenCompass](https://github.com/internLM/OpenCompass/) 。详细信息请参阅[论文](https://arxiv.org/abs/2507.05197)。 60 | 61 |
62 | 63 | # 快速开始 64 | 65 | ## 安装 66 | 67 | 本仓库提供了一个`RewardModelClient`类(`src/polar/reward_func.py`),用于向远程 POLAR 服务请求奖励分数。该类负责对输入的文本进行编码,支持与多种推理后端(sglang、vllm、lmdeploy)进行通信,并返回奖励分数。 68 | 69 | ```python 70 | from src.polar import RewardModelClient 71 | ``` 72 | 73 | 您也可以选择使用 [XTuner](https://github.com/InternLM/xtuner) 提供的实现,只需安装 XTuner 并从中导入该类: 74 | 75 | ```python 76 | from xtuner.utils import RewardModelClient 77 | ``` 78 | 79 | 关于 XTuner 的安装方法,请参考下方的[偏好微调](#偏好微调)部分。 80 | 81 | 82 | ## 推理 83 | 84 | 我们支持通过 [lmdeploy](https://github.com/InternLM/lmdeploy/)、[sglang](https://github.com/sgl-project/sglang/)、[vllm](https://github.com/vllm-project/vllm/) 对 POLAR 进行推理并获取奖励信号。建议在使用这些推理引擎时,创建 conda 虚拟环境,以避免可能出现的依赖冲突问题。 85 | 86 | ### 数据格式 87 | 88 | 与传统奖励模型不同,POLAR 需要额外的参考答案。POLAR 对模型输出轨迹与参考答案的一致性进行评估,并给出奖励分数。 89 | 90 | ```python 91 | data = [ 92 | { 93 | "prompt": [{"role": "user", "content": "What is the capital of China?"}], 94 | "reference": [{"role": "assistant", "content": "Beijing."}], 95 | "output": [{"role": "assistant", "content": "Beijing."}] 96 | }, 97 | { 98 | "prompt": [{"role": "user", "content": "What is the capital of China?"}], 99 | "reference": [{"role": "assistant", "content": "Beijing."}], 100 | "output": [{"role": "assistant", "content": "Shanghai."}] 101 | } 102 | ] 103 | ``` 104 | 105 | ### 使用 transformers 进行推理 106 | 107 | #### 示例代码 108 | 109 | ```python 110 | from transformers import AutoModel, AutoTokenizer 111 | from src.polar import RewardModelClient 112 | # from xtuner.utils import RewardModelClient 113 | 114 | model_name = 'internlm/POLAR-7B' 115 | 116 | model = AutoModel.from_pretrained( 117 | model_name, 118 | device_map="cuda", 119 | trust_remote_code=True 120 | ) 121 | tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 122 | 123 | client = RewardModelClient(model_name) 124 | encoded_data = client.encode(data) 125 | batch = tokenizer(encoded_data, return_tensors='pt', padding=True).to('cuda') 126 | outputs = model(**batch) 127 | rewards = outputs[0].squeeze(-1).cpu().tolist() 128 | print(rewards) 129 | # [-0.5702977776527405, -11.030370712280273] for previous example data 130 | ``` 131 | 132 | ### 使用 lmdeploy 进行推理 133 | 134 | [LMDeploy](https://github.com/InternLM/lmdeploy) 是一个高效压缩、部署语言模型的工具。 135 | 136 | #### 环境依赖 137 | 138 | - lmdeploy >= 0.9.1 139 | 140 | #### 启动服务端 141 | 142 | ```bash 143 | lmdeploy serve api_server internlm/POLAR-7B --backend pytorch --server-port 30000 144 | ``` 145 | #### 客户端请求示例 146 | 147 | ```python 148 | from src.polar import RewardModelClient 149 | # from xtuner.utils import RewardModelClient 150 | 151 | client = RewardModelClient("internlm/POLAR-7B", 152 | server_type="lmdeploy", 153 | server_address="127.0.0.1:30000") 154 | 155 | # Request rewards directly 156 | rewards = client(data) 157 | print(rewards) 158 | 159 | # First encode data and then get rewards via the request function. 
160 | encoded_data = client.encode(data) 161 | rewards = client.lmdeploy_request_reward(encoded_data) 162 | print(rewards) 163 | ``` 164 | 165 | ### 使用 sglang 进行推理 166 | 167 | #### 环境依赖 168 | 169 | - 0.4.3.post4 <= sglang <= 0.4.4.post1 170 | 171 | #### 启动服务端 172 | 173 | ```bash 174 | python3 -m sglang.launch_server --model internlm/POLAR-7B --trust-remote-code --is-embedding --dp 4 --tp 2 --mem-fraction-static 0.9 --port 30000 175 | ``` 176 | 177 | #### 客户端请求示例 178 | 179 | ```python 180 | from src.polar import RewardModelClient 181 | # from xtuner.utils import RewardModelClient 182 | 183 | client = RewardModelClient("internlm/POLAR-7B", 184 | server_type="sglang", 185 | server_address="127.0.0.1:30000") 186 | 187 | # Request rewards directly 188 | rewards = client(data) 189 | print(rewards) 190 | 191 | # First encode data and then get rewards via the request function. 192 | encoded_data = client.encode(data) 193 | rewards = client.sglang_request_reward(encoded_data) 194 | print(rewards) 195 | ``` 196 | 197 | ### 使用 vllm 进行推理 198 | 199 | #### 环境依赖 200 | 201 | - vllm >= 0.8.0 202 | 203 | #### 启动服务端 204 | 205 | ```bash 206 | vllm serve internlm/POLAR-7B --task=reward --trust-remote-code --tensor-parallel-size=2 --port 30000 207 | ``` 208 | 209 | #### 客户端请求示例 210 | 211 | ```python 212 | from src.polar import RewardModelClient 213 | # from xtuner.utils import RewardModelClient 214 | 215 | client = RewardModelClient("internlm/POLAR-7B", 216 | server_type="vllm", 217 | server_address="127.0.0.1:30000") 218 | 219 | # Request rewards directly 220 | rewards = client(data) 221 | print(rewards) 222 | 223 | # First encode data and then get rewards via the request function. 224 | encoded_data = client.encode(data) 225 | rewards = client.vllm_request_reward(encoded_data) 226 | print(rewards) 227 | ``` 228 | 229 | ## 使用 VERL 进行强化微调(RFT) 230 | 231 | POLAR 可以方便地接入各类强化学习训练框架。本仓库提供了一个示例,演示如何结合 [VERL](https://github.com/volcengine/verl) 与 POLAR 奖励模型进行强化微调(RFT)。 232 | 233 | ### 环境配置 234 | 235 | 详细的环境配置方法请参考 [VERL 官方安装指南](https://github.com/volcengine/verl)。 236 | 237 | > **注意**: 在训练 Qwen2.5 系列模型时,推荐使用推理后端 **vLLM 0.8.3** 搭配 **Transformers 4.50.3**,以获得最佳性能。更高版本的 Transformers 可能会导致 Qwen2.5 系列训练不稳定。 238 | 239 | ### 数据格式 240 | 241 | 训练数据应为 Parquet 格式,结构如下: 242 | ```python 243 | { 244 | "data_source": "dataset_name", 245 | "prompt": [{"role": "user", "content": "..."}, ...], 246 | "ability": "alility_type", 247 | "reward_model": { 248 | "style": "polar", 249 | "ground_truth": [{"role": "assistant", "content": "..."}] 250 | } 251 | "extra_info": { 252 | # 与 prompt 相同,用于兼容 VERL 与 POLAR 253 | "prompt": [{"role": "user", "content": "..."}, ...], 254 | } 255 | } 256 | ``` 257 | 258 | ### 训练步骤 259 | 260 | - **Step 1:** 部署 POLAR 261 | 262 | 按照上述[推理](#推理)部分的说明,启动 POLAR 奖励模型服务,并在 `src/polar/reward_func.py` 中更新服务配置: 263 | 264 | ```python 265 | # 配置奖励模型服务 266 | ADDRESS = "your_server_ip:port" # 修改为实际的服务器地址 267 | SERVER_TYPE = "sglang" # 可选:"sglang", "vllm", "lmdeploy" 268 | MODEL_PATH = "internlm/POLAR-7B" 269 | ``` 270 | 271 | - **Step 2:** 数据准备 272 | 273 | 将训练数据准备为 Parquet 格式,可使用提供的预处理脚本: 274 | 275 | ```bash 276 | # 示例:处理 HH-RLHF 数据集 277 | python examples/data_preprocess/full_hh_rlhf.py --local_dir ~/data/hh_rlhf 278 | ``` 279 | 280 | - **Step 3:** 配置训练脚本 281 | 282 | 示例训练脚本可参考:`examples/ppo/qwen2_5-7b_hh-rlhf.sh`. 
283 | 284 | - **Step 4:** 启动训练 285 | 286 | ```bash 287 | cd verl 288 | bash ../examples/ppo/qwen2_5-7b_hh-rlhf.sh 289 | ``` 290 | 291 | ### 参考结果 292 | 293 | 这里展示了使用 POLAR-7B 对 Qwen3-8B 进行强化微调的结果,使用了我们提供的[官方配置](https://github.com/InternLM/POLAR/blob/main/examples/ppo/qwen3-8b_general.sh), 以及开源的 [AM-DeepSeek-R1-0528-Distilled](https://huggingface.co/datasets/a-m-team/AM-DeepSeek-R1-0528-Distilled) 数据集. 评估过程由 [OpenCompass](https://github.com/internLM/OpenCompass/) 完成。 294 | 295 | | 评测集 | Qwen3-8B 思考模式 | Qwen3-8B 思考模式 (RFT) | 296 | | --- | ---- | ---- | 297 | | alignment_bench | 7.04 | 7.48 | 298 | | alpaca_eval | 87.20 | 95.40 | 299 | | arenahard | 83.15 | 89.45 | 300 | | followbench | 0.93 | 0.95 | 301 | | mtbench | 8.73 | 8.78 | 302 | | wildbench | 58.43 | 72.09 | 303 | | mmlu | 86.06 | 86.58 | 304 | | mmlu_pro | 73.66 | 75.19 | 305 | | cmmlu | 82.72 | 83.07 | 306 | | bbeh | 29.56 | 33.30 | 307 | | korbench | 73.16 | 75.00 | 308 | | gpqa | 61.05 | 63.07 | 309 | | supergpqa | 47.82 | 49.67 | 310 | | olympiadbench | 69.90 | 70.45 | 311 | | aime2024 | 75.52 | 75.83 | 312 | | aime2025 | 67.50 | 68.71 | 313 | | mbpp | 83.66 | 93.00 | 314 | | lcb-code | 46.86 | 48.57 | 315 | 316 | ## 偏好微调 317 | 318 | 推荐使用最新的 [xtuner](https://github.com/InternLM/xtuner) 来微调 POLAR。xtuner 是一个高效、灵活、具有多种使用特性的语言模型微调工具。 319 | 320 | - 建议使用 conda 创建 Python-3.10 虚拟环境: 321 | 322 | ```bash 323 | conda create --name xtuner-env python=3.10 -y 324 | conda activate xtuner-env 325 | ``` 326 | 327 | - 通过 pip 安装 xtuner: 328 | 329 | ```shell 330 | pip install 'xtuner[deepspeed]'==0.2.0 331 | ``` 332 | 333 | - 通过最新源码安装 xtuner: 334 | 335 | ```shell 336 | pip install 'git+https://github.com/InternLM/xtuner.git@main#egg=xtuner[deepspeed]' 337 | ``` 338 | 339 | ### 环境依赖 340 | 341 | - flash_attn 342 | - tensorboard 343 | 344 | ### 数据格式 345 | 346 | 与传统的奖励模型不同,除了 chosen 轨迹和 rejected 轨迹,POLAR 在微调过程中还需要一个额外的参考答案作为示范。你可以通过构建一个 `train.jsonl` 的文件来准备微调数据,格式如下: 347 | 348 | ```json 349 | { 350 | "prompt": [{"role": "user", "content": "What is the capital of China?"}], 351 | "reference": [{"role": "assistant", "content": "Beijing."}], 352 | "chosen": [{"role": "assistant", "content": "Beijing."}], 353 | "rejected": [{"role": "assistant", "content": "Shanghai."}] 354 | } 355 | ``` 356 | 357 | ### 训练步骤 358 | 359 | - **第一步:** 准备配置文件。我们提供了可直接使用的[示例配置](./examples/xtuner_configs/POLAR_7B_full_varlenattn_custom_dataset.py)。如果需要进一步对超参进行修改,请复制一份示例配置文件,并根据 [xtuner 使用指南](https://github.com/InternLM/xtuner/blob/main/docs/en/get_started/quickstart.md) 进行修改。有关奖励模型训练设置的更多信息,请参考 [xtuner 奖励模型](https://github.com/InternLM/xtuner/blob/main/docs/en/reward_model/modify_settings.md)。 360 | 361 | - **第二步:** 启动微调。 362 | 363 | ```shell 364 | xtuner train ${CONFIG_FILE_PATH} 365 | ``` 366 | 367 | 例如,你可以按照如下的方式微调 POLAR-7B-Base: 368 | ```shell 369 | # On a single GPU 370 | xtuner train ./examples/xtuner_configs/POLAR_7B_full_varlenattn_custom_dataset.py --deepspeed deepspeed_zero2 371 | 372 | # On multiple GPUs 373 | NPROC_PER_NODE=${GPU_NUM} xtuner train ./examples/xtuner_configs/POLAR_7B_full_varlenattn_custom_dataset.py --deepspeed deepspeed_zero2 374 | ``` 375 | 376 | 这里,`--deepspeed` 表示使用 [DeepSpeed](https://github.com/microsoft/DeepSpeed) 来加速训练。xtuner 内置了多种 DeepSpeed 策略,包括 ZeRO-1、ZeRO-2 和 ZeRO-3。如果您想禁用此功能,只需移除此参数即可。 377 | 378 | - **第三步:** 将保存的 PTH 模型(若使用 DeepSpeed,则保存结果会是一个目录)转换为 HuggingFace 模型,命令如下: 379 | 380 | ```shell 381 | xtuner convert pth_to_hf ${CONFIG_FILE_PATH} ${PTH} ${SAVE_PATH} 382 | ``` 383 |
384 | 385 | # 效果示例 386 | 387 | ## 客观问答 388 | 389 | ```python 390 | from src.polar import RewardModelClient 391 | # from xtuner.utils import RewardModelClient 392 | 393 | prompt = "单词“strawberry”中有几个“r”?" 394 | reference = "单词“strawberry”中包含3个字母“r”。我们可以逐字母数一下:“s”、“t”、“r”、“a”、“w”、“b”、“e”、“r”、“r”、“y”。因此,答案是3。" 395 | outputs = [ 396 | # 与参考完全一致 397 | "单词“strawberry”中包含3个字母“r”。我们可以逐字母数一下:“s”、“t”、“r”、“a”、“w”、“b”、“e”、“r”、“r”、“y”。因此,答案是3。", 398 | # 思路正确,答案正确 399 | "我们来数一数单词“strawberry”中有几个“r”:“s”、“t”、“r”、“a”、“w”、“b”、“e”、“r”、“r”、“y”。这里一共有三个“r”,因此答案是三。", 400 | # 思路错误,答案错误 401 | "我们来数一数单词“strawberry”中有几个“r”:“s”、“t”、“r”、“a”、“w”、“b”、“e”、“r”、“r”、“y”。这里一共有两个“r”,因此答案是二。", 402 | # 思路错误,答案正确 403 | "我们来数一数单词“strawberry”中有几个“r”:“s”、“t”、“r”、“a”、“w”、“b”、“e”、“r”、“r”、“y”。这里一共有两个“r”,因此答案是三。", 404 | # 思路正确,答案错误 405 | "我们来数一数单词“strawberry”中有几个“r”:“s”、“t”、“r”、“a”、“w”、“b”、“e”、“r”、“r”、“y”。这里一共有三个“r”,因此答案是二。", 406 | # 答案正确 407 | "单词“strawberry”中有3个“r”", 408 | # 答案错误 409 | "单词“strawberry”中有2个“r”" 410 | ] 411 | data = [{"prompt": prompt, "reference": reference, "output": output} for output in outputs] 412 | 413 | client = RewardModelClient("internlm/POLAR-7B", server_type="sglang", server_address="127.0.0.1:30000") 414 | rewards = client(data) 415 | 416 | sorted_res = sorted(zip(outputs, rewards), key=lambda x: x[1], reverse=True) 417 | 418 | for output, reward in sorted_res: 419 | print(f"Output: {output}\nReward: {reward}\n") 420 | ``` 421 | 422 | ```txt 423 | Output: 单词“strawberry”中包含3个字母“r”。我们可以逐字母数一下:“s”、“t”、“r”、“a”、“w”、“b”、“e”、“r”、“r”、“y”。因此,答案是3。 424 | Reward: -1.5380859375 425 | 426 | Output: 我们来数一数单词“strawberry”中有几个“r”:“s”、“t”、“r”、“a”、“w”、“b”、“e”、“r”、“r”、“y”。这里一共有三个“r”,因此答案是三。 427 | Reward: -2.767578125 428 | 429 | Output: 单词“strawberry”中有3个“r” 430 | Reward: -7.45703125 431 | 432 | Output: 我们来数一数单词“strawberry”中有几个“r”:“s”、“t”、“r”、“a”、“w”、“b”、“e”、“r”、“r”、“y”。这里一共有三个“r”,因此答案是二。 433 | Reward: -7.6328125 434 | 435 | Output: 我们来数一数单词“strawberry”中有几个“r”:“s”、“t”、“r”、“a”、“w”、“b”、“e”、“r”、“r”、“y”。这里一共有两个“r”,因此答案是三。 436 | Reward: -8.65625 437 | 438 | Output: 我们来数一数单词“strawberry”中有几个“r”:“s”、“t”、“r”、“a”、“w”、“b”、“e”、“r”、“r”、“y”。这里一共有两个“r”,因此答案是二。 439 | Reward: -9.2890625 440 | 441 | Output: 单词“strawberry”中有2个“r” 442 | Reward: -11.921875 443 | ``` 444 | 445 | ## 主观问答 446 | ```python 447 | from src.polar import RewardModelClient 448 | # from xtuner.utils import RewardModelClient 449 | 450 | prompt = "帮我想3个形容雨很大的成语,要求不能重复。" 451 | reference = "1. 倾盆大雨 2. 暴雨如注 3. 瓢泼大雨" 452 | outputs = [ 453 | # 与参考相同 454 | "1. 倾盆大雨 2. 暴雨如注 3. 瓢泼大雨", 455 | # 正确回答 456 | "1. 大雨滂沱 2. 狂风骤雨 3. 大雨如注", 457 | # 非成语 458 | "1. 急雨如瀑 2. 豪雨倾天 3. 雨势磅礴", 459 | # 与参考类似,多一个。 460 | "1. 倾盆大雨 2. 暴雨如注 3. 瓢泼大雨 4. 大雨滂沱", 461 | # 与参考类似,重复一个。 462 | "1. 倾盆大雨 2. 暴雨如注 3. 暴雨如注", 463 | # 与参考类似,少一个。 464 | "1. 倾盆大雨 2. 暴雨如注", 465 | # 成语正确,多一个。 466 | "1. 大雨滂沱 2. 狂风骤雨 3. 大雨如注 4. 倾盆大雨", 467 | # 成语正确,重复一个 468 | "1. 大雨滂沱 2. 狂风骤雨 3. 狂风骤雨", 469 | # 成语正确,少一个 470 | "1. 大雨滂沱 2. 狂风骤雨" 471 | ] 472 | data = [{"prompt": prompt, "reference": reference, "output": output} for output in outputs] 473 | 474 | client = RewardModelClient("internlm/POLAR-7B", server_type="sglang", server_address="127.0.0.1:30000") 475 | rewards = client(data) 476 | 477 | sorted_res = sorted(zip(outputs, rewards), key=lambda x: x[1], reverse=True) 478 | 479 | for output, reward in sorted_res: 480 | print(f"Output: {output}\nReward: {reward}\n") 481 | ``` 482 | 483 | ```txt 484 | Output: 1. 倾盆大雨 2. 暴雨如注 3. 瓢泼大雨 485 | Reward: -1.42578125 486 | 487 | Output: 1. 大雨滂沱 2. 狂风骤雨 3. 
大雨如注 488 | Reward: -5.234375 489 | 490 | Output: 1. 倾盆大雨 2. 暴雨如注 3. 瓢泼大雨 4. 大雨滂沱 491 | Reward: -5.62890625 492 | 493 | Output: 1. 急雨如瀑 2. 豪雨倾天 3. 雨势磅礴 494 | Reward: -5.7109375 495 | 496 | Output: 1. 倾盆大雨 2. 暴雨如注 497 | Reward: -6.61328125 498 | 499 | Output: 1. 倾盆大雨 2. 暴雨如注 3. 暴雨如注 500 | Reward: -6.65234375 501 | 502 | Output: 1. 大雨滂沱 2. 狂风骤雨 503 | Reward: -6.828125 504 | 505 | Output: 1. 大雨滂沱 2. 狂风骤雨 3. 大雨如注 4. 倾盆大雨 506 | Reward: -7.0234375 507 | 508 | Output: 1. 大雨滂沱 2. 狂风骤雨 3. 狂风骤雨 509 | Reward: -7.23046875 510 | ``` 511 | 512 | # 许可证 513 | 514 | 代码和模型权重均采用 Apache-2.0 许可证。 515 | 516 | # 引用 517 | 518 | ``` 519 | @article{dou2025pretrained, 520 | title={Pre-Trained Policy Discriminators are General Reward Models}, 521 | author={Dou, Shihan and Liu, Shichun and Yang, Yuming and Zou, Yicheng and Zhou, Yunhua and Xing, Shuhao and Huang, Chenhao and Ge, Qiming and Song, Demin and Lv, Haijun and others}, 522 | journal={arXiv preprint arXiv:2507.05197}, 523 | year={2025} 524 | } 525 | ``` 526 | -------------------------------------------------------------------------------- /assets/intro.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/InternLM/POLAR/80626f39de581ac56d7bf1ca36a5e4f83d42d5c5/assets/intro.jpeg -------------------------------------------------------------------------------- /assets/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/InternLM/POLAR/80626f39de581ac56d7bf1ca36a5e4f83d42d5c5/assets/logo.png -------------------------------------------------------------------------------- /assets/result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/InternLM/POLAR/80626f39de581ac56d7bf1ca36a5e4f83d42d5c5/assets/result.png -------------------------------------------------------------------------------- /examples/data_preprocess/am_general.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 POLAR Team and/or its affiliates 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
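# This script converts the AM-DeepSeek-R1-0528-Distilled conversations into verl's
# parquet format: every turn except the last becomes the "prompt", and the final
# assistant reply is stored as the POLAR reference in "reward_model.ground_truth".
# Samples that fail to parse are skipped and counted in `filtered_num`.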
14 | 15 | import argparse 16 | import os 17 | import json 18 | 19 | from datasets import Dataset 20 | from huggingface_hub import snapshot_download 21 | 22 | filtered_num = 0 23 | data_source = "a-m-team/AM-DeepSeek-R1-0528-Distilled" 24 | 25 | 26 | def read_dataset(local_dir): 27 | global filtered_num 28 | idx = 0 29 | for file in os.listdir(local_dir): 30 | if file.endswith(".jsonl"): 31 | with open(os.path.join(local_dir, file), "r", encoding="utf-8") as f: 32 | for line in f: 33 | example = json.loads(line) 34 | try: 35 | conversations = example.pop("conversations") 36 | 37 | dialogs = [] 38 | for item in conversations: 39 | if item["from"] == "human": 40 | dialogs.append({"role": "user", "content": item["value"]}) 41 | else: 42 | if "info" in item: 43 | dialogs.append({"role": "assistant", "content": item["info"]["answer_content"]}) 44 | else: 45 | content = item["value"].split("")[1].split("")[0].strip() 46 | dialogs.append({"role": "assistant", "content": content}) 47 | 48 | assert dialogs[-1]["role"] == "assistant" 49 | data = { 50 | "data_source": data_source, 51 | "prompt": dialogs[:-1], 52 | "ability": "general", 53 | "reward_model": { 54 | "style": "polar", 55 | "ground_truth": dialogs[-1:], 56 | }, 57 | "extra_info": { 58 | "split": "train", 59 | "index": idx, 60 | "ability": "general", 61 | "prompt": dialogs[:-1], 62 | }, 63 | } 64 | yield data 65 | idx += 1 66 | except Exception as e: 67 | print(f"Error processing example {idx}: {e}") 68 | filtered_num += 1 69 | 70 | 71 | def generate_dataset(local_dir="~/data/general"): 72 | 73 | data_dir = snapshot_download( 74 | repo_id="a-m-team/AM-DeepSeek-R1-0528-Distilled", 75 | repo_type="dataset", 76 | revision="main", 77 | local_dir="~/data/AM-DeepSeek-R1-0528-Distilled", 78 | local_dir_use_symlinks=False, 79 | allow_patterns=["*.jsonl"], 80 | ignore_patterns=["*.png"] 81 | ) 82 | 83 | final_dataset = Dataset.from_generator(lambda: read_dataset(data_dir)) 84 | local_dir = os.path.expanduser(local_dir) 85 | local_path = os.path.join(local_dir, "train.parquet") 86 | final_dataset.shuffle(seed=42).to_parquet(local_path) 87 | print(f"Filtered {filtered_num} examples due to errors.") 88 | 89 | 90 | if __name__ == "__main__": 91 | parser = argparse.ArgumentParser() 92 | parser.add_argument("--local_dir", type=str, default="~/data/general") 93 | args = parser.parse_args() 94 | 95 | generate_dataset(args.local_dir) 96 | -------------------------------------------------------------------------------- /examples/data_preprocess/full_hh_rlhf.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 POLAR Team and/or its affiliates 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
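# This script converts the Anthropic/hh-rlhf "chosen" transcripts into verl's parquet
# format: each transcript is split into role-tagged turns, every turn except the last
# becomes the "prompt", and the final assistant reply is stored as the POLAR reference
# in "reward_model.ground_truth". The "rejected" field is discarded.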
14 | 15 | import argparse 16 | import os 17 | import re 18 | 19 | from datasets import load_dataset 20 | 21 | 22 | def parse_dialogue(text: str): 23 | # Fixed pattern to correctly handle content with newlines 24 | # Uses \Z instead of $ for proper end-of-string matching 25 | pattern = r'^(Human|Assistant):\s*(.*?)(?=\n\n(?:Human|Assistant):|\Z)' 26 | matches = re.finditer(pattern, text, flags=re.MULTILINE | re.DOTALL) 27 | return [{"role": m.group(1).lower(), "content": m.group(2).strip()} for m in matches] 28 | 29 | 30 | def generate_dataset(local_dir="~/data/full_hh_rlhf"): 31 | dataset = load_dataset("Anthropic/hh-rlhf") 32 | train_dataset = dataset["train"] 33 | 34 | data_source = "Anthropic/hh-rlhf" 35 | 36 | # add a row to each data item that represents a unique id 37 | def make_map_fn(split): 38 | def process_fn(example, idx): 39 | chosen = example.pop("chosen") 40 | example.pop("rejected") 41 | 42 | dialogs = parse_dialogue(chosen) 43 | dialogs = [{"role": "user", "content": d["content"]} if d["role"] == "human" else d for d in dialogs] 44 | 45 | data = { 46 | "data_source": data_source, 47 | "prompt": dialogs[:-1], 48 | "ability": "alignment", 49 | "reward_model": { 50 | "style": "polar", 51 | "ground_truth": dialogs[-1:], 52 | }, 53 | "extra_info": { 54 | "split": split, 55 | "index": idx, 56 | "ability": "alignment", 57 | "prompt": dialogs[:-1], 58 | }, 59 | } 60 | return data 61 | 62 | return process_fn 63 | 64 | train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True) 65 | local_dir = os.path.expanduser(local_dir) 66 | local_path = os.path.join(local_dir, "train.parquet") 67 | train_dataset.to_parquet(local_path) 68 | 69 | 70 | if __name__ == "__main__": 71 | parser = argparse.ArgumentParser() 72 | parser.add_argument("--local_dir", type=str, default="~/data/full_hh_rlhf") 73 | args = parser.parse_args() 74 | 75 | generate_dataset(args.local_dir) 76 | -------------------------------------------------------------------------------- /examples/data_preprocess/math.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 POLAR Team and/or its affiliates 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ 15 | Preprocess the MATH-lighteval dataset to parquet format. 16 | Ground truths in the train split are kept as raw trajectories. 
17 | Ground truths in the test split are extracted from \\boxed{} 18 | """ 19 | 20 | import argparse 21 | import os 22 | 23 | import datasets 24 | 25 | 26 | def remove_boxed(s): 27 | if "\\boxed " in s: 28 | left = "\\boxed " 29 | assert s[: len(left)] == left 30 | return s[len(left):] 31 | 32 | left = "\\boxed{" 33 | 34 | assert s[: len(left)] == left 35 | assert s[-1] == "}" 36 | 37 | return s[len(left): -1] 38 | 39 | 40 | def last_boxed_only_string(string): 41 | idx = string.rfind("\\boxed") 42 | if "\\boxed " in string: 43 | return "\\boxed " + string.split("\\boxed ")[-1].split("$")[0] 44 | if idx < 0: 45 | idx = string.rfind("\\fbox") 46 | if idx < 0: 47 | return None 48 | 49 | i = idx 50 | right_brace_idx = None 51 | num_left_braces_open = 0 52 | while i < len(string): 53 | if string[i] == "{": 54 | num_left_braces_open += 1 55 | if string[i] == "}": 56 | num_left_braces_open -= 1 57 | if num_left_braces_open == 0: 58 | right_brace_idx = i 59 | break 60 | i += 1 61 | 62 | retval = None if right_brace_idx is None else string[idx: right_brace_idx + 1] 63 | 64 | return retval 65 | 66 | 67 | def extract_solution(solution_str): 68 | return remove_boxed(last_boxed_only_string(solution_str)) 69 | 70 | 71 | if __name__ == "__main__": 72 | parser = argparse.ArgumentParser() 73 | parser.add_argument("--local_dir", default="~/data/math") 74 | 75 | args = parser.parse_args() 76 | 77 | # 'lighteval/MATH' is no longer available on huggingface. 78 | # Use mirror repo: DigitalLearningGmbH/MATH-lighteval 79 | data_source = "DigitalLearningGmbH/MATH-lighteval" 80 | print(f"Loading the {data_source} dataset from huggingface...", flush=True) 81 | dataset = datasets.load_dataset(data_source, trust_remote_code=True) 82 | 83 | train_dataset = dataset["train"] 84 | test_dataset = dataset["test"] 85 | 86 | instruction_following = "Let's think step by step and output the final answer within \\boxed{}." 
87 | 88 | # add a row to each data item that represents a unique id 89 | def make_map_fn(split): 90 | def process_fn(example, idx): 91 | example.pop("level") 92 | example.pop("type") 93 | question = example.pop("problem") 94 | 95 | question = question + " " + instruction_following 96 | 97 | answer = example.pop("solution") 98 | if split == "train": 99 | solution = answer 100 | else: 101 | solution = extract_solution(answer) 102 | 103 | data = { 104 | "data_source": data_source, 105 | "prompt": [{"role": "user", "content": question}], 106 | "ability": "math", 107 | "reward_model": {"style": "polar", "ground_truth": solution}, 108 | "extra_info": {"split": split, 109 | "index": idx, 110 | "ability": "math", 111 | "prompt": [{"role": "user", "content": question}] 112 | }, 113 | } 114 | return data 115 | 116 | return process_fn 117 | 118 | train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True) 119 | test_dataset = test_dataset.map(function=make_map_fn("test"), with_indices=True) 120 | 121 | local_dir = args.local_dir 122 | 123 | train_dataset.to_parquet(os.path.join(local_dir, "train.parquet")) 124 | test_dataset.to_parquet(os.path.join(local_dir, "test.parquet")) 125 | -------------------------------------------------------------------------------- /examples/ppo/llama3-8b_general.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Verl PPO training script for llama3.1-8B 3 | set -x 4 | 5 | # Parameters from original script 6 | nodes=1 7 | train_batch_size=1024 8 | actor_lr=1e-6 9 | critic_lr=1e-5 10 | data_name=General 11 | policy_model_name=LLaMa3.1-8B-Instruct 12 | reward_model_name=POLAR-7B 13 | 14 | # Model paths 15 | actor_path=meta-llama/Llama-3.1-8B-Instruct 16 | critic_path=meta-llama/Llama-3.1-8B-Instruct 17 | 18 | # Data paths 19 | train_data_path=$HOME/data/general/train.parquet 20 | test_data_path=$HOME/data/general/train.parquet # no use 21 | 22 | # Reward Configuration 23 | reward_func_path="../src/polar/reward_func.py" 24 | 25 | # Experiment name 26 | name="verl_ppo_policy_${policy_model_name}_reward_${reward_model_name}_data_${data_name}" 27 | output_dir="../outputs/${name}" 28 | 29 | # Create output directory if it doesn't exist 30 | mkdir -p $output_dir 31 | 32 | # Set wandb to offline mode to prevent online sync 33 | # export WANDB_MODE=offline 34 | export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:False 35 | 36 | TARGET_FILE="$output_dir/addr_${name}.txt" 37 | RANK=${RANK:-${NODE_RANK:-0}} 38 | MASTER_PORT=6379 39 | MASTER_ADDR=${MASTER_ADDR} 40 | echo "MASTER_ADDR: $MASTER_ADDR" 41 | echo "Rank $RANK is running on $MASTER_ADDR" 42 | 43 | if [ "$RANK" -eq 0 ]; then 44 | echo "Starting head node (RANK=${RANK}) on port $MASTER_PORT..." 45 | 46 | MASTER_ADDR=${MASTER_ADDR} 47 | echo "$MASTER_ADDR" > "$TARGET_FILE" 48 | 49 | ray start --head --num-gpus 8 --dashboard-host=0.0.0.0 --dashboard-port=8265 --disable-usage-stats --block & 50 | sleep 30 51 | 52 | echo "Executing main program on head node..." 
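# POLAR is wired in as a batch-level custom reward function: the built-in
# reward-model worker stays disabled (reward_model.enable=False), and scoring is
# delegated to compute_score_batch in ../src/polar/reward_func.py through
# reward_model.reward_manager=batch and the custom_reward_function.* settings.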
53 | 54 | python3 -m verl.trainer.main_ppo \ 55 | algorithm.adv_estimator=gae \ 56 | algorithm.gamma=1.0 \ 57 | algorithm.lam=1.0 \ 58 | algorithm.use_kl_in_reward=False \ 59 | algorithm.kl_ctrl.kl_coef=0 \ 60 | algorithm.kl_ctrl.type='adaptive' \ 61 | \ 62 | data.train_files="$train_data_path" \ 63 | data.val_files="$test_data_path" \ 64 | data.train_batch_size=$train_batch_size \ 65 | data.max_prompt_length=1024 \ 66 | data.max_response_length=1024 \ 67 | data.filter_overlong_prompts=True \ 68 | data.truncation='error' \ 69 | data.prompt_key='prompt' \ 70 | \ 71 | actor_rollout_ref.model.path="$actor_path" \ 72 | actor_rollout_ref.model.enable_gradient_checkpointing=True \ 73 | actor_rollout_ref.model.use_remove_padding=True \ 74 | actor_rollout_ref.model.use_shm=False \ 75 | \ 76 | actor_rollout_ref.actor.optim.lr=$actor_lr \ 77 | actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.03 \ 78 | actor_rollout_ref.actor.ppo_mini_batch_size=$train_batch_size \ 79 | actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \ 80 | actor_rollout_ref.actor.clip_ratio=0.2 \ 81 | actor_rollout_ref.actor.use_kl_loss=False \ 82 | \ 83 | actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ 84 | actor_rollout_ref.rollout.n=1 \ 85 | actor_rollout_ref.rollout.name=vllm \ 86 | actor_rollout_ref.rollout.gpu_memory_utilization=0.7 \ 87 | actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \ 88 | \ 89 | critic.model.path="$critic_path" \ 90 | critic.model.enable_gradient_checkpointing=True \ 91 | critic.model.use_remove_padding=True \ 92 | critic.model.fsdp_config.param_offload=False \ 93 | critic.model.fsdp_config.optimizer_offload=False \ 94 | critic.optim.lr=$critic_lr \ 95 | critic.optim.lr_warmup_steps_ratio=0 \ 96 | critic.optim.warmup_style=cosine \ 97 | critic.optim.min_lr_ratio=0.1 \ 98 | critic.use_dynamic_bsz=False \ 99 | critic.ppo_micro_batch_size_per_gpu=2 \ 100 | \ 101 | reward_model.enable=False \ 102 | reward_model.reward_manager=batch \ 103 | custom_reward_function.path=$reward_func_path \ 104 | custom_reward_function.name=compute_score_batch \ 105 | \ 106 | trainer.n_gpus_per_node=8 \ 107 | trainer.nnodes=$nodes \ 108 | trainer.critic_warmup=0 \ 109 | trainer.logger='["console","wandb"]' \ 110 | trainer.project_name='verl_ppo_general' \ 111 | trainer.val_before_train=False \ 112 | trainer.experiment_name="$name" \ 113 | trainer.save_freq=100 \ 114 | trainer.total_epochs=1 \ 115 | trainer.default_local_dir=$output_dir \ 116 | \ 117 | trainer.rollout_data_dir="${output_dir}/trajectory_data/rollout" \ 118 | $@ 119 | 120 | else 121 | sleep 10 122 | MASTER_ADDR=$(cat "$TARGET_FILE") 123 | 124 | echo "Starting worker node (RANK=${RANK}), connecting to ${MASTER_ADDR}:${MASTER_PORT}..." 125 | ray start --address ${MASTER_ADDR}:${MASTER_PORT} --num-gpus 8 --block & 126 | 127 | sleep 60 128 | while true; do 129 | status=$(ray status 2>&1) 130 | 131 | if echo "$status" | grep -q "Active:"; then 132 | echo "Active nodes found. Sleeping for 10 min..." 133 | sleep 600 134 | else 135 | echo "No active nodes found. Exiting..." 
136 | exit 0 137 | fi 138 | done 139 | 140 | fi -------------------------------------------------------------------------------- /examples/ppo/llama3-8b_hh-rlhf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Verl PPO training script for llama3.1-8B 3 | set -x 4 | 5 | # Parameters from original script 6 | nodes=1 7 | train_batch_size=512 8 | actor_lr=1e-6 9 | critic_lr=1e-5 10 | data_name=HH-RLHF 11 | policy_model_name=LLaMa3.1-8B-Instruct 12 | reward_model_name=POLAR-7B 13 | 14 | # Model paths 15 | actor_path=meta-llama/Llama-3.1-8B-Instruct 16 | critic_path=meta-llama/Llama-3.1-8B-Instruct 17 | 18 | # Data paths 19 | train_data_path=$HOME/data/full_hh_rlhf/train.parquet 20 | test_data_path=$HOME/data/full_hh_rlhf/train.parquet # no use 21 | 22 | # Reward Configuration 23 | reward_func_path="../src/polar/reward_func.py" 24 | 25 | # Experiment name 26 | name="verl_ppo_policy_${policy_model_name}_reward_${reward_model_name}_data_${data_name}" 27 | output_dir="../outputs/${name}" 28 | 29 | # Create output directory if it doesn't exist 30 | mkdir -p $output_dir 31 | 32 | # Set wandb to offline mode to prevent online sync 33 | # export WANDB_MODE=offline 34 | export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:False 35 | 36 | TARGET_FILE="$output_dir/addr_${name}.txt" 37 | RANK=${RANK:-${NODE_RANK:-0}} 38 | MASTER_PORT=6379 39 | MASTER_ADDR=${MASTER_ADDR} 40 | echo "MASTER_ADDR: $MASTER_ADDR" 41 | echo "Rank $RANK is running on $MASTER_ADDR" 42 | 43 | if [ "$RANK" -eq 0 ]; then 44 | echo "Starting head node (RANK=${RANK}) on port $MASTER_PORT..." 45 | 46 | MASTER_ADDR=${MASTER_ADDR} 47 | echo "$MASTER_ADDR" > "$TARGET_FILE" 48 | 49 | ray start --head --num-gpus 8 --dashboard-host=0.0.0.0 --dashboard-port=8265 --disable-usage-stats --block & 50 | sleep 30 51 | 52 | echo "Executing main program on head node..." 
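# Same launch logic as the general-domain script, with HH-RLHF-specific budgets:
# shorter prompt/response lengths (max_prompt_length=128, max_response_length=512)
# and 5 training epochs instead of 1.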
53 | 54 | python3 -m verl.trainer.main_ppo \ 55 | algorithm.adv_estimator=gae \ 56 | algorithm.gamma=1.0 \ 57 | algorithm.lam=1.0 \ 58 | algorithm.use_kl_in_reward=False \ 59 | algorithm.kl_ctrl.kl_coef=0 \ 60 | algorithm.kl_ctrl.type='adaptive' \ 61 | \ 62 | data.train_files="$train_data_path" \ 63 | data.val_files="$test_data_path" \ 64 | data.train_batch_size=$train_batch_size \ 65 | data.max_prompt_length=128 \ 66 | data.max_response_length=512 \ 67 | data.filter_overlong_prompts=True \ 68 | data.truncation='error' \ 69 | data.prompt_key='prompt' \ 70 | \ 71 | actor_rollout_ref.model.path="$actor_path" \ 72 | actor_rollout_ref.model.enable_gradient_checkpointing=True \ 73 | actor_rollout_ref.model.use_remove_padding=True \ 74 | actor_rollout_ref.model.use_shm=False \ 75 | \ 76 | actor_rollout_ref.actor.optim.lr=$actor_lr \ 77 | actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.03 \ 78 | actor_rollout_ref.actor.ppo_mini_batch_size=$train_batch_size \ 79 | actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \ 80 | actor_rollout_ref.actor.clip_ratio=0.2 \ 81 | actor_rollout_ref.actor.use_kl_loss=False \ 82 | \ 83 | actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ 84 | actor_rollout_ref.rollout.n=1 \ 85 | actor_rollout_ref.rollout.name=vllm \ 86 | actor_rollout_ref.rollout.gpu_memory_utilization=0.7 \ 87 | actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \ 88 | \ 89 | critic.model.path="$critic_path" \ 90 | critic.model.enable_gradient_checkpointing=True \ 91 | critic.model.use_remove_padding=True \ 92 | critic.model.fsdp_config.param_offload=False \ 93 | critic.model.fsdp_config.optimizer_offload=False \ 94 | critic.optim.lr=$critic_lr \ 95 | critic.optim.lr_warmup_steps_ratio=0 \ 96 | critic.optim.warmup_style=cosine \ 97 | critic.optim.min_lr_ratio=0.1 \ 98 | critic.use_dynamic_bsz=False \ 99 | critic.ppo_micro_batch_size_per_gpu=2 \ 100 | \ 101 | reward_model.enable=False \ 102 | reward_model.reward_manager=batch \ 103 | custom_reward_function.path=$reward_func_path \ 104 | custom_reward_function.name=compute_score_batch \ 105 | \ 106 | trainer.n_gpus_per_node=8 \ 107 | trainer.nnodes=$nodes \ 108 | trainer.critic_warmup=0 \ 109 | trainer.logger='["console","wandb"]' \ 110 | trainer.project_name='verl_ppo_hh-rlhf' \ 111 | trainer.val_before_train=False \ 112 | trainer.experiment_name="$name" \ 113 | trainer.save_freq=100 \ 114 | trainer.total_epochs=5 \ 115 | trainer.default_local_dir=$output_dir \ 116 | \ 117 | trainer.rollout_data_dir="${output_dir}/trajectory_data/rollout" \ 118 | $@ 119 | 120 | else 121 | sleep 10 122 | MASTER_ADDR=$(cat "$TARGET_FILE") 123 | 124 | echo "Starting worker node (RANK=${RANK}), connecting to ${MASTER_ADDR}:${MASTER_PORT}..." 125 | ray start --address ${MASTER_ADDR}:${MASTER_PORT} --num-gpus 8 --block & 126 | 127 | sleep 60 128 | while true; do 129 | status=$(ray status 2>&1) 130 | 131 | if echo "$status" | grep -q "Active:"; then 132 | echo "Active nodes found. Sleeping for 10 min..." 133 | sleep 600 134 | else 135 | echo "No active nodes found. Exiting..." 
136 | exit 0 137 | fi 138 | done 139 | 140 | fi -------------------------------------------------------------------------------- /examples/ppo/llama3-8b_math.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Verl PPO training script for llama3.1-8B 3 | set -x 4 | 5 | # Parameters from original script 6 | nodes=1 7 | train_batch_size=1024 8 | actor_lr=1e-6 9 | critic_lr=1e-5 10 | data_name=MATH 11 | policy_model_name=LLaMa3.1-8B-Instruct 12 | reward_model_name=POLAR-7B 13 | 14 | # Model paths 15 | actor_path=meta-llama/Llama-3.1-8B-Instruct 16 | critic_path=meta-llama/Llama-3.1-8B-Instruct 17 | 18 | # Data paths 19 | train_data_path=$HOME/data/math/train.parquet 20 | test_data_path=$HOME/data/math/test.parquet 21 | 22 | # Reward Configuration 23 | reward_func_path="../src/polar/reward_func.py" 24 | 25 | # Experiment name 26 | name="verl_ppo_policy_${policy_model_name}_reward_${reward_model_name}_data_${data_name}" 27 | output_dir="../outputs/${name}" 28 | 29 | # Create output directory if it doesn't exist 30 | mkdir -p $output_dir 31 | 32 | # Set wandb to offline mode to prevent online sync 33 | # export WANDB_MODE=offline 34 | export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:False 35 | 36 | TARGET_FILE="$output_dir/addr_${name}.txt" 37 | RANK=${RANK:-${NODE_RANK:-0}} 38 | MASTER_PORT=6379 39 | MASTER_ADDR=${MASTER_ADDR} 40 | echo "MASTER_ADDR: $MASTER_ADDR" 41 | echo "Rank $RANK is running on $MASTER_ADDR" 42 | 43 | if [ "$RANK" -eq 0 ]; then 44 | echo "Starting head node (RANK=${RANK}) on port $MASTER_PORT..." 45 | 46 | MASTER_ADDR=${MASTER_ADDR} 47 | echo "$MASTER_ADDR" > "$TARGET_FILE" 48 | 49 | ray start --head --num-gpus 8 --dashboard-host=0.0.0.0 --dashboard-port=8265 --disable-usage-stats --block & 50 | sleep 30 51 | 52 | echo "Executing main program on head node..." 
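# Unlike the general and HH-RLHF recipes, the MATH run has a held-out test split,
# so validation is enabled (val_before_train=True, test_freq=5) and training is
# allowed to run for up to 100 epochs.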
53 | 54 | python3 -m verl.trainer.main_ppo \ 55 | algorithm.adv_estimator=gae \ 56 | algorithm.gamma=1.0 \ 57 | algorithm.lam=1.0 \ 58 | algorithm.use_kl_in_reward=False \ 59 | algorithm.kl_ctrl.kl_coef=0 \ 60 | algorithm.kl_ctrl.type='adaptive' \ 61 | \ 62 | data.train_files="$train_data_path" \ 63 | data.val_files="$test_data_path" \ 64 | data.train_batch_size=$train_batch_size \ 65 | data.max_prompt_length=1024 \ 66 | data.max_response_length=1024 \ 67 | data.filter_overlong_prompts=True \ 68 | data.truncation='error' \ 69 | data.prompt_key='prompt' \ 70 | \ 71 | actor_rollout_ref.model.path="$actor_path" \ 72 | actor_rollout_ref.model.enable_gradient_checkpointing=True \ 73 | actor_rollout_ref.model.use_remove_padding=True \ 74 | actor_rollout_ref.model.use_shm=False \ 75 | \ 76 | actor_rollout_ref.actor.optim.lr=$actor_lr \ 77 | actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.03 \ 78 | actor_rollout_ref.actor.ppo_mini_batch_size=$train_batch_size \ 79 | actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \ 80 | actor_rollout_ref.actor.clip_ratio=0.2 \ 81 | actor_rollout_ref.actor.use_kl_loss=False \ 82 | \ 83 | actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ 84 | actor_rollout_ref.rollout.n=1 \ 85 | actor_rollout_ref.rollout.name=vllm \ 86 | actor_rollout_ref.rollout.gpu_memory_utilization=0.7 \ 87 | actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \ 88 | \ 89 | critic.model.path="$critic_path" \ 90 | critic.model.enable_gradient_checkpointing=True \ 91 | critic.model.use_remove_padding=True \ 92 | critic.model.fsdp_config.param_offload=False \ 93 | critic.model.fsdp_config.optimizer_offload=False \ 94 | critic.optim.lr=$critic_lr \ 95 | critic.optim.lr_warmup_steps_ratio=0 \ 96 | critic.optim.warmup_style=cosine \ 97 | critic.optim.min_lr_ratio=0.1 \ 98 | critic.use_dynamic_bsz=False \ 99 | critic.ppo_micro_batch_size_per_gpu=2 \ 100 | \ 101 | reward_model.enable=False \ 102 | reward_model.reward_manager=batch \ 103 | custom_reward_function.path=$reward_func_path \ 104 | custom_reward_function.name=compute_score_batch \ 105 | \ 106 | trainer.n_gpus_per_node=8 \ 107 | trainer.nnodes=$nodes \ 108 | trainer.critic_warmup=0 \ 109 | trainer.logger='["console","wandb"]' \ 110 | trainer.project_name='verl_ppo_math' \ 111 | trainer.val_before_train=True \ 112 | trainer.experiment_name="$name" \ 113 | trainer.save_freq=100 \ 114 | trainer.test_freq=5 \ 115 | trainer.total_epochs=100 \ 116 | trainer.default_local_dir=$output_dir \ 117 | \ 118 | trainer.rollout_data_dir="${output_dir}/trajectory_data/rollout" \ 119 | $@ 120 | 121 | else 122 | sleep 10 123 | MASTER_ADDR=$(cat "$TARGET_FILE") 124 | 125 | echo "Starting worker node (RANK=${RANK}), connecting to ${MASTER_ADDR}:${MASTER_PORT}..." 126 | ray start --address ${MASTER_ADDR}:${MASTER_PORT} --num-gpus 8 --block & 127 | 128 | sleep 60 129 | while true; do 130 | status=$(ray status 2>&1) 131 | 132 | if echo "$status" | grep -q "Active:"; then 133 | echo "Active nodes found. Sleeping for 10 min..." 134 | sleep 600 135 | else 136 | echo "No active nodes found. Exiting..." 
137 | exit 0 138 | fi 139 | done 140 | 141 | fi -------------------------------------------------------------------------------- /examples/ppo/qwen2_5-7b_general.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Verl PPO training script for Qwen2.5-7B 3 | set -x 4 | 5 | # Parameters from original script 6 | nodes=1 7 | train_batch_size=1024 8 | actor_lr=1e-6 9 | critic_lr=1e-5 10 | data_name=General 11 | policy_model_name=Qwen2.5-7B-Instruct 12 | reward_model_name=POLAR-7B 13 | 14 | # Model paths 15 | actor_path=Qwen/Qwen2.5-7B-Instruct 16 | critic_path=Qwen/Qwen2.5-7B-Instruct 17 | 18 | # Data paths 19 | train_data_path=$HOME/data/general/train.parquet 20 | test_data_path=$HOME/data/general/train.parquet # no use 21 | 22 | # Reward Configuration 23 | reward_func_path="../src/polar/reward_func.py" 24 | 25 | # Experiment name 26 | name="verl_ppo_policy_${policy_model_name}_reward_${reward_model_name}_data_${data_name}" 27 | output_dir="../outputs/${name}" 28 | 29 | # Create output directory if it doesn't exist 30 | mkdir -p $output_dir 31 | 32 | # Set wandb to offline mode to prevent online sync 33 | # export WANDB_MODE=offline 34 | export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:False 35 | 36 | TARGET_FILE="$output_dir/addr_${name}.txt" 37 | RANK=${RANK:-${NODE_RANK:-0}} 38 | MASTER_PORT=6379 39 | MASTER_ADDR=${MASTER_ADDR} 40 | echo "MASTER_ADDR: $MASTER_ADDR" 41 | echo "Rank $RANK is running on $MASTER_ADDR" 42 | 43 | if [ "$RANK" -eq 0 ]; then 44 | echo "Starting head node (RANK=${RANK}) on port $MASTER_PORT..." 45 | 46 | MASTER_ADDR=${MASTER_ADDR} 47 | echo "$MASTER_ADDR" > "$TARGET_FILE" 48 | 49 | ray start --head --num-gpus 8 --dashboard-host=0.0.0.0 --dashboard-port=8265 --disable-usage-stats --block & 50 | sleep 30 51 | 52 | echo "Executing main program on head node..." 
53 | 54 | python3 -m verl.trainer.main_ppo \ 55 | algorithm.adv_estimator=gae \ 56 | algorithm.gamma=1.0 \ 57 | algorithm.lam=1.0 \ 58 | algorithm.use_kl_in_reward=False \ 59 | algorithm.kl_ctrl.kl_coef=0 \ 60 | algorithm.kl_ctrl.type='adaptive' \ 61 | \ 62 | data.train_files="$train_data_path" \ 63 | data.val_files="$test_data_path" \ 64 | data.train_batch_size=$train_batch_size \ 65 | data.max_prompt_length=1024 \ 66 | data.max_response_length=1024 \ 67 | data.filter_overlong_prompts=True \ 68 | data.truncation='error' \ 69 | data.prompt_key='prompt' \ 70 | \ 71 | actor_rollout_ref.model.path="$actor_path" \ 72 | actor_rollout_ref.model.enable_gradient_checkpointing=True \ 73 | actor_rollout_ref.model.use_remove_padding=True \ 74 | actor_rollout_ref.model.use_shm=False \ 75 | \ 76 | actor_rollout_ref.actor.optim.lr=$actor_lr \ 77 | actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.03 \ 78 | actor_rollout_ref.actor.ppo_mini_batch_size=$train_batch_size \ 79 | actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \ 80 | actor_rollout_ref.actor.clip_ratio=0.2 \ 81 | actor_rollout_ref.actor.use_kl_loss=False \ 82 | \ 83 | actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ 84 | actor_rollout_ref.rollout.n=1 \ 85 | actor_rollout_ref.rollout.name=vllm \ 86 | actor_rollout_ref.rollout.gpu_memory_utilization=0.7 \ 87 | actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \ 88 | \ 89 | critic.model.path="$critic_path" \ 90 | critic.model.enable_gradient_checkpointing=True \ 91 | critic.model.use_remove_padding=True \ 92 | critic.model.fsdp_config.param_offload=False \ 93 | critic.model.fsdp_config.optimizer_offload=False \ 94 | critic.optim.lr=$critic_lr \ 95 | critic.optim.lr_warmup_steps_ratio=0 \ 96 | critic.optim.warmup_style=cosine \ 97 | critic.optim.min_lr_ratio=0.1 \ 98 | critic.use_dynamic_bsz=False \ 99 | critic.ppo_micro_batch_size_per_gpu=2 \ 100 | \ 101 | reward_model.enable=False \ 102 | reward_model.reward_manager=batch \ 103 | custom_reward_function.path=$reward_func_path \ 104 | custom_reward_function.name=compute_score_batch \ 105 | \ 106 | trainer.n_gpus_per_node=8 \ 107 | trainer.nnodes=$nodes \ 108 | trainer.critic_warmup=0 \ 109 | trainer.logger='["console","wandb"]' \ 110 | trainer.project_name='verl_ppo_general' \ 111 | trainer.val_before_train=False \ 112 | trainer.experiment_name="$name" \ 113 | trainer.save_freq=100 \ 114 | trainer.total_epochs=1 \ 115 | trainer.default_local_dir=$output_dir \ 116 | \ 117 | trainer.rollout_data_dir="${output_dir}/trajectory_data/rollout" \ 118 | $@ 119 | 120 | else 121 | sleep 10 122 | MASTER_ADDR=$(cat "$TARGET_FILE") 123 | 124 | echo "Starting worker node (RANK=${RANK}), connecting to ${MASTER_ADDR}:${MASTER_PORT}..." 125 | ray start --address ${MASTER_ADDR}:${MASTER_PORT} --num-gpus 8 --block & 126 | 127 | sleep 60 128 | while true; do 129 | status=$(ray status 2>&1) 130 | 131 | if echo "$status" | grep -q "Active:"; then 132 | echo "Active nodes found. Sleeping for 10 min..." 133 | sleep 600 134 | else 135 | echo "No active nodes found. Exiting..." 
136 | exit 0 137 | fi 138 | done 139 | 140 | fi -------------------------------------------------------------------------------- /examples/ppo/qwen2_5-7b_hh-rlhf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Verl PPO training script for Qwen2.5-7B 3 | set -x 4 | 5 | # Parameters from original script 6 | nodes=1 7 | train_batch_size=512 8 | actor_lr=1e-6 9 | critic_lr=1e-5 10 | data_name=HH-RLHF 11 | policy_model_name=Qwen2.5-7B-Instruct 12 | reward_model_name=POLAR-7B 13 | 14 | # Model paths 15 | actor_path=Qwen/Qwen2.5-7B-Instruct 16 | critic_path=Qwen/Qwen2.5-7B-Instruct 17 | 18 | # Data paths 19 | train_data_path=$HOME/data/full_hh_rlhf/train.parquet 20 | test_data_path=$HOME/data/full_hh_rlhf/train.parquet # no use 21 | 22 | # Reward Configuration 23 | reward_func_path="../src/polar/reward_func.py" 24 | 25 | # Experiment name 26 | name="verl_ppo_policy_${policy_model_name}_reward_${reward_model_name}_data_${data_name}" 27 | output_dir="../outputs/${name}" 28 | 29 | # Create output directory if it doesn't exist 30 | mkdir -p $output_dir 31 | 32 | # Set wandb to offline mode to prevent online sync 33 | # export WANDB_MODE=offline 34 | export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:False 35 | 36 | TARGET_FILE="$output_dir/addr_${name}.txt" 37 | RANK=${RANK:-${NODE_RANK:-0}} 38 | MASTER_PORT=6379 39 | MASTER_ADDR=${MASTER_ADDR} 40 | echo "MASTER_ADDR: $MASTER_ADDR" 41 | echo "Rank $RANK is running on $MASTER_ADDR" 42 | 43 | if [ "$RANK" -eq 0 ]; then 44 | echo "Starting head node (RANK=${RANK}) on port $MASTER_PORT..." 45 | 46 | MASTER_ADDR=${MASTER_ADDR} 47 | echo "$MASTER_ADDR" > "$TARGET_FILE" 48 | 49 | ray start --head --num-gpus 8 --dashboard-host=0.0.0.0 --dashboard-port=8265 --disable-usage-stats --block & 50 | sleep 30 51 | 52 | echo "Executing main program on head node..." 
53 | 54 | python3 -m verl.trainer.main_ppo \ 55 | algorithm.adv_estimator=gae \ 56 | algorithm.gamma=1.0 \ 57 | algorithm.lam=1.0 \ 58 | algorithm.use_kl_in_reward=False \ 59 | algorithm.kl_ctrl.kl_coef=0 \ 60 | algorithm.kl_ctrl.type='adaptive' \ 61 | \ 62 | data.train_files="$train_data_path" \ 63 | data.val_files="$test_data_path" \ 64 | data.train_batch_size=$train_batch_size \ 65 | data.max_prompt_length=128 \ 66 | data.max_response_length=512 \ 67 | data.filter_overlong_prompts=True \ 68 | data.truncation='error' \ 69 | data.prompt_key='prompt' \ 70 | \ 71 | actor_rollout_ref.model.path="$actor_path" \ 72 | actor_rollout_ref.model.enable_gradient_checkpointing=True \ 73 | actor_rollout_ref.model.use_remove_padding=True \ 74 | actor_rollout_ref.model.use_shm=False \ 75 | \ 76 | actor_rollout_ref.actor.optim.lr=$actor_lr \ 77 | actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.03 \ 78 | actor_rollout_ref.actor.ppo_mini_batch_size=$train_batch_size \ 79 | actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \ 80 | actor_rollout_ref.actor.clip_ratio=0.2 \ 81 | actor_rollout_ref.actor.use_kl_loss=False \ 82 | \ 83 | actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ 84 | actor_rollout_ref.rollout.n=1 \ 85 | actor_rollout_ref.rollout.name=vllm \ 86 | actor_rollout_ref.rollout.gpu_memory_utilization=0.7 \ 87 | actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \ 88 | \ 89 | critic.model.path="$critic_path" \ 90 | critic.model.enable_gradient_checkpointing=True \ 91 | critic.model.use_remove_padding=True \ 92 | critic.model.fsdp_config.param_offload=False \ 93 | critic.model.fsdp_config.optimizer_offload=False \ 94 | critic.optim.lr=$critic_lr \ 95 | critic.optim.lr_warmup_steps_ratio=0 \ 96 | critic.optim.warmup_style=cosine \ 97 | critic.optim.min_lr_ratio=0.1 \ 98 | critic.use_dynamic_bsz=False \ 99 | critic.ppo_micro_batch_size_per_gpu=2 \ 100 | \ 101 | reward_model.enable=False \ 102 | reward_model.reward_manager=batch \ 103 | custom_reward_function.path=$reward_func_path \ 104 | custom_reward_function.name=compute_score_batch \ 105 | \ 106 | trainer.n_gpus_per_node=8 \ 107 | trainer.nnodes=$nodes \ 108 | trainer.critic_warmup=0 \ 109 | trainer.logger='["console","wandb"]' \ 110 | trainer.project_name='verl_ppo_hh-rlhf' \ 111 | trainer.val_before_train=False \ 112 | trainer.experiment_name="$name" \ 113 | trainer.save_freq=100 \ 114 | trainer.total_epochs=5 \ 115 | trainer.default_local_dir=$output_dir \ 116 | \ 117 | trainer.rollout_data_dir="${output_dir}/trajectory_data/rollout" \ 118 | $@ 119 | 120 | else 121 | sleep 10 122 | MASTER_ADDR=$(cat "$TARGET_FILE") 123 | 124 | echo "Starting worker node (RANK=${RANK}), connecting to ${MASTER_ADDR}:${MASTER_PORT}..." 125 | ray start --address ${MASTER_ADDR}:${MASTER_PORT} --num-gpus 8 --block & 126 | 127 | sleep 60 128 | while true; do 129 | status=$(ray status 2>&1) 130 | 131 | if echo "$status" | grep -q "Active:"; then 132 | echo "Active nodes found. Sleeping for 10 min..." 133 | sleep 600 134 | else 135 | echo "No active nodes found. Exiting..." 
136 | exit 0 137 | fi 138 | done 139 | 140 | fi -------------------------------------------------------------------------------- /examples/ppo/qwen2_5-7b_math.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Verl PPO training script for Qwen2.5-7B 3 | set -x 4 | 5 | # Parameters from original script 6 | nodes=1 7 | train_batch_size=1024 8 | actor_lr=1e-6 9 | critic_lr=1e-5 10 | data_name=MATH 11 | policy_model_name=Qwen2.5-7B-Instruct 12 | reward_model_name=POLAR-7B 13 | 14 | # Model paths 15 | actor_path=Qwen/Qwen2.5-7B-Instruct 16 | critic_path=Qwen/Qwen2.5-7B-Instruct 17 | 18 | # Data paths 19 | train_data_path=$HOME/data/math/train.parquet 20 | test_data_path=$HOME/data/math/test.parquet 21 | 22 | # Reward Configuration 23 | reward_func_path="../src/polar/reward_func.py" 24 | 25 | # Experiment name 26 | name="verl_ppo_policy_${policy_model_name}_reward_${reward_model_name}_data_${data_name}" 27 | output_dir="../outputs/${name}" 28 | 29 | # Create output directory if it doesn't exist 30 | mkdir -p $output_dir 31 | 32 | # Set wandb to offline mode to prevent online sync 33 | # export WANDB_MODE=offline 34 | export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:False 35 | 36 | TARGET_FILE="$output_dir/addr_${name}.txt" 37 | RANK=${RANK:-${NODE_RANK:-0}} 38 | MASTER_PORT=6379 39 | MASTER_ADDR=${MASTER_ADDR} 40 | echo "MASTER_ADDR: $MASTER_ADDR" 41 | echo "Rank $RANK is running on $MASTER_ADDR" 42 | 43 | if [ "$RANK" -eq 0 ]; then 44 | echo "Starting head node (RANK=${RANK}) on port $MASTER_PORT..." 45 | 46 | MASTER_ADDR=${MASTER_ADDR} 47 | echo "$MASTER_ADDR" > "$TARGET_FILE" 48 | 49 | ray start --head --num-gpus 8 --dashboard-host=0.0.0.0 --dashboard-port=8265 --disable-usage-stats --block & 50 | sleep 30 51 | 52 | echo "Executing main program on head node..." 
53 | 54 | python3 -m verl.trainer.main_ppo \ 55 | algorithm.adv_estimator=gae \ 56 | algorithm.gamma=1.0 \ 57 | algorithm.lam=1.0 \ 58 | algorithm.use_kl_in_reward=False \ 59 | algorithm.kl_ctrl.kl_coef=0 \ 60 | algorithm.kl_ctrl.type='adaptive' \ 61 | \ 62 | data.train_files="$train_data_path" \ 63 | data.val_files="$test_data_path" \ 64 | data.train_batch_size=$train_batch_size \ 65 | data.max_prompt_length=1024 \ 66 | data.max_response_length=1024 \ 67 | data.filter_overlong_prompts=True \ 68 | data.truncation='error' \ 69 | data.prompt_key='prompt' \ 70 | \ 71 | actor_rollout_ref.model.path="$actor_path" \ 72 | actor_rollout_ref.model.enable_gradient_checkpointing=True \ 73 | actor_rollout_ref.model.use_remove_padding=True \ 74 | actor_rollout_ref.model.use_shm=False \ 75 | \ 76 | actor_rollout_ref.actor.optim.lr=$actor_lr \ 77 | actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.03 \ 78 | actor_rollout_ref.actor.ppo_mini_batch_size=$train_batch_size \ 79 | actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \ 80 | actor_rollout_ref.actor.clip_ratio=0.2 \ 81 | actor_rollout_ref.actor.use_kl_loss=False \ 82 | \ 83 | actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ 84 | actor_rollout_ref.rollout.n=1 \ 85 | actor_rollout_ref.rollout.name=vllm \ 86 | actor_rollout_ref.rollout.gpu_memory_utilization=0.7 \ 87 | actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \ 88 | \ 89 | critic.model.path="$critic_path" \ 90 | critic.model.enable_gradient_checkpointing=True \ 91 | critic.model.use_remove_padding=True \ 92 | critic.model.fsdp_config.param_offload=False \ 93 | critic.model.fsdp_config.optimizer_offload=False \ 94 | critic.optim.lr=$critic_lr \ 95 | critic.optim.lr_warmup_steps_ratio=0 \ 96 | critic.optim.warmup_style=cosine \ 97 | critic.optim.min_lr_ratio=0.1 \ 98 | critic.use_dynamic_bsz=False \ 99 | critic.ppo_micro_batch_size_per_gpu=2 \ 100 | \ 101 | reward_model.enable=False \ 102 | reward_model.reward_manager=batch \ 103 | custom_reward_function.path=$reward_func_path \ 104 | custom_reward_function.name=compute_score_batch \ 105 | \ 106 | trainer.n_gpus_per_node=8 \ 107 | trainer.nnodes=$nodes \ 108 | trainer.critic_warmup=0 \ 109 | trainer.logger='["console","wandb"]' \ 110 | trainer.project_name='verl_ppo_math' \ 111 | trainer.val_before_train=True \ 112 | trainer.experiment_name="$name" \ 113 | trainer.save_freq=100 \ 114 | trainer.test_freq=5 \ 115 | trainer.total_epochs=100 \ 116 | trainer.default_local_dir=$output_dir \ 117 | \ 118 | trainer.rollout_data_dir="${output_dir}/trajectory_data/rollout" \ 119 | $@ 120 | 121 | else 122 | sleep 10 123 | MASTER_ADDR=$(cat "$TARGET_FILE") 124 | 125 | echo "Starting worker node (RANK=${RANK}), connecting to ${MASTER_ADDR}:${MASTER_PORT}..." 126 | ray start --address ${MASTER_ADDR}:${MASTER_PORT} --num-gpus 8 --block & 127 | 128 | sleep 60 129 | while true; do 130 | status=$(ray status 2>&1) 131 | 132 | if echo "$status" | grep -q "Active:"; then 133 | echo "Active nodes found. Sleeping for 10 min..." 134 | sleep 600 135 | else 136 | echo "No active nodes found. Exiting..." 
137 | exit 0 138 | fi 139 | done 140 | 141 | fi -------------------------------------------------------------------------------- /examples/ppo/qwen3-8b_general.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Verl PPO training script for Qwen3-8B 3 | set -x 4 | 5 | # Parameters from original script 6 | nodes=4 7 | train_batch_size=1024 8 | actor_lr=1e-6 9 | critic_lr=1e-5 10 | data_name=General 11 | policy_model_name=Qwen3-8B-Instruct 12 | reward_model_name=POLAR-7B 13 | 14 | # Model paths 15 | actor_path=Qwen/Qwen3-8B 16 | critic_path=Qwen/Qwen3-8B 17 | 18 | # Data paths 19 | train_data_path=$HOME/data/general/train.parquet 20 | test_data_path=$HOME/data/general/train.parquet # no use 21 | 22 | # Reward Configuration 23 | reward_func_path="../src/polar/reward_func.py" 24 | 25 | # Experiment name 26 | name="verl_ppo_policy_${policy_model_name}_reward_${reward_model_name}_data_${data_name}" 27 | output_dir="../outputs/${name}" 28 | 29 | # Create output directory if it doesn't exist 30 | mkdir -p $output_dir 31 | 32 | # Set wandb to offline mode to prevent online sync 33 | # export WANDB_MODE=offline 34 | export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:False 35 | 36 | TARGET_FILE="$output_dir/addr_${name}.txt" 37 | RANK=${RANK:-${NODE_RANK:-0}} 38 | MASTER_PORT=6379 39 | MASTER_ADDR=${MASTER_ADDR} 40 | echo "MASTER_ADDR: $MASTER_ADDR" 41 | echo "Rank $RANK is running on $MASTER_ADDR" 42 | 43 | if [ "$RANK" -eq 0 ]; then 44 | echo "Starting head node (RANK=${RANK}) on port $MASTER_PORT..." 45 | 46 | MASTER_ADDR=${MASTER_ADDR} 47 | echo "$MASTER_ADDR" > "$TARGET_FILE" 48 | 49 | ray start --head --num-gpus 8 --dashboard-host=0.0.0.0 --dashboard-port=8265 --disable-usage-stats --block & 50 | sleep 30 51 | 52 | echo "Executing main program on head node..." 
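# Qwen3-8B is given a much larger response budget for its long reasoning outputs
# (max_response_length=15000, max_num_batched_tokens=16384); the job spans 4 nodes
# and uses a PPO micro-batch size of 1.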
53 | 54 | python3 -m verl.trainer.main_ppo \ 55 | algorithm.adv_estimator=gae \ 56 | algorithm.gamma=1.0 \ 57 | algorithm.lam=1.0 \ 58 | algorithm.use_kl_in_reward=False \ 59 | algorithm.kl_ctrl.kl_coef=0 \ 60 | algorithm.kl_ctrl.type='adaptive' \ 61 | \ 62 | data.train_files="$train_data_path" \ 63 | data.val_files="$test_data_path" \ 64 | data.train_batch_size=$train_batch_size \ 65 | data.max_prompt_length=1024 \ 66 | data.max_response_length=15000 \ 67 | data.filter_overlong_prompts=True \ 68 | data.truncation='error' \ 69 | data.prompt_key='prompt' \ 70 | \ 71 | actor_rollout_ref.model.path="$actor_path" \ 72 | actor_rollout_ref.model.enable_gradient_checkpointing=True \ 73 | actor_rollout_ref.model.use_remove_padding=True \ 74 | actor_rollout_ref.model.use_shm=False \ 75 | \ 76 | actor_rollout_ref.actor.optim.lr=$actor_lr \ 77 | actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.03 \ 78 | actor_rollout_ref.actor.ppo_mini_batch_size=$train_batch_size \ 79 | actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \ 80 | actor_rollout_ref.actor.clip_ratio=0.2 \ 81 | actor_rollout_ref.actor.use_kl_loss=False \ 82 | \ 83 | actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ 84 | actor_rollout_ref.rollout.n=1 \ 85 | actor_rollout_ref.rollout.name=vllm \ 86 | actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \ 87 | actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \ 88 | actor_rollout_ref.rollout.max_num_batched_tokens=16384 \ 89 | \ 90 | critic.model.path="$critic_path" \ 91 | critic.model.enable_gradient_checkpointing=True \ 92 | critic.model.use_remove_padding=True \ 93 | critic.model.fsdp_config.param_offload=False \ 94 | critic.model.fsdp_config.optimizer_offload=False \ 95 | critic.optim.lr=$critic_lr \ 96 | critic.optim.lr_warmup_steps_ratio=0 \ 97 | critic.optim.warmup_style=cosine \ 98 | critic.optim.min_lr_ratio=0.1 \ 99 | critic.use_dynamic_bsz=False \ 100 | critic.ppo_micro_batch_size_per_gpu=1 \ 101 | \ 102 | reward_model.enable=False \ 103 | reward_model.reward_manager=batch \ 104 | custom_reward_function.path=$reward_func_path \ 105 | custom_reward_function.name=compute_score_batch \ 106 | \ 107 | trainer.n_gpus_per_node=8 \ 108 | trainer.nnodes=$nodes \ 109 | trainer.critic_warmup=0 \ 110 | trainer.logger='["console","wandb"]' \ 111 | trainer.project_name='verl_ppo_general' \ 112 | trainer.val_before_train=False \ 113 | trainer.experiment_name="$name" \ 114 | trainer.save_freq=100 \ 115 | trainer.total_epochs=1 \ 116 | trainer.default_local_dir=$output_dir \ 117 | \ 118 | trainer.rollout_data_dir="${output_dir}/trajectory_data/rollout" \ 119 | $@ 120 | 121 | else 122 | sleep 10 123 | MASTER_ADDR=$(cat "$TARGET_FILE") 124 | 125 | echo "Starting worker node (RANK=${RANK}), connecting to ${MASTER_ADDR}:${MASTER_PORT}..." 126 | ray start --address ${MASTER_ADDR}:${MASTER_PORT} --num-gpus 8 --block & 127 | 128 | sleep 60 129 | while true; do 130 | status=$(ray status 2>&1) 131 | 132 | if echo "$status" | grep -q "Active:"; then 133 | echo "Active nodes found. Sleeping for 10 min..." 134 | sleep 600 135 | else 136 | echo "No active nodes found. Exiting..." 
137 | exit 0 138 | fi 139 | done 140 | 141 | fi -------------------------------------------------------------------------------- /examples/ppo/qwen3-8b_hh-rlhf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Verl PPO training script for Qwen3-8B 3 | set -x 4 | 5 | # Parameters from original script 6 | nodes=2 7 | train_batch_size=512 8 | actor_lr=1e-6 9 | critic_lr=1e-5 10 | data_name=HH-RLHF 11 | policy_model_name=Qwen3-8B-Instruct 12 | reward_model_name=POLAR-7B 13 | 14 | # Model paths 15 | actor_path=Qwen/Qwen3-8B 16 | critic_path=Qwen/Qwen3-8B 17 | 18 | # Data paths 19 | train_data_path=$HOME/data/full_hh_rlhf/train.parquet 20 | test_data_path=$HOME/data/full_hh_rlhf/train.parquet # no use 21 | 22 | # Reward Configuration 23 | reward_func_path="../src/polar/reward_func.py" 24 | 25 | # Experiment name 26 | name="verl_ppo_policy_${policy_model_name}_reward_${reward_model_name}_data_${data_name}" 27 | output_dir="../outputs/${name}" 28 | 29 | # Create output directory if it doesn't exist 30 | mkdir -p $output_dir 31 | 32 | # Set wandb to offline mode to prevent online sync 33 | # export WANDB_MODE=offline 34 | export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:False 35 | 36 | TARGET_FILE="$output_dir/addr_${name}.txt" 37 | RANK=${RANK:-${NODE_RANK:-0}} 38 | MASTER_PORT=6379 39 | MASTER_ADDR=${MASTER_ADDR} 40 | echo "MASTER_ADDR: $MASTER_ADDR" 41 | echo "Rank $RANK is running on $MASTER_ADDR" 42 | 43 | if [ "$RANK" -eq 0 ]; then 44 | echo "Starting head node (RANK=${RANK}) on port $MASTER_PORT..." 45 | 46 | MASTER_ADDR=${MASTER_ADDR} 47 | echo "$MASTER_ADDR" > "$TARGET_FILE" 48 | 49 | ray start --head --num-gpus 8 --dashboard-host=0.0.0.0 --dashboard-port=8265 --disable-usage-stats --block & 50 | sleep 30 51 | 52 | echo "Executing main program on head node..." 
53 | 54 | python3 -m verl.trainer.main_ppo \ 55 | algorithm.adv_estimator=gae \ 56 | algorithm.gamma=1.0 \ 57 | algorithm.lam=1.0 \ 58 | algorithm.use_kl_in_reward=False \ 59 | algorithm.kl_ctrl.kl_coef=0 \ 60 | algorithm.kl_ctrl.type='adaptive' \ 61 | \ 62 | data.train_files="$train_data_path" \ 63 | data.val_files="$test_data_path" \ 64 | data.train_batch_size=$train_batch_size \ 65 | data.max_prompt_length=128 \ 66 | data.max_response_length=16000 \ 67 | data.filter_overlong_prompts=True \ 68 | data.truncation='error' \ 69 | data.prompt_key='prompt' \ 70 | \ 71 | actor_rollout_ref.model.path="$actor_path" \ 72 | actor_rollout_ref.model.enable_gradient_checkpointing=True \ 73 | actor_rollout_ref.model.use_remove_padding=True \ 74 | actor_rollout_ref.model.use_shm=False \ 75 | \ 76 | actor_rollout_ref.actor.optim.lr=$actor_lr \ 77 | actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.03 \ 78 | actor_rollout_ref.actor.ppo_mini_batch_size=$train_batch_size \ 79 | actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \ 80 | actor_rollout_ref.actor.clip_ratio=0.2 \ 81 | actor_rollout_ref.actor.use_kl_loss=False \ 82 | \ 83 | actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ 84 | actor_rollout_ref.rollout.n=1 \ 85 | actor_rollout_ref.rollout.name=vllm \ 86 | actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \ 87 | actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \ 88 | actor_rollout_ref.rollout.max_num_batched_tokens=16384 \ 89 | \ 90 | critic.model.path="$critic_path" \ 91 | critic.model.enable_gradient_checkpointing=True \ 92 | critic.model.use_remove_padding=True \ 93 | critic.model.fsdp_config.param_offload=False \ 94 | critic.model.fsdp_config.optimizer_offload=False \ 95 | critic.optim.lr=$critic_lr \ 96 | critic.optim.lr_warmup_steps_ratio=0 \ 97 | critic.optim.warmup_style=cosine \ 98 | critic.optim.min_lr_ratio=0.1 \ 99 | critic.use_dynamic_bsz=False \ 100 | critic.ppo_micro_batch_size_per_gpu=1 \ 101 | \ 102 | reward_model.enable=False \ 103 | reward_model.reward_manager=batch \ 104 | custom_reward_function.path=$reward_func_path \ 105 | custom_reward_function.name=compute_score_batch \ 106 | \ 107 | trainer.n_gpus_per_node=8 \ 108 | trainer.nnodes=$nodes \ 109 | trainer.critic_warmup=0 \ 110 | trainer.logger='["console","wandb"]' \ 111 | trainer.project_name='verl_ppo_hh-rlhf' \ 112 | trainer.val_before_train=False \ 113 | trainer.experiment_name="$name" \ 114 | trainer.save_freq=100 \ 115 | trainer.total_epochs=5 \ 116 | trainer.default_local_dir=$output_dir \ 117 | \ 118 | trainer.rollout_data_dir="${output_dir}/trajectory_data/rollout" \ 119 | $@ 120 | 121 | else 122 | sleep 10 123 | MASTER_ADDR=$(cat "$TARGET_FILE") 124 | 125 | echo "Starting worker node (RANK=${RANK}), connecting to ${MASTER_ADDR}:${MASTER_PORT}..." 126 | ray start --address ${MASTER_ADDR}:${MASTER_PORT} --num-gpus 8 --block & 127 | 128 | sleep 60 129 | while true; do 130 | status=$(ray status 2>&1) 131 | 132 | if echo "$status" | grep -q "Active:"; then 133 | echo "Active nodes found. Sleeping for 10 min..." 134 | sleep 600 135 | else 136 | echo "No active nodes found. Exiting..." 
137 | exit 0 138 | fi 139 | done 140 | 141 | fi -------------------------------------------------------------------------------- /examples/ppo/qwen3-8b_math.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Verl PPO training script for Qwen3-8B 3 | set -x 4 | 5 | # Parameters from original script 6 | nodes=4 7 | train_batch_size=1024 8 | actor_lr=1e-6 9 | critic_lr=1e-5 10 | data_name=MATH 11 | policy_model_name=Qwen3-8B-Instruct 12 | reward_model_name=POLAR-7B 13 | 14 | # Model paths 15 | actor_path=Qwen/Qwen3-8B 16 | critic_path=Qwen/Qwen3-8B 17 | 18 | # Data paths 19 | train_data_path=$HOME/data/math/train.parquet 20 | test_data_path=$HOME/data/math/test.parquet 21 | 22 | # Reward Configuration 23 | reward_func_path="../src/polar/reward_func.py" 24 | 25 | # Experiment name 26 | name="verl_ppo_policy_${policy_model_name}_reward_${reward_model_name}_data_${data_name}" 27 | output_dir="../outputs/${name}" 28 | 29 | # Create output directory if it doesn't exist 30 | mkdir -p $output_dir 31 | 32 | # Set wandb to offline mode to prevent online sync 33 | # export WANDB_MODE=offline 34 | export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:False 35 | 36 | TARGET_FILE="$output_dir/addr_${name}.txt" 37 | RANK=${RANK:-${NODE_RANK:-0}} 38 | MASTER_PORT=6379 39 | MASTER_ADDR=${MASTER_ADDR} 40 | echo "MASTER_ADDR: $MASTER_ADDR" 41 | echo "Rank $RANK is running on $MASTER_ADDR" 42 | 43 | if [ "$RANK" -eq 0 ]; then 44 | echo "Starting head node (RANK=${RANK}) on port $MASTER_PORT..." 45 | 46 | MASTER_ADDR=${MASTER_ADDR} 47 | echo "$MASTER_ADDR" > "$TARGET_FILE" 48 | 49 | ray start --head --num-gpus 8 --dashboard-host=0.0.0.0 --dashboard-port=8265 --disable-usage-stats --block & 50 | sleep 30 51 | 52 | echo "Executing main program on head node..." 
53 | 54 | python3 -m verl.trainer.main_ppo \ 55 | algorithm.adv_estimator=gae \ 56 | algorithm.gamma=1.0 \ 57 | algorithm.lam=1.0 \ 58 | algorithm.use_kl_in_reward=False \ 59 | algorithm.kl_ctrl.kl_coef=0 \ 60 | algorithm.kl_ctrl.type='adaptive' \ 61 | \ 62 | data.train_files="$train_data_path" \ 63 | data.val_files="$test_data_path" \ 64 | data.train_batch_size=$train_batch_size \ 65 | data.max_prompt_length=1024 \ 66 | data.max_response_length=15000 \ 67 | data.filter_overlong_prompts=True \ 68 | data.truncation='error' \ 69 | data.prompt_key='prompt' \ 70 | \ 71 | actor_rollout_ref.model.path="$actor_path" \ 72 | actor_rollout_ref.model.enable_gradient_checkpointing=True \ 73 | actor_rollout_ref.model.use_remove_padding=True \ 74 | actor_rollout_ref.model.use_shm=False \ 75 | \ 76 | actor_rollout_ref.actor.optim.lr=$actor_lr \ 77 | actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.03 \ 78 | actor_rollout_ref.actor.ppo_mini_batch_size=$train_batch_size \ 79 | actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \ 80 | actor_rollout_ref.actor.clip_ratio=0.2 \ 81 | actor_rollout_ref.actor.use_kl_loss=False \ 82 | \ 83 | actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ 84 | actor_rollout_ref.rollout.n=1 \ 85 | actor_rollout_ref.rollout.name=vllm \ 86 | actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \ 87 | actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \ 88 | actor_rollout_ref.rollout.max_num_batched_tokens=16384 \ 89 | \ 90 | critic.model.path="$critic_path" \ 91 | critic.model.enable_gradient_checkpointing=True \ 92 | critic.model.use_remove_padding=True \ 93 | critic.model.fsdp_config.param_offload=False \ 94 | critic.model.fsdp_config.optimizer_offload=False \ 95 | critic.optim.lr=$critic_lr \ 96 | critic.optim.lr_warmup_steps_ratio=0 \ 97 | critic.optim.warmup_style=cosine \ 98 | critic.optim.min_lr_ratio=0.1 \ 99 | critic.use_dynamic_bsz=False \ 100 | critic.ppo_micro_batch_size_per_gpu=1 \ 101 | \ 102 | reward_model.enable=False \ 103 | reward_model.reward_manager=batch \ 104 | custom_reward_function.path=$reward_func_path \ 105 | custom_reward_function.name=compute_score_batch \ 106 | \ 107 | trainer.n_gpus_per_node=8 \ 108 | trainer.nnodes=$nodes \ 109 | trainer.critic_warmup=0 \ 110 | trainer.logger='["console","wandb"]' \ 111 | trainer.project_name='verl_ppo_math' \ 112 | trainer.val_before_train=True \ 113 | trainer.experiment_name="$name" \ 114 | trainer.save_freq=100 \ 115 | trainer.test_freq=5 \ 116 | trainer.total_epochs=100 \ 117 | trainer.default_local_dir=$output_dir \ 118 | \ 119 | trainer.rollout_data_dir="${output_dir}/trajectory_data/rollout" \ 120 | $@ 121 | 122 | else 123 | sleep 10 124 | MASTER_ADDR=$(cat "$TARGET_FILE") 125 | 126 | echo "Starting worker node (RANK=${RANK}), connecting to ${MASTER_ADDR}:${MASTER_PORT}..." 127 | ray start --address ${MASTER_ADDR}:${MASTER_PORT} --num-gpus 8 --block & 128 | 129 | sleep 60 130 | while true; do 131 | status=$(ray status 2>&1) 132 | 133 | if echo "$status" | grep -q "Active:"; then 134 | echo "Active nodes found. Sleeping for 10 min..." 135 | sleep 600 136 | else 137 | echo "No active nodes found. Exiting..." 138 | exit 0 139 | fi 140 | done 141 | 142 | fi -------------------------------------------------------------------------------- /examples/xtuner_configs/POLAR_1_8B_full_varlenattn_custom_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
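# xtuner config for full-parameter preference fine-tuning of POLAR-1_8B-Base:
# the RewardModel wrapper is trained with a ranking loss on (chosen, rejected)
# pairs packed together with a reference trajectory, using [UNUSED_TOKEN_130]
# (token id 92527) as the reward token and variable-length (packed) attention.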
2 | from mmengine.dataset import DefaultSampler 3 | from mmengine.hooks import ( 4 | CheckpointHook, 5 | DistSamplerSeedHook, 6 | IterTimerHook, 7 | LoggerHook, 8 | ParamSchedulerHook, 9 | ) 10 | from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR 11 | from mmengine.visualization import Visualizer, TensorboardVisBackend 12 | from torch.optim import AdamW 13 | from transformers import AutoModel, AutoTokenizer 14 | 15 | from datasets import load_dataset 16 | from xtuner.dataset.collate_fns.preference_collate_fn import preference_collate_fn 17 | from xtuner.dataset.preference_dataset import build_preference_dataset 18 | from xtuner.engine.hooks import VarlenAttnArgsToMessageHubHook 19 | from xtuner.engine.runner import TrainLoop 20 | from xtuner.model.reward import RewardModel 21 | from xtuner.parallel.sequence import SequenceParallelSampler 22 | 23 | ####################################################################### 24 | # PART 1 Settings # 25 | ####################################################################### 26 | # Model 27 | pretrained_model_name_or_path = "internlm/POLAR-1_8B-Base" 28 | use_varlen_attn = True 29 | reward_token_id = 92527 # use [UNUSED_TOKEN_130] as reward token 30 | loss_type = "ranking" 31 | penalty_type = "none" 32 | 33 | # Data 34 | max_length = 16384 35 | max_response_length = 4096 36 | max_packed_length = max_length * 2 37 | 38 | # parallel 39 | sequence_parallel_size = 1 40 | 41 | # Scheduler & Optimizer 42 | batch_size = 1 # per_device 43 | accumulative_counts = 2 44 | accumulative_counts *= sequence_parallel_size 45 | dataloader_num_workers = 0 46 | max_epochs = 1 # reward model should not be trained for more than 1 epoch to avoid overfitting # noqa: E501 47 | optim_type = AdamW 48 | lr = 1e-5 49 | betas = (0.9, 0.95) 50 | weight_decay = 0 51 | max_norm = 1 # grad clip 52 | warmup_ratio = 0.03 53 | 54 | # Save 55 | save_steps = 500 56 | save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited) 57 | 58 | # Evaluate the generation performance during the training 59 | # TODO: eval 60 | # evaluation_freq = 500 61 | 62 | ####################################################################### 63 | # PART 2 Model & Tokenizer # 64 | ####################################################################### 65 | tokenizer = dict( 66 | type=AutoTokenizer.from_pretrained, 67 | pretrained_model_name_or_path=pretrained_model_name_or_path, 68 | trust_remote_code=True, 69 | padding_side="left", 70 | ) 71 | 72 | model = dict( 73 | type=RewardModel, 74 | use_varlen_attn=use_varlen_attn, 75 | loss_type=loss_type, 76 | penalty_type=penalty_type, 77 | llm=dict( 78 | type=AutoModel.from_pretrained, 79 | pretrained_model_name_or_path=pretrained_model_name_or_path, 80 | trust_remote_code=True, 81 | ), 82 | ) 83 | 84 | ####################################################################### 85 | # PART 3 Dataset & Dataloader # 86 | ####################################################################### 87 | sampler = SequenceParallelSampler if sequence_parallel_size > 1 else DefaultSampler 88 | 89 | # preference data format example: 90 | # { 91 | # "prompt": [{"role": "user", "content": "What is the capital of France?"}], 92 | # "reference": [{"role": "assistant", "content": "The capital of France is Paris."}], 93 | # "chosen": [{"role": "assistant", "content": "Paris."}], 94 | # "rejected": [{"role": "assistant", "content": "I don't know."}], 95 | # } 96 | 97 | train_dataset = dict( 98 | type=build_preference_dataset, 99 | dataset=dict( 100 | 
type=load_dataset, 101 | # Replace with your custom dataset path 102 | # For example, if you have a local /path/to/file/train.jsonl, you can use: 103 | # path="/path/to/file", 104 | path="/your/custom/path/here", 105 | ), 106 | tokenizer=tokenizer, 107 | max_length=max_length, 108 | dataset_map_fn=None, 109 | is_dpo=False, 110 | is_reward=True, 111 | reward_token_id=reward_token_id, 112 | num_proc=32, 113 | use_varlen_attn=use_varlen_attn, 114 | max_packed_length=max_packed_length, 115 | shuffle_before_pack=True, 116 | max_response_length=max_response_length, 117 | is_reference=True 118 | ) 119 | 120 | train_dataloader = dict( 121 | batch_size=batch_size, 122 | num_workers=dataloader_num_workers, 123 | dataset=train_dataset, 124 | sampler=dict(type=sampler, shuffle=True), 125 | collate_fn=dict(type=preference_collate_fn, use_varlen_attn=use_varlen_attn), 126 | ) 127 | 128 | ####################################################################### 129 | # PART 4 Scheduler & Optimizer # 130 | ####################################################################### 131 | # optimizer 132 | optim_wrapper = dict( 133 | type=AmpOptimWrapper, 134 | optimizer=dict(type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), 135 | clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), 136 | accumulative_counts=accumulative_counts, 137 | loss_scale="dynamic", 138 | dtype="float16", 139 | ) 140 | 141 | # learning policy 142 | # More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 143 | param_scheduler = [ 144 | dict( 145 | type=LinearLR, 146 | start_factor=lr * 0.1, 147 | by_epoch=True, 148 | begin=0, 149 | end=warmup_ratio * max_epochs, 150 | convert_to_iter_based=True, 151 | ), 152 | dict( 153 | type=CosineAnnealingLR, 154 | eta_min=lr * 0.1, 155 | by_epoch=True, 156 | begin=warmup_ratio * max_epochs, 157 | end=max_epochs, 158 | convert_to_iter_based=True, 159 | ), 160 | ] 161 | 162 | # train, val, test setting 163 | train_cfg = dict(type=TrainLoop, max_epochs=max_epochs) 164 | 165 | ####################################################################### 166 | # PART 5 Runtime # 167 | ####################################################################### 168 | # Log the dialogue periodically during the training process, optional 169 | custom_hooks = [] 170 | 171 | if use_varlen_attn: 172 | custom_hooks += [dict(type=VarlenAttnArgsToMessageHubHook)] 173 | 174 | # configure default hooks 175 | default_hooks = dict( 176 | # record the time of every iteration. 177 | timer=dict(type=IterTimerHook), 178 | # print log every 10 iterations. 179 | logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), 180 | # enable the parameter scheduler. 181 | param_scheduler=dict(type=ParamSchedulerHook), 182 | # save checkpoint per `save_steps`. 183 | checkpoint=dict( 184 | type=CheckpointHook, 185 | by_epoch=False, 186 | interval=save_steps, 187 | max_keep_ckpts=save_total_limit, 188 | ), 189 | # set sampler seed in distributed evrionment. 
190 | sampler_seed=dict(type=DistSamplerSeedHook), 191 | ) 192 | 193 | # configure environment 194 | env_cfg = dict( 195 | # whether to enable cudnn benchmark 196 | cudnn_benchmark=False, 197 | # set multi process parameters 198 | mp_cfg=dict(mp_start_method="fork", opencv_num_threads=0), 199 | # set distributed parameters 200 | dist_cfg=dict(backend="nccl"), 201 | ) 202 | 203 | # set visualizer 204 | visualizer = dict( 205 | type=Visualizer, 206 | vis_backends=[dict(type=TensorboardVisBackend)] 207 | ) 208 | 209 | # set log level 210 | log_level = "INFO" 211 | 212 | # load from which checkpoint 213 | load_from = None 214 | 215 | # whether to resume training from the loaded checkpoint 216 | resume = False 217 | 218 | # Defaults to use random seed and disable `deterministic` 219 | randomness = dict(seed=None, deterministic=False) 220 | 221 | # set log processor 222 | log_processor = dict(by_epoch=False) 223 | -------------------------------------------------------------------------------- /examples/xtuner_configs/POLAR_7B_full_varlenattn_custom_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from mmengine.dataset import DefaultSampler 3 | from mmengine.hooks import ( 4 | CheckpointHook, 5 | DistSamplerSeedHook, 6 | IterTimerHook, 7 | LoggerHook, 8 | ParamSchedulerHook, 9 | ) 10 | from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR 11 | from mmengine.visualization import Visualizer, TensorboardVisBackend 12 | from torch.optim import AdamW 13 | from transformers import AutoModel, AutoTokenizer 14 | 15 | from datasets import load_dataset 16 | from xtuner.dataset.collate_fns.preference_collate_fn import preference_collate_fn 17 | from xtuner.dataset.preference_dataset import build_preference_dataset 18 | from xtuner.engine.hooks import VarlenAttnArgsToMessageHubHook 19 | from xtuner.engine.runner import TrainLoop 20 | from xtuner.model.reward import RewardModel 21 | from xtuner.parallel.sequence import SequenceParallelSampler 22 | 23 | ####################################################################### 24 | # PART 1 Settings # 25 | ####################################################################### 26 | # Model 27 | pretrained_model_name_or_path = "internlm/POLAR-7B-Base" 28 | use_varlen_attn = True 29 | reward_token_id = 92527 # use [UNUSED_TOKEN_130] as reward token 30 | loss_type = "ranking" 31 | penalty_type = "none" 32 | 33 | # Data 34 | max_length = 16384 35 | max_response_length = 4096 36 | max_packed_length = max_length * 2 37 | 38 | # parallel 39 | sequence_parallel_size = 1 40 | 41 | # Scheduler & Optimizer 42 | batch_size = 1 # per_device 43 | accumulative_counts = 2 44 | accumulative_counts *= sequence_parallel_size 45 | dataloader_num_workers = 0 46 | max_epochs = 1 # reward model should not be trained for more than 1 epoch to avoid overfitting # noqa: E501 47 | optim_type = AdamW 48 | lr = 2e-5 49 | betas = (0.9, 0.95) 50 | weight_decay = 0 51 | max_norm = 1 # grad clip 52 | warmup_ratio = 0.03 53 | 54 | # Save 55 | save_steps = 500 56 | save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited) 57 | 58 | # Evaluate the generation performance during the training 59 | # TODO: eval 60 | # evaluation_freq = 500 61 | 62 | ####################################################################### 63 | # PART 2 Model & Tokenizer # 64 | ####################################################################### 65 | tokenizer = dict( 66 | 
type=AutoTokenizer.from_pretrained, 67 | pretrained_model_name_or_path=pretrained_model_name_or_path, 68 | trust_remote_code=True, 69 | padding_side="left", 70 | ) 71 | 72 | model = dict( 73 | type=RewardModel, 74 | use_varlen_attn=use_varlen_attn, 75 | loss_type=loss_type, 76 | penalty_type=penalty_type, 77 | llm=dict( 78 | type=AutoModel.from_pretrained, 79 | pretrained_model_name_or_path=pretrained_model_name_or_path, 80 | trust_remote_code=True, 81 | ), 82 | ) 83 | 84 | ####################################################################### 85 | # PART 3 Dataset & Dataloader # 86 | ####################################################################### 87 | sampler = SequenceParallelSampler if sequence_parallel_size > 1 else DefaultSampler 88 | 89 | # preference data format example: 90 | # { 91 | # "prompt": [{"role": "user", "content": "What is the capital of France?"}], 92 | # "reference": [{"role": "assistant", "content": "The capital of France is Paris."}], 93 | # "chosen": [{"role": "assistant", "content": "Paris."}], 94 | # "rejected": [{"role": "assistant", "content": "I don't know."}], 95 | # } 96 | 97 | train_dataset = dict( 98 | type=build_preference_dataset, 99 | dataset=dict( 100 | type=load_dataset, 101 | # Replace with your custom dataset path 102 | # For example, if you have a local /path/to/file/train.jsonl, you can use: 103 | # path="/path/to/file", 104 | path="/your/custom/path/here", 105 | ), 106 | tokenizer=tokenizer, 107 | max_length=max_length, 108 | dataset_map_fn=None, 109 | is_dpo=False, 110 | is_reward=True, 111 | reward_token_id=reward_token_id, 112 | num_proc=32, 113 | use_varlen_attn=use_varlen_attn, 114 | max_packed_length=max_packed_length, 115 | shuffle_before_pack=True, 116 | max_response_length=max_response_length, 117 | is_reference=True 118 | ) 119 | 120 | train_dataloader = dict( 121 | batch_size=batch_size, 122 | num_workers=dataloader_num_workers, 123 | dataset=train_dataset, 124 | sampler=dict(type=sampler, shuffle=True), 125 | collate_fn=dict(type=preference_collate_fn, use_varlen_attn=use_varlen_attn), 126 | ) 127 | 128 | ####################################################################### 129 | # PART 4 Scheduler & Optimizer # 130 | ####################################################################### 131 | # optimizer 132 | optim_wrapper = dict( 133 | type=AmpOptimWrapper, 134 | optimizer=dict(type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), 135 | clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), 136 | accumulative_counts=accumulative_counts, 137 | loss_scale="dynamic", 138 | dtype="float16", 139 | ) 140 | 141 | # learning policy 142 | # More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 143 | param_scheduler = [ 144 | dict( 145 | type=LinearLR, 146 | start_factor=lr * 0.1, 147 | by_epoch=True, 148 | begin=0, 149 | end=warmup_ratio * max_epochs, 150 | convert_to_iter_based=True, 151 | ), 152 | dict( 153 | type=CosineAnnealingLR, 154 | eta_min=lr * 0.1, 155 | by_epoch=True, 156 | begin=warmup_ratio * max_epochs, 157 | end=max_epochs, 158 | convert_to_iter_based=True, 159 | ), 160 | ] 161 | 162 | # train, val, test setting 163 | train_cfg = dict(type=TrainLoop, max_epochs=max_epochs) 164 | 165 | ####################################################################### 166 | # PART 5 Runtime # 167 | ####################################################################### 168 | # Log the dialogue periodically during the training process, optional 169 | 
custom_hooks = [] 170 | 171 | if use_varlen_attn: 172 | custom_hooks += [dict(type=VarlenAttnArgsToMessageHubHook)] 173 | 174 | # configure default hooks 175 | default_hooks = dict( 176 | # record the time of every iteration. 177 | timer=dict(type=IterTimerHook), 178 | # print log every 10 iterations. 179 | logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), 180 | # enable the parameter scheduler. 181 | param_scheduler=dict(type=ParamSchedulerHook), 182 | # save checkpoint per `save_steps`. 183 | checkpoint=dict( 184 | type=CheckpointHook, 185 | by_epoch=False, 186 | interval=save_steps, 187 | max_keep_ckpts=save_total_limit, 188 | ), 189 | # set sampler seed in distributed evrionment. 190 | sampler_seed=dict(type=DistSamplerSeedHook), 191 | ) 192 | 193 | # configure environment 194 | env_cfg = dict( 195 | # whether to enable cudnn benchmark 196 | cudnn_benchmark=False, 197 | # set multi process parameters 198 | mp_cfg=dict(mp_start_method="fork", opencv_num_threads=0), 199 | # set distributed parameters 200 | dist_cfg=dict(backend="nccl"), 201 | ) 202 | 203 | # set visualizer 204 | visualizer = dict( 205 | type=Visualizer, 206 | vis_backends=[dict(type=TensorboardVisBackend)] 207 | ) 208 | 209 | # set log level 210 | log_level = "INFO" 211 | 212 | # load from which checkpoint 213 | load_from = None 214 | 215 | # whether to resume training from the loaded checkpoint 216 | resume = False 217 | 218 | # Defaults to use random seed and disable `deterministic` 219 | randomness = dict(seed=None, deterministic=False) 220 | 221 | # set log processor 222 | log_processor = dict(by_epoch=False) 223 | -------------------------------------------------------------------------------- /src/polar/__init__.py: -------------------------------------------------------------------------------- 1 | from .reward_func import RewardModelClient 2 | -------------------------------------------------------------------------------- /src/polar/reward_func.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 POLAR Team and/or its affiliates 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from typing import List, Union 16 | from time import sleep 17 | import requests 18 | import re 19 | from transformers import AutoTokenizer 20 | 21 | # Config reward model server 22 | ADDRESS = "127.0.0.1:30000" # Modify according to your server address 23 | SERVER_TYPE = "sglang" # Options: "sglang", "vllm", "lmdeploy" 24 | MODEL_PATH = "internlm/POLAR-7B" 25 | 26 | 27 | class RewardModelClient: 28 | """This class is used to process the input sequences for the reward 29 | model.""" 30 | 31 | def __init__( 32 | self, 33 | path, 34 | max_length=16384, 35 | max_response_length=4096, 36 | response_cut_side="right", 37 | server_type="sglang", 38 | server_address="127.0.0.1:30000", 39 | ): 40 | """ 41 | Args: 42 | path: Path to the reward model. 43 | max_length: Maximum length of the input sequence. 
44 | max_response_length: Maximum length of the response sequence. 45 | response_cut_side: Side to cut the response sequence if it exceeds the maximum length. 46 | server_type: Type of the server, can be "sglang", "vllm", or "lmdeploy". 47 | server_address: Address of the reward model server. 48 | """ 49 | self.rm_name = path.split("/")[-1] 50 | self.tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True) 51 | # reserve 4 tokens: the final reward token, one <|reward|> token, and two '\n' tokens 52 | self.max_length = max_length - 4 53 | self.max_response_length = max_response_length 54 | self.response_cut_side = response_cut_side 55 | self.server_type = server_type 56 | self.server_address = server_address 57 | 58 | def _encode(self, prompt, reference, output, wrapper="sft") -> str: 59 | """Construct the input string for the reward model. 60 | 61 | Args: 62 | prompt: Prompt. 63 | reference: Reference trajectory. 64 | output: Candidate trajectory. 65 | wrapper: The wrapper type. Can be "sft" or "pretrain". 66 | Returns: 67 | The constructed input string for RM. 68 | """ 69 | p = ( 70 | "\n".join([e["content"] for e in prompt]) 71 | if isinstance(prompt, list) 72 | else prompt 73 | ) 74 | r1 = ( 75 | "\n".join([e["content"] for e in reference]) 76 | if isinstance(reference, list) 77 | else reference 78 | ) 79 | r2 = ( 80 | "\n".join([e["content"] for e in output]) 81 | if isinstance(output, list) 82 | else output 83 | ) 84 | 85 | p_ids = self.tokenizer.encode(p, add_special_tokens=True) 86 | r1_ids = self.tokenizer.encode(r1, add_special_tokens=True) 87 | r2_ids = self.tokenizer.encode(r2, add_special_tokens=True) 88 | 89 | if len(r1_ids) > self.max_response_length: 90 | print( 91 | f"Reference sequence length {len(r1_ids)} is " 92 | f"larger than max_response_length {self.max_response_length}", 93 | ) 94 | if self.response_cut_side == "right": 95 | r1_ids = r1_ids[: self.max_response_length] 96 | else: 97 | r1_ids = r1_ids[-self.max_response_length:] 98 | if len(r2_ids) > self.max_response_length: 99 | print( 100 | f"Output sequence length {len(r2_ids)} is " 101 | f"larger than max_response_length {self.max_response_length}", 102 | ) 103 | if self.response_cut_side == "right": 104 | r2_ids = r2_ids[: self.max_response_length] 105 | else: 106 | r2_ids = r2_ids[-self.max_response_length:] 107 | 108 | max_prompt_length = (self.max_length - len(r1_ids) - len(r2_ids)) // 2 109 | 110 | if len(p_ids) > max_prompt_length: 111 | print( 112 | f"Prompt sequence length {len(p_ids)} is " 113 | f"larger than max_prompt_length {max_prompt_length}", 114 | ) 115 | p_ids = p_ids[-max_prompt_length:] 116 | 117 | p = self.tokenizer.decode(p_ids, skip_special_tokens=True) 118 | r1 = self.tokenizer.decode(r1_ids, skip_special_tokens=True) 119 | r2 = self.tokenizer.decode(r2_ids, skip_special_tokens=True) 120 | 121 | # Fit the template of RM 122 | _reference_cat = ( 123 | p + r1 if wrapper == "pretrain" or len(r1) == 0 else p + "\n" + r1 124 | ) 125 | _output_cat = ( 126 | p + r2 if wrapper == "pretrain" or len(r2) == 0 else p + "\n" + r2 127 | ) 128 | 129 | final_txt = _reference_cat + "<|reward|>" + _output_cat + "[UNUSED_TOKEN_130]" 130 | 131 | return final_txt 132 | 133 | def encode(self, data) -> Union[str, List[str]]: 134 | """Encode the input data into a format suitable for RM. 135 | 136 | Args: 137 | data: A dictionary or a list of dictionaries containing the keys 138 | 'prompt', 'reference', 'output', and optionally 'wrapper'. 139 | Returns: 140 | The encoded input string for RM.
141 | """ 142 | if isinstance(data, dict): 143 | return self._encode(**data) 144 | elif isinstance(data, list): 145 | return [ 146 | self._encode(**item) if isinstance(item, dict) else item 147 | for item in data 148 | ] 149 | else: 150 | raise ValueError( 151 | "Input data must be a dictionary or a list of dictionaries." 152 | ) 153 | 154 | def sglang_request_reward( 155 | self, data, retry_delay=0.2, max_retries=8 156 | ) -> List[float]: 157 | # Disable proxy for internal cluster communication 158 | for i in range(max_retries): 159 | try: 160 | res = requests.post( 161 | f"http://{self.server_address}/classify", 162 | json={ 163 | "model": self.rm_name, 164 | "text": data, 165 | }, 166 | proxies={"http": None, "https": None}, # Explicitly disable proxy 167 | timeout=30, # Add timeout 168 | ) 169 | rewards = [e["embedding"][0] for e in res.json()] 170 | return rewards 171 | except Exception as e: 172 | print(f"Error requesting reward: {e}") 173 | print(f"Raw response: {data}") 174 | sleep(retry_delay) 175 | continue 176 | print(f"Failed to request reward after {max_retries} retries") 177 | return None 178 | 179 | def vllm_request_reward(self, data, retry_delay=0.2, max_retries=8) -> List[float]: 180 | # Disable proxy for internal cluster communication 181 | for i in range(max_retries): 182 | try: 183 | res = requests.post( 184 | f"http://{self.server_address}/pooling", 185 | json={ 186 | "input": data, 187 | }, 188 | proxies={"http": None, "https": None}, # Explicitly disable proxy 189 | timeout=30, # Add timeout 190 | ) 191 | rewards = [e["data"][-1][0] for e in res.json()["data"]] 192 | return rewards 193 | except Exception as e: 194 | print(f"Error requesting reward: {e}") 195 | print(f"Raw response: {data}") 196 | sleep(retry_delay) 197 | continue 198 | print(f"Failed to request reward after {max_retries} retries") 199 | return None 200 | 201 | def lmdeploy_request_reward( 202 | self, data, retry_delay=0.2, max_retries=8 203 | ) -> List[float]: 204 | # Disable proxy for internal cluster communication 205 | for i in range(max_retries): 206 | try: 207 | res = requests.post( 208 | f"http://{self.server_address}/pooling", 209 | json={ 210 | "input": data, 211 | }, 212 | proxies={"http": None, "https": None}, # Explicitly disable proxy 213 | timeout=30, # Add timeout 214 | ) 215 | rewards = [e["data"] for e in res.json()["data"]] 216 | return rewards 217 | except Exception as e: 218 | print(f"Error requesting reward: {e}") 219 | print(f"Raw response: {data}") 220 | sleep(retry_delay) 221 | continue 222 | print(f"Failed to request reward after {max_retries} retries") 223 | return None 224 | 225 | def __call__(self, data) -> List[float]: 226 | """Call the input wrapper to construct the input string for RM. 227 | 228 | Args: 229 | data: A list of dictionaries containing the keys 230 | 'prompt', 'reference', 'output', and optionally 'wrapper'. 231 | retry_delay: Delay in seconds before retrying the request. 232 | max_retries: Maximum number of retries for the request. 233 | Returns: 234 | scores: The list of reward scores returned by the RM server. 235 | If the request fails, it returns None. 
236 | """ 237 | data = self.encode(data) 238 | if self.server_type == "sglang": 239 | scores = self.sglang_request_reward(data) 240 | elif self.server_type == "vllm": 241 | scores = self.vllm_request_reward(data) 242 | elif self.server_type == "lmdeploy": 243 | scores = self.lmdeploy_request_reward(data) 244 | else: 245 | raise ValueError(f"Unsupported server type: {self.server_type}") 246 | 247 | return scores 248 | 249 | 250 | # Global variable to hold the RewardModelClient instance 251 | _reward_client = None 252 | 253 | 254 | def get_reward_client(): 255 | """Get or create a RewardModelClient instance.""" 256 | global _reward_client 257 | if _reward_client is None: 258 | 259 | _reward_client = RewardModelClient( 260 | path=MODEL_PATH, 261 | server_type=SERVER_TYPE, 262 | server_address=ADDRESS, 263 | ) 264 | 265 | return _reward_client 266 | 267 | 268 | def extract_thinking_content(text: str) -> tuple[str, str]: 269 | pattern = r'<think>(.*?)</think>(.*)' 270 | match = re.search(pattern, text, re.DOTALL) 271 | if match: 272 | thinking_content = match.group(1).strip() 273 | remaining_content = match.group(2).strip() 274 | return thinking_content, remaining_content 275 | return "", text 276 | 277 | 278 | def compute_score_batch(data_sources, solution_strs, ground_truths, extra_infos, prompt_key="prompt"): 279 | """Compute scores for a batch of data using the POLAR reward model for VERL. 280 | 281 | Args: 282 | data_sources: List of data sources. 283 | solution_strs: List of solution strings. 284 | ground_truths: List of ground truth strings or {"role": xxx, "content": xxx} messages. 285 | extra_infos: List of extra information dictionaries containing prompt_key, 286 | which is the dictionary-style input prompt for the policy model and POLAR. 287 | 288 | Returns: 289 | scores: A list of computed scores for each data source. 290 | """ 291 | 292 | legacy_eval = False 293 | 294 | batch_data = [] 295 | for data_source, solution_str, ground_truth, extra_info in zip( 296 | data_sources, solution_strs, ground_truths, extra_infos, strict=True 297 | ): 298 | 299 | _, solution_str = extract_thinking_content(solution_str) 300 | 301 | if extra_info["ability"] == "math" and extra_info["split"] == "test": 302 | legacy_eval = True 303 | 304 | data = { 305 | "prompt": extra_info[prompt_key], 306 | "reference": ground_truth, 307 | "output": solution_str, 308 | "wrapper": "sft" 309 | } 310 | batch_data.append(data) 311 | 312 | if legacy_eval: 313 | # If the task is math, use rule-based rewards as test evaluation. 314 | from verl.utils.reward_score.math_verify import compute_score 315 | 316 | # For enhanced accuracy, we utilize Math-Verify (https://github.com/huggingface/Math-Verify). 317 | # Note: Math-Verify needs to be manually installed via pip: `pip install math-verify`.
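# In this branch the returned values are rule-based Math-Verify correctness scores
# (math test split only); all other batches fall through to the POLAR reward server below.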
318 | return [ 319 | compute_score(item["output"], item["reference"]) 320 | for item in batch_data 321 | ] 322 | 323 | client = get_reward_client() 324 | 325 | scores = client(batch_data) 326 | 327 | return scores 328 | 329 | 330 | if __name__ == "__main__": 331 | client = get_reward_client() 332 | data = [ 333 | { 334 | "prompt": [{"role": "user", "content": "What is the capital of China?"}], 335 | "reference": [{"role": "assistant", "content": "Beijing."}], 336 | "output": [{"role": "assistant", "content": "Beijing."}] 337 | }, 338 | { 339 | "prompt": [{"role": "user", "content": "What is the capital of China?"}], 340 | "reference": [{"role": "assistant", "content": "Beijing."}], 341 | "output": [{"role": "assistant", "content": "Shanghai."}] 342 | } 343 | ] 344 | scores = client(data) 345 | print(scores) 346 | --------------------------------------------------------------------------------