├── .clang-format ├── .dockerignore ├── .github └── workflows │ ├── format-check.yml │ ├── publish.yml │ ├── pytest.yml │ └── sphinx.yml ├── .gitignore ├── AUTHORS ├── CHANGES ├── Dockerfile ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── assets └── qrcode.jpg ├── csrc ├── cugae │ └── gae.cu ├── custom_all_reduce │ ├── custom_all_reduce.cu │ ├── custom_all_reduce.cuh │ ├── custom_all_reduce_test.cu │ └── pybind.cpp ├── interval_op │ ├── interval_op.cpp │ └── interval_op.cu └── search │ ├── device_mesh.cpp │ ├── device_mesh.hpp │ ├── rpc.cpp │ ├── rpc.hpp │ ├── search.cpp │ ├── simulate.cpp │ └── simulate.hpp ├── docker-compose.yml ├── docs ├── Makefile ├── make.bat └── source │ ├── _static │ └── custom.css │ ├── arch.rst │ ├── conf.py │ ├── contributing.rst │ ├── customization.rst │ ├── distributed.rst │ ├── expconfig.rst │ ├── images │ ├── dfg │ │ ├── dpo.svg │ │ ├── grpo.svg │ │ ├── ppo.svg │ │ └── reinforce.svg │ ├── experiment_workflow.svg │ ├── ppo_rwd.svg │ ├── real_logo.svg │ ├── real_logo_dark.svg │ ├── rlhf_dfg.svg │ ├── rw_loss.svg │ ├── sft_loss.svg │ ├── timeline.svg │ └── vws.svg │ ├── impl.rst │ ├── index.rst │ ├── install.rst │ ├── intro.rst │ └── quickstart.rst ├── examples ├── cluster_config.json ├── customized_exp │ ├── ppo_ref_ema.py │ ├── ppo_sentiment.py │ └── scripts │ │ ├── run_ppo_ref_ema.sh │ │ └── run_ppo_sentiment.sh ├── load_and_eval_rw.py ├── new_algorithms │ ├── grpo │ │ ├── grpo.sh │ │ ├── grpo_exp.py │ │ └── grpo_interface.py │ └── reinforce │ │ ├── reinforce.sh │ │ ├── reinforce_exp.py │ │ └── reinforce_interface.py ├── profiling │ ├── allocations.jsonl │ ├── datasets.jsonl │ ├── interfaces.jsonl │ ├── models.jsonl │ └── profile.sh ├── scripts │ ├── distributed_ray │ │ ├── dpo.sh │ │ ├── ppo.sh │ │ ├── rw.sh │ │ └── sft.sh │ ├── distributed_slurm │ │ ├── dpo.sh │ │ ├── ppo.sh │ │ ├── rw.sh │ │ └── sft.sh │ └── local │ │ ├── dpo.sh │ │ ├── gen.sh │ │ ├── ppo.sh │ │ ├── ppo_manual.sh │ │ ├── ppo_minibatched.sh │ │ ├── ppo_symm.sh │ │ ├── rw.sh │ │ └── sft.sh └── visualize_dfg.py ├── pyproject.toml ├── pytest.ini ├── realhf ├── __init__.py ├── api │ ├── core │ │ ├── config.py │ │ ├── data_api.py │ │ ├── dfg.py │ │ ├── model_api.py │ │ └── system_api.py │ ├── from_hf │ │ ├── __init__.py │ │ ├── gemma.py │ │ ├── gpt2.py │ │ ├── llama.py │ │ ├── mistral.py │ │ ├── mixtral.py │ │ └── qwen2.py │ └── quickstart │ │ ├── __init__.py │ │ ├── dataset.py │ │ ├── device_mesh.py │ │ ├── entrypoint.py │ │ ├── model.py │ │ └── search.py ├── apps │ ├── __init__.py │ ├── main.py │ ├── profile_layers.py │ ├── quickstart.py │ └── remote.py ├── base │ ├── __init__.py │ ├── asyncio_utils.py │ ├── cluster.py │ ├── constants.py │ ├── datapack.py │ ├── gpu_utils.py │ ├── importing.py │ ├── logging.py │ ├── monitor.py │ ├── name_resolve.py │ ├── names.py │ ├── network.py │ ├── numpy_utils.py │ ├── ray_utils.py │ ├── recover.py │ ├── saveload_utils.py │ ├── security.py │ ├── seeding.py │ ├── slurm_utils.py │ ├── testing.py │ ├── timeutil.py │ └── topology.py ├── experiments │ ├── benchmark │ │ └── profile_exp.py │ └── common │ │ ├── check.py │ │ ├── common.py │ │ ├── dpo_exp.py │ │ ├── gen_exp.py │ │ ├── ppo_exp.py │ │ ├── rw_exp.py │ │ ├── sft_exp.py │ │ └── utils.py ├── impl │ ├── dataset │ │ ├── __init__.py │ │ ├── prompt_answer_dataset.py │ │ ├── prompt_dataset.py │ │ └── rw_paired_dataset.py │ └── model │ │ ├── __init__.py │ │ ├── backend │ │ ├── deepspeed.py │ │ ├── inference.py │ │ ├── megatron.py │ │ └── pipe_runner.py │ │ ├── comm │ │ ├── data_transfer.py │ │ ├── 
global_comm.py │ │ └── param_realloc.py │ │ ├── conversion │ │ └── hf_registry.py │ │ ├── interface │ │ ├── dpo_interface.py │ │ ├── gen_interface.py │ │ ├── ppo_interface.py │ │ ├── rw_interface.py │ │ └── sft_interface.py │ │ ├── modules │ │ ├── __init__.py │ │ ├── activations.py │ │ ├── attn.py │ │ ├── embedding.py │ │ ├── mlp.py │ │ ├── moe │ │ │ ├── __init__.py │ │ │ ├── experts.py │ │ │ ├── layer.py │ │ │ ├── router.py │ │ │ └── token_dispatcher.py │ │ ├── rms.py │ │ └── rotary.py │ │ ├── nn │ │ ├── flatten_param.py │ │ ├── real_llm_api.py │ │ ├── real_llm_base.py │ │ ├── real_llm_generate.py │ │ └── real_llm_parallel.py │ │ ├── parallelism │ │ ├── model_parallel │ │ │ ├── custom_all_reduce.py │ │ │ ├── mappings.py │ │ │ ├── modules.py │ │ │ └── utils.py │ │ └── pipeline_parallel │ │ │ ├── instruction.py │ │ │ ├── p2p.py │ │ │ ├── static_schedule.py │ │ │ └── tensor_storage.py │ │ └── utils │ │ ├── cuda_graph.py │ │ ├── dpo_functional.py │ │ ├── functional.py │ │ ├── logits_warper.py │ │ ├── moe.py │ │ ├── padding.py │ │ ├── ppo_functional.py │ │ └── random.py ├── scheduler │ ├── client.py │ ├── local │ │ └── client.py │ └── slurm │ │ ├── client.py │ │ └── utils.py ├── search_engine │ ├── __init__.py │ ├── enumerate.py │ ├── estimate.py │ ├── layers.py │ ├── param_realloc.py │ ├── search.py │ └── utils.py └── system │ ├── __init__.py │ ├── buffer.py │ ├── controller.py │ ├── master_worker.py │ ├── model_worker.py │ ├── request_reply_stream.py │ ├── worker_base.py │ └── worker_control.py ├── requirements.txt ├── setup.py └── tests ├── __init__.py ├── comm └── test_param_realloc.py ├── cpp_extensions ├── test_cugae.py ├── test_grouped_gemm.py └── test_interval_ops.py ├── data ├── test_dfg.py ├── test_load_data.py └── test_sequence_gather_split.py └── model ├── test_cpu_inference.py ├── test_distributed_load_hf.py └── test_generate.py /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | Language: Cpp 3 | AccessModifierOffset: -1 4 | AlignAfterOpenBracket: Align 5 | AlignConsecutiveAssignments: false 6 | AlignConsecutiveDeclarations: false 7 | AlignEscapedNewlinesLeft: true 8 | AlignOperands: true 9 | AlignTrailingComments: true 10 | AllowAllParametersOfDeclarationOnNextLine: true 11 | AllowShortBlocksOnASingleLine: true 12 | AllowShortCaseLabelsOnASingleLine: true 13 | AllowShortFunctionsOnASingleLine: All 14 | AllowShortIfStatementsOnASingleLine: true 15 | AllowShortLoopsOnASingleLine: true 16 | AlwaysBreakAfterDefinitionReturnType: None 17 | AlwaysBreakAfterReturnType: None 18 | AlwaysBreakBeforeMultilineStrings: false 19 | AlwaysBreakTemplateDeclarations: true 20 | BinPackArguments: true 21 | BinPackParameters: true 22 | BraceWrapping: 23 | AfterClass: true 24 | AfterControlStatement: false 25 | AfterEnum: false 26 | AfterFunction: false 27 | AfterNamespace: false 28 | AfterObjCDeclaration: false 29 | AfterStruct: false 30 | AfterUnion: false 31 | BeforeCatch: false 32 | BeforeElse: false 33 | IndentBraces: false 34 | BreakBeforeBinaryOperators: NonAssignment 35 | BreakBeforeBraces: Attach 36 | BreakBeforeTernaryOperators: true 37 | BreakConstructorInitializersBeforeComma: false 38 | BreakAfterJavaFieldAnnotations: false 39 | BreakStringLiterals: true 40 | ColumnLimit: 100 41 | CommentPragmas: '^ IWYU pragma:' 42 | BreakBeforeInheritanceComma: false 43 | ConstructorInitializerAllOnOneLineOrOnePerLine: true 44 | ConstructorInitializerIndentWidth: 4 45 | ContinuationIndentWidth: 4 46 | Cpp11BracedListStyle: true 47 | 
DisableFormat: false 48 | ExperimentalAutoDetectBinPacking: false 49 | FixNamespaceComments: true 50 | ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ] 51 | IncludeCategories: 52 | - Regex: '^<.*\.h>' 53 | Priority: 1 54 | - Regex: '^<.*' 55 | Priority: 2 56 | - Regex: '.*' 57 | Priority: 3 58 | IncludeIsMainRegex: '([-_](test|unittest))?$' 59 | IndentCaseLabels: true 60 | IndentWidth: 2 61 | IndentWrappedFunctionNames: false 62 | JavaScriptQuotes: Leave 63 | JavaScriptWrapImports: true 64 | KeepEmptyLinesAtTheStartOfBlocks: false 65 | MacroBlockBegin: '' 66 | MacroBlockEnd: '' 67 | MaxEmptyLinesToKeep: 1 68 | NamespaceIndentation: None 69 | ObjCBlockIndentWidth: 2 70 | ObjCSpaceAfterProperty: false 71 | ObjCSpaceBeforeProtocolList: false 72 | PenaltyBreakBeforeFirstCallParameter: 1 73 | PenaltyBreakComment: 300 74 | PenaltyBreakFirstLessLess: 120 75 | PenaltyBreakString: 1000 76 | PenaltyExcessCharacter: 1000000 77 | PenaltyReturnTypeOnItsOwnLine: 200 78 | PointerAlignment: Right 79 | ReflowComments: true 80 | SortIncludes: false 81 | SpaceAfterCStyleCast: false 82 | SpaceAfterTemplateKeyword: false 83 | SpaceBeforeAssignmentOperators: true 84 | SpaceBeforeParens: ControlStatements 85 | SpaceInEmptyParentheses: false 86 | SpacesBeforeTrailingComments: 2 87 | SpacesInAngles: false 88 | SpacesInContainerLiterals: true 89 | SpacesInCStyleCastParentheses: false 90 | SpacesInParentheses: false 91 | SpacesInSquareBrackets: false 92 | Standard: Auto 93 | TabWidth: 8 94 | UseTab: Never 95 | ... 96 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | # Legacy codes 2 | .legacy/ 3 | .data/ 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | trace_result/ 10 | profile_result/ 11 | 12 | slurm_outs 13 | _data 14 | *.nfs* 15 | output 16 | logs 17 | 18 | # C extensions 19 | *.so 20 | 21 | # Distribution / packaging 22 | .Python 23 | build/ 24 | develop-eggs/ 25 | # dist/ 26 | downloads/ 27 | eggs/ 28 | .eggs/ 29 | lib/ 30 | lib64/ 31 | parts/ 32 | sdist/ 33 | var/ 34 | wheels/ 35 | share/python-wheels/ 36 | *.egg-info/ 37 | .installed.cfg 38 | *.egg 39 | MANIFEST 40 | 41 | # PyInstaller 42 | # Usually these files are written by a python script from a template 43 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
44 | *.manifest 45 | *.spec 46 | 47 | # Installer logs 48 | pip-log.txt 49 | pip-delete-this-directory.txt 50 | 51 | # Unit test / coverage reports 52 | htmlcov/ 53 | .tox/ 54 | .nox/ 55 | .coverage 56 | .coverage.* 57 | .cache 58 | nosetests.xml 59 | coverage.xml 60 | *.cover 61 | *.py,cover 62 | .hypothesis/ 63 | .pytest_cache/ 64 | cover/ 65 | 66 | # Translations 67 | *.mo 68 | *.pot 69 | 70 | # Django stuff: 71 | *.log 72 | local_settings.py 73 | db.sqlite3 74 | db.sqlite3-journal 75 | 76 | # Flask stuff: 77 | instance/ 78 | .webassets-cache 79 | 80 | # Scrapy stuff: 81 | .scrapy 82 | 83 | # Sphinx documentation 84 | docs/_build/ 85 | 86 | # PyBuilder 87 | .pybuilder/ 88 | target/ 89 | 90 | # Jupyter Notebook 91 | .ipynb_checkpoints 92 | 93 | # IPython 94 | profile_default/ 95 | ipython_config.py 96 | 97 | # pyenv 98 | # For a library or package, you might want to ignore these files since the code is 99 | # intended to run in multiple environments; otherwise, check them in: 100 | # .python-version 101 | 102 | # pipenv 103 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 104 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 105 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 106 | # install all needed dependencies. 107 | #Pipfile.lock 108 | 109 | # poetry 110 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 111 | # This is especially recommended for binary packages to ensure reproducibility, and is more 112 | # commonly ignored for libraries. 113 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 114 | #poetry.lock 115 | 116 | # pdm 117 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 118 | #pdm.lock 119 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 120 | # in version control. 121 | # https://pdm.fming.dev/#use-with-ide 122 | .pdm.toml 123 | 124 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 125 | __pypackages__/ 126 | 127 | # Celery stuff 128 | celerybeat-schedule 129 | celerybeat.pid 130 | 131 | # SageMath parsed files 132 | *.sage.py 133 | 134 | # Environments 135 | .env 136 | .venv 137 | env/ 138 | venv/ 139 | ENV/ 140 | env.bak/ 141 | venv.bak/ 142 | 143 | # Spyder project settings 144 | .spyderproject 145 | .spyproject 146 | 147 | # Rope project settings 148 | .ropeproject 149 | 150 | # mkdocs documentation 151 | /site 152 | 153 | # mypy 154 | .mypy_cache/ 155 | .dmypy.json 156 | dmypy.json 157 | 158 | # Pyre type checker 159 | .pyre/ 160 | 161 | # pytype static type analyzer 162 | .pytype/ 163 | 164 | # Cython debug symbols 165 | cython_debug/ 166 | 167 | # PyCharm 168 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 169 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 170 | # and can be added to the global gitignore or merged into this file. For a more nuclear 171 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
172 | #.idea/ 173 | 174 | # openai api key 175 | api_key.txt 176 | api_key.json 177 | 178 | ./*.sh 179 | *.png 180 | *.jpg 181 | *.pdf 182 | 183 | .vscode/ -------------------------------------------------------------------------------- /.github/workflows/format-check.yml: -------------------------------------------------------------------------------- 1 | name: Check Formatting 2 | 3 | on: [pull_request] 4 | 5 | jobs: 6 | check_formatting: 7 | runs-on: ubuntu-latest 8 | 9 | steps: 10 | - name: Checkout repository 11 | uses: actions/checkout@v3 12 | 13 | - name: Set up Python 14 | uses: actions/setup-python@v4 15 | with: 16 | python-version: '3.10' 17 | 18 | - name: Install Python dependencies 19 | run: | 20 | python3 -m pip install --upgrade pip 21 | pip install isort black clang-format 22 | 23 | - name: Check Python formatting with isort 24 | run: isort --check-only . 25 | 26 | - name: Check Python formatting with black 27 | run: black --check . 28 | 29 | - name: Check C++ formatting 30 | run: | 31 | find . -type f \( -name '*.c' -o -name '*.h' -o -name '*.cpp' -o -name '*.hpp' -o -name '*.cu' -o -name '*.cuh' \) -exec clang-format --dry-run --Werror {} + 32 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish Python Package 2 | 3 | on: 4 | release: 5 | types: [created] 6 | 7 | jobs: 8 | build: 9 | runs-on: ubuntu-latest 10 | 11 | steps: 12 | - name: Check out repository 13 | uses: actions/checkout@v3 14 | 15 | - name: Set up Python 16 | uses: actions/setup-python@v4 17 | with: 18 | python-version: '3.10' # Specify the Python version you need 19 | 20 | - name: Install dependencies 21 | run: | 22 | python3 -m pip install --upgrade pip 23 | python3 -m pip install -r requirements.txt 24 | 25 | - name: Build package 26 | run: python3 -m build -n --sdist 27 | 28 | - name: Publish to PyPI 29 | env: 30 | TWINE_USERNAME: __token__ 31 | TWINE_PASSWORD: ${{ secrets.PIP_TOKEN }} 32 | run: | 33 | python3 -m pip install twine 34 | twine upload dist/* -------------------------------------------------------------------------------- /.github/workflows/pytest.yml: -------------------------------------------------------------------------------- 1 | name: Run Pytest 2 | 3 | on: [pull_request] 4 | 5 | jobs: 6 | build: 7 | runs-on: ubuntu-latest 8 | 9 | steps: 10 | - name: Check out repository 11 | uses: actions/checkout@v3 12 | 13 | - name: Set up Python 14 | uses: actions/setup-python@v4 15 | with: 16 | python-version: '3.10' 17 | 18 | - name: Install dependencies 19 | run: | 20 | python3 -m pip install --upgrade pip 21 | python3 -m pip install -r requirements.txt 22 | python3 -m pip install pytest 23 | python3 -m pip install torch==2.3.1 24 | python3 -m pip install -e . 
--no-build-isolation 25 | 26 | - name: Run tests 27 | run: | 28 | pytest -m "not gpu" 29 | -------------------------------------------------------------------------------- /.github/workflows/sphinx.yml: -------------------------------------------------------------------------------- 1 | name: "Sphinx: Render docs" 2 | 3 | on: push 4 | 5 | jobs: 6 | build: 7 | runs-on: ubuntu-latest 8 | permissions: 9 | contents: write 10 | steps: 11 | - uses: actions/checkout@v4 12 | - name: Build HTML 13 | uses: garrett4wade/sphinx-action@master 14 | - name: Upload artifacts 15 | uses: actions/upload-artifact@v4 16 | with: 17 | name: html-docs 18 | path: docs/build/html/ 19 | - name: Deploy 20 | uses: peaceiris/actions-gh-pages@v3 21 | if: github.ref == 'refs/heads/main' 22 | with: 23 | github_token: ${{ secrets.GITHUB_TOKEN }} 24 | publish_dir: docs/build/html 25 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Legacy codes 2 | .legacy/ 3 | .data/ 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | trace_result/ 10 | profile_result/ 11 | 12 | slurm_outs 13 | _data 14 | *.nfs* 15 | output 16 | logs 17 | 18 | # C extensions 19 | *.so 20 | 21 | # Distribution / packaging 22 | .Python 23 | build/ 24 | develop-eggs/ 25 | dist/ 26 | downloads/ 27 | eggs/ 28 | .eggs/ 29 | lib/ 30 | lib64/ 31 | parts/ 32 | sdist/ 33 | var/ 34 | wheels/ 35 | share/python-wheels/ 36 | *.egg-info/ 37 | .installed.cfg 38 | *.egg 39 | MANIFEST 40 | 41 | # PyInstaller 42 | # Usually these files are written by a python script from a template 43 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 44 | *.manifest 45 | *.spec 46 | 47 | # Installer logs 48 | pip-log.txt 49 | pip-delete-this-directory.txt 50 | 51 | # Unit test / coverage reports 52 | htmlcov/ 53 | .tox/ 54 | .nox/ 55 | .coverage 56 | .coverage.* 57 | .cache 58 | nosetests.xml 59 | coverage.xml 60 | *.cover 61 | *.py,cover 62 | .hypothesis/ 63 | .pytest_cache/ 64 | cover/ 65 | 66 | # Translations 67 | *.mo 68 | *.pot 69 | 70 | # Django stuff: 71 | *.log 72 | local_settings.py 73 | db.sqlite3 74 | db.sqlite3-journal 75 | 76 | # Flask stuff: 77 | instance/ 78 | .webassets-cache 79 | 80 | # Scrapy stuff: 81 | .scrapy 82 | 83 | # Sphinx documentation 84 | docs/_build/ 85 | 86 | # PyBuilder 87 | .pybuilder/ 88 | target/ 89 | 90 | # Jupyter Notebook 91 | .ipynb_checkpoints 92 | 93 | # IPython 94 | profile_default/ 95 | ipython_config.py 96 | 97 | # pyenv 98 | # For a library or package, you might want to ignore these files since the code is 99 | # intended to run in multiple environments; otherwise, check them in: 100 | # .python-version 101 | 102 | # pipenv 103 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 104 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 105 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 106 | # install all needed dependencies. 107 | #Pipfile.lock 108 | 109 | # poetry 110 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 111 | # This is especially recommended for binary packages to ensure reproducibility, and is more 112 | # commonly ignored for libraries. 
113 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 114 | #poetry.lock 115 | 116 | # pdm 117 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 118 | #pdm.lock 119 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 120 | # in version control. 121 | # https://pdm.fming.dev/#use-with-ide 122 | .pdm.toml 123 | 124 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 125 | __pypackages__/ 126 | 127 | # Celery stuff 128 | celerybeat-schedule 129 | celerybeat.pid 130 | 131 | # SageMath parsed files 132 | *.sage.py 133 | 134 | # Environments 135 | .env 136 | .venv 137 | env/ 138 | venv/ 139 | ENV/ 140 | env.bak/ 141 | venv.bak/ 142 | 143 | # Spyder project settings 144 | .spyderproject 145 | .spyproject 146 | 147 | # Rope project settings 148 | .ropeproject 149 | 150 | # mkdocs documentation 151 | /site 152 | 153 | # mypy 154 | .mypy_cache/ 155 | .dmypy.json 156 | dmypy.json 157 | 158 | # Pyre type checker 159 | .pyre/ 160 | 161 | # pytype static type analyzer 162 | .pytype/ 163 | 164 | # Cython debug symbols 165 | cython_debug/ 166 | 167 | # PyCharm 168 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 169 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 170 | # and can be added to the global gitignore or merged into this file. For a more nuclear 171 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 172 | #.idea/ 173 | 174 | # openai api key 175 | api_key.txt 176 | api_key.json 177 | 178 | ./*.sh 179 | *.png 180 | *.pdf 181 | 182 | .vscode/ -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | Zhiyu Mei, meizy20@mails.tsinghua.edu.cn 2 | Wei Fu, fuwth17@gmail.com -------------------------------------------------------------------------------- /CHANGES: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openpsi-project/ReaLHF/be75fce9931acb9298270fdda08fdca46b6ee8ba/CHANGES -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | ARG REAL_CPU_BASE_IMAGE 2 | ARG REAL_GPU_BASE_IMAGE 3 | 4 | # >>>>>> CPU image 5 | FROM ${REAL_CPU_BASE_IMAGE} as cpu 6 | 7 | ENV DEBIAN_FRONTEND=noninteractive 8 | RUN apt update 9 | RUN apt install -y ca-certificates 10 | RUN sed -i "s@http://.*archive.ubuntu.com@https://mirrors.tuna.tsinghua.edu.cn@g" /etc/apt/sources.list 11 | RUN sed -i "s@http://.*security.ubuntu.com@https://mirrors.tuna.tsinghua.edu.cn@g" /etc/apt/sources.list 12 | RUN apt update 13 | RUN apt install -y net-tools python3-pip pkg-config libopenblas-base libopenmpi-dev git 14 | 15 | RUN pip3 install -U pip 16 | RUN pip3 config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple 17 | # Install PyTorch in advance to prevent rebuilding this large Docker layer. 18 | RUN pip3 install torch==2.3.1 19 | 20 | COPY ./requirements.txt /requirements.txt 21 | RUN pip3 install -r /requirements.txt && rm /requirements.txt 22 | 23 | COPY . 
/realhf 24 | RUN REAL_CUDA=0 pip3 install -e /realhf --no-build-isolation 25 | WORKDIR /realhf 26 | 27 | # >>>>>> Documentation images 28 | # FROM cpu AS docs-builder 29 | # RUN pip install -U sphinx sphinx-nefertiti -i https://pypi.tuna.tsinghua.edu.cn/simple 30 | # RUN sphinx-build -M html /realhf/docs/source/ /realhf/docs/build/ 31 | FROM nginx:alpine AS docs 32 | COPY ./docs/build/html /usr/share/nginx/html 33 | EXPOSE 80 34 | CMD ["nginx", "-g", "daemon off;"] 35 | 36 | # >>>>>> GPU image 37 | FROM ${REAL_GPU_BASE_IMAGE} AS gpu 38 | 39 | ENV DEBIAN_FRONTEND=noninteractive 40 | RUN apt update 41 | RUN apt install -y ca-certificates 42 | RUN sed -i "s@http://.*archive.ubuntu.com@https://mirrors.tuna.tsinghua.edu.cn@g" /etc/apt/sources.list 43 | RUN sed -i "s@http://.*security.ubuntu.com@https://mirrors.tuna.tsinghua.edu.cn@g" /etc/apt/sources.list 44 | RUN apt update 45 | RUN apt install -y net-tools \ 46 | libibverbs-dev librdmacm-dev ibverbs-utils \ 47 | rdmacm-utils python3-pyverbs opensm ibutils perftest 48 | 49 | RUN pip3 install -U pip 50 | RUN pip3 config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple 51 | 52 | # set environment variables for building transformer engine 53 | ENV NVTE_WITH_USERBUFFERS=1 NVTE_FRAMEWORK=pytorch MAX_JOBS=8 MPI_HOME=/usr/local/mpi 54 | ENV PATH="${PATH}:/opt/hpcx/ompi/bin:/opt/hpcx/ucx/bin" 55 | ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/hpcx/ompi/lib:/opt/hpcx/ucx/lib/" 56 | 57 | COPY ./requirements.txt /requirements.txt 58 | RUN pip3 install -r /requirements.txt && rm /requirements.txt 59 | 60 | # We don't use TransformerEngine's flash-attn integration, so it's okay to disrespect dependencies 61 | RUN pip3 install git+https://github.com/NVIDIA/TransformerEngine.git@v1.8 --no-deps --no-build-isolation 62 | RUN pip3 install flash-attn==2.4.2 --no-build-isolation 63 | # Install grouped_gemm for MoE acceleration 64 | RUN pip3 install git+https://github.com/tgale96/grouped_gemm.git@v0.1.4 --no-build-isolation --no-deps 65 | 66 | COPY . 
/realhf 67 | RUN REAL_CUDA=1 pip3 install -e /realhf --no-build-isolation 68 | WORKDIR /realhf -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include csrc *.cu *.cuh *.hpp *.cpp -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: docs 2 | 3 | docs: 4 | docker compose down 5 | cd docs && make html 6 | docker compose up --build -------------------------------------------------------------------------------- /assets/qrcode.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openpsi-project/ReaLHF/be75fce9931acb9298270fdda08fdca46b6ee8ba/assets/qrcode.jpg -------------------------------------------------------------------------------- /csrc/custom_all_reduce/pybind.cpp: -------------------------------------------------------------------------------- 1 | /* Copied from the vLLM project: https://github.com/vllm-project/vllm */ 2 | #include 3 | 4 | using fptr_t = uint64_t; 5 | fptr_t init_custom_ar(torch::Tensor &meta, torch::Tensor &rank_data, 6 | const std::vector &handles, const std::vector &offsets, 7 | int rank, bool full_nvlink); 8 | bool should_custom_ar(torch::Tensor &inp, int max_size, int world_size, bool full_nvlink); 9 | void all_reduce_reg(fptr_t _fa, torch::Tensor &inp, torch::Tensor &out); 10 | void all_reduce_unreg(fptr_t _fa, torch::Tensor &inp, torch::Tensor ®_buffer, 11 | torch::Tensor &out); 12 | void dispose(fptr_t _fa); 13 | int meta_size(); 14 | void register_buffer(fptr_t _fa, torch::Tensor &t, const std::vector &handles, 15 | const std::vector &offsets); 16 | std::pair, std::vector> get_graph_buffer_ipc_meta(fptr_t _fa); 17 | void register_graph_buffers(fptr_t _fa, const std::vector &handles, 18 | const std::vector> &offsets); 19 | 20 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 21 | // vLLM custom all-reduce kernels 22 | pybind11::module custom_ar = m.def_submodule("custom_ar", "custom allreduce"); 23 | custom_ar.def("init_custom_ar", &init_custom_ar, "init_custom_ar"); 24 | custom_ar.def("should_custom_ar", &should_custom_ar, "should_custom_ar"); 25 | custom_ar.def("all_reduce_reg", &all_reduce_reg, "all_reduce_reg"); 26 | custom_ar.def("all_reduce_unreg", &all_reduce_unreg, "all_reduce_unreg"); 27 | custom_ar.def("dispose", &dispose, "dispose"); 28 | custom_ar.def("meta_size", &meta_size, "meta_size"); 29 | custom_ar.def("register_buffer", ®ister_buffer, "register_buffer"); 30 | custom_ar.def("get_graph_buffer_ipc_meta", &get_graph_buffer_ipc_meta, 31 | "get_graph_buffer_ipc_meta"); 32 | custom_ar.def("register_graph_buffers", ®ister_graph_buffers, "register_graph_buffers"); 33 | } 34 | -------------------------------------------------------------------------------- /csrc/interval_op/interval_op.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | std::vector> merge_intervals( 5 | std::vector> intervals) { 6 | if (intervals.empty()) { return {}; } 7 | 8 | std::vector> merged; 9 | merged.push_back(intervals[0]); 10 | 11 | for (size_t i = 1; i < intervals.size(); ++i) { 12 | auto &lastInterval = merged.back(); 13 | const auto ¤tInterval = intervals[i]; 14 | 15 | if (lastInterval.second == currentInterval.first) { 16 | // Merge the intervals 17 | lastInterval.second = 
currentInterval.second; 18 | } else { 19 | // Add the current interval as it is 20 | merged.push_back(currentInterval); 21 | } 22 | } 23 | 24 | return merged; 25 | } 26 | 27 | PYBIND11_MODULE(interval_op, m) { 28 | m.def("merge_intervals", &merge_intervals, "Merge non-overlapping intervals."); 29 | } 30 | -------------------------------------------------------------------------------- /csrc/interval_op/interval_op.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #define CHECK_DEVICE(x) TORCH_CHECK(x.is_cuda(), #x " must be on CUDA") 7 | #define CHECK_SHAPE(x, ...) \ 8 | TORCH_CHECK(x.sizes() == torch::IntArrayRef({__VA_ARGS__}), \ 9 | #x " must have shape (" #__VA_ARGS__ ")") 10 | #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") 11 | 12 | template 13 | __global__ void copyDataKernel(T *dst, const T *src, long *dst_offsets, long *src_offsets, 14 | long *sizes, long N) { 15 | long interval_id = blockIdx.x * blockDim.x + threadIdx.x; 16 | if (interval_id >= N) { return; } 17 | long chunk_id = blockIdx.y * blockDim.y + threadIdx.y; 18 | long interval_size = sizes[interval_id]; 19 | long chunk_offset = chunk_id * chunk_size; 20 | if (chunk_offset >= interval_size) { return; } 21 | long dst_offset = dst_offsets[interval_id]; 22 | long src_offset = src_offsets[interval_id]; 23 | long _size = interval_size - chunk_offset; 24 | long size = (chunk_size < _size) ? chunk_size : _size; 25 | memcpy(dst + dst_offset + chunk_offset, src + src_offset + chunk_offset, size * sizeof(T)); 26 | } 27 | 28 | template 29 | void set_intervals(const at::Tensor src, at::Tensor dst, const at::Tensor intervals, 30 | int max_interval_size) { 31 | CHECK_DEVICE(src); 32 | CHECK_DEVICE(dst); 33 | CHECK_DEVICE(intervals); 34 | 35 | CHECK_CONTIGUOUS(src); 36 | CHECK_CONTIGUOUS(dst); 37 | CHECK_CONTIGUOUS(intervals); 38 | 39 | TORCH_CHECK(src.dtype() == dst.dtype(), 40 | "Source and destination tensors must have the same dtype"); 41 | 42 | TORCH_CHECK(intervals.dtype() == torch::kLong, "intervals must be of type long"); 43 | 44 | long N = intervals.size(0); 45 | CHECK_SHAPE(intervals, N, 2); 46 | 47 | at::Tensor interval_sizes = intervals.select(1, 1) - intervals.select(1, 0); 48 | at::Tensor dst_offsets = intervals.select(1, 0).contiguous(); 49 | at::Tensor src_offsets = interval_sizes.cumsum(0, at::kLong) - interval_sizes; 50 | 51 | // Launch CUDA kernel 52 | const int threads_per_block_x = 32; 53 | const int threads_per_block_y = 32; 54 | 55 | const int num_blocks_x = (N + threads_per_block_x - 1) / threads_per_block_x; 56 | 57 | const int n_chunks = (max_interval_size + chunk_size - 1) / chunk_size; 58 | const int num_blocks_y = (n_chunks + threads_per_block_y - 1) / threads_per_block_y; 59 | 60 | const dim3 numBlocks(num_blocks_x, num_blocks_y); 61 | const dim3 threadsPerBlock(threads_per_block_x, threads_per_block_y); 62 | 63 | copyDataKernel<<>>( 64 | dst.data_ptr(), src.data_ptr(), dst_offsets.data_ptr(), 65 | src_offsets.data_ptr(), interval_sizes.data_ptr(), N); 66 | } 67 | 68 | template 69 | at::Tensor slice_intervals(const at::Tensor src, const at::Tensor intervals, long total_size, 70 | int max_interval_size) { 71 | CHECK_DEVICE(src); 72 | CHECK_DEVICE(intervals); 73 | 74 | CHECK_CONTIGUOUS(src); 75 | CHECK_CONTIGUOUS(intervals); 76 | 77 | TORCH_CHECK(intervals.dtype() == torch::kLong, "intervals must be of type long"); 78 | 79 | long N = intervals.size(0); 80 | 
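// `intervals` is an (N, 2) tensor of long [start, end) offsets into the flattened `src`;
// the N slices are gathered and packed back-to-back into a contiguous `dst` of length
// `total_size`, using the exclusive prefix sum of the slice sizes as destination offsets.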
CHECK_SHAPE(intervals, N, 2); 81 | 82 | at::Tensor dst = at::empty({total_size}, src.options()); 83 | 84 | at::Tensor interval_sizes = intervals.select(1, 1) - intervals.select(1, 0); 85 | at::Tensor src_offsets = intervals.select(1, 0).contiguous(); 86 | at::Tensor dst_offsets = interval_sizes.cumsum(0, at::kLong) - interval_sizes; 87 | 88 | // Launch CUDA kernel 89 | const int threads_per_block_x = 32; 90 | const int threads_per_block_y = 32; 91 | 92 | const int num_blocks_x = (N + threads_per_block_x - 1) / threads_per_block_x; 93 | 94 | const int n_chunks = (max_interval_size + chunk_size - 1) / chunk_size; 95 | const int num_blocks_y = (n_chunks + threads_per_block_y - 1) / threads_per_block_y; 96 | 97 | const dim3 numBlocks(num_blocks_x, num_blocks_y); 98 | const dim3 threadsPerBlock(threads_per_block_x, threads_per_block_y); 99 | 100 | copyDataKernel<<>>( 101 | dst.data_ptr(), src.data_ptr(), dst_offsets.data_ptr(), 102 | src_offsets.data_ptr(), interval_sizes.data_ptr(), N); 103 | return dst; 104 | } 105 | 106 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 107 | m.def("set_intervals_fp32", &set_intervals, "Set intervals of a 1D tensor"); 108 | m.def("set_intervals_fp16", &set_intervals, "Set intervals of a 1D tensor"); 109 | m.def("set_intervals_bf16", &set_intervals, "Set intervals of a 1D tensor"); 110 | m.def("slice_intervals_fp32", &slice_intervals, "slice intervals of a 1D tensor"); 111 | m.def("slice_intervals_fp16", &slice_intervals, "slice intervals of a 1D tensor"); 112 | m.def("slice_intervals_bf16", &slice_intervals, 113 | "slice intervals of a 1D tensor"); 114 | } 115 | -------------------------------------------------------------------------------- /csrc/search/device_mesh.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | // DeviceMesh::DeviceMesh() 6 | // : device_mesh_name(""), n_nodes(0), n_gpus(0), node_names({}), gpu_ids({}) { 7 | // }; 8 | 9 | DeviceMesh::DeviceMesh(int n_nodes, int n_gpus_per_node, std::vector> mapping, 10 | std::string global_mesh_name, std::string name) 11 | : n_nodes(n_nodes), 12 | n_gpus_per_node(n_gpus_per_node), 13 | mapping(mapping), 14 | global_mesh_name(global_mesh_name), 15 | name(name) { 16 | assert(n_nodes == static_cast(mapping.size())); 17 | for (int i = 0; i < n_nodes; i++) { 18 | assert(n_gpus_per_node == static_cast(mapping[i].size())); 19 | } 20 | }; 21 | 22 | bool is_all_overlap(std::vector device_meshes, DeviceMesh device_mesh) { 23 | for (DeviceMesh *other : device_meshes) { 24 | if (!device_mesh.overlap(*other)) return false; 25 | } 26 | return true; 27 | }; 28 | 29 | bool is_all_overlap(std::unordered_set device_meshes, DeviceMesh device_mesh) { 30 | for (DeviceMesh *other : device_meshes) { 31 | if (!device_mesh.overlap(*other)) return false; 32 | } 33 | return true; 34 | }; 35 | 36 | bool DeviceMesh::contain(const DeviceMesh &other) { 37 | // check whether one device mapping is contained by another by 38 | // checking 1. whether global_mesh_name is identical 39 | // 2. 
whether mapping of one device mesh is contained by the other one 40 | if (global_mesh_name != other.global_mesh_name) return false; 41 | for (int i = 0; i < n_nodes; i++) { 42 | for (int j = 0; j < n_gpus_per_node; j++) { 43 | if (mapping[i][j] == 0 && other.mapping[i][j] == 1) return false; 44 | } 45 | } 46 | return true; 47 | }; 48 | 49 | bool DeviceMesh::contained_by(const DeviceMesh &other) { 50 | if (global_mesh_name != other.global_mesh_name) return false; 51 | for (int i = 0; i < n_nodes; i++) { 52 | for (int j = 0; j < n_gpus_per_node; j++) { 53 | if (mapping[i][j] == 1 && other.mapping[i][j] == 0) return false; 54 | } 55 | } 56 | return true; 57 | }; 58 | 59 | bool DeviceMesh::overlap(const DeviceMesh &other) { 60 | if (global_mesh_name != other.global_mesh_name) return false; 61 | for (int i = 0; i < n_nodes; i++) { 62 | for (int j = 0; j < n_gpus_per_node; j++) { 63 | if (mapping[i][j] == 1 && other.mapping[i][j] == 1) return true; 64 | } 65 | } 66 | return false; 67 | }; 68 | 69 | ModelParallelStrategy::ModelParallelStrategy(int num_pp, int num_dp, int num_mp) 70 | : num_pp(num_pp), num_dp(num_dp), num_mp(num_mp) {}; 71 | 72 | bool ModelParallelStrategy::operator==(const ModelParallelStrategy &other) const { 73 | return num_pp == other.num_pp && num_dp == other.num_dp && num_mp == other.num_mp; 74 | }; 75 | 76 | bool DeviceMesh::operator==(const DeviceMesh &other) const { 77 | return name == other.name && global_mesh_name == other.global_mesh_name; 78 | }; 79 | 80 | std::string ModelParallelStrategy::to_string() { 81 | return "num_pp:" + std::to_string(num_pp) + ";" + "num_dp:" + std::to_string(num_dp) + ";" 82 | + "num_mp:" + std::to_string(num_mp); 83 | }; 84 | 85 | std::string ModelParallelStrategy::to_key() { 86 | return std::to_string(num_pp) + "," + std::to_string(num_mp) + "," + std::to_string(num_dp); 87 | } -------------------------------------------------------------------------------- /csrc/search/device_mesh.hpp: -------------------------------------------------------------------------------- 1 | #ifndef DEVICE_MESH_HPP 2 | #define DEVICE_MESH_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | // #include 9 | 10 | class RPCInstance; 11 | 12 | class DeviceMesh { 13 | public: 14 | int n_nodes; 15 | int n_gpus_per_node; 16 | std::vector> mapping; 17 | std::string global_mesh_name; 18 | std::string name; 19 | RPCInstance *pre_task = nullptr; 20 | 21 | // DeviceMesh(); 22 | DeviceMesh(int n_nodes, int n_gpus_per_node, std::vector> mapping, 23 | std::string global_mesh_name, std::string name); 24 | 25 | bool overlap(const DeviceMesh &other); 26 | bool contain(const DeviceMesh &other); 27 | bool contained_by(const DeviceMesh &other); 28 | 29 | bool operator==(const DeviceMesh &other) const; 30 | }; 31 | 32 | bool is_all_overlap(std::vector device_meshes, DeviceMesh device_mesh); 33 | bool is_all_overlap(std::unordered_set device_meshes, DeviceMesh device_mesh); 34 | 35 | class ModelParallelStrategy { 36 | public: 37 | int num_pp, num_dp, num_mp; 38 | 39 | ModelParallelStrategy(int num_pp, int num_dp, int num_mp); 40 | 41 | bool operator==(const ModelParallelStrategy &other) const; 42 | 43 | std::string to_string(); 44 | std::string to_key(); 45 | }; 46 | 47 | class ModelDeviceMapping {}; 48 | 49 | #endif // DEVICE_MESH_HPP -------------------------------------------------------------------------------- /csrc/search/rpc.hpp: -------------------------------------------------------------------------------- 1 | #ifndef RPC_HPP 2 | #define RPC_HPP 3 | 4 | #include 
5 | #include 6 | #include 7 | #include 8 | 9 | class CommStats { 10 | public: 11 | uint64_t local_send, local_recv, remote_send, remote_recv, offload_store, offload_load; 12 | 13 | CommStats(uint64_t local_send, uint64_t local_recv, uint64_t remote_send, uint64_t remote_recv, 14 | uint64_t offload_store, uint64_t offload_load); 15 | }; 16 | 17 | class RPC { 18 | public: 19 | std::string model_name; 20 | std::string rpc_name; 21 | // interface_type: 0=generate, 1=train_step, 2=inference 22 | std::string interface_type; 23 | 24 | RPC(std::string model_name, std::string rpc_name, std::string interface_type); 25 | }; 26 | 27 | class RPCExecution { 28 | public: 29 | RPC *rpc_ptr; 30 | DeviceMesh &device_mesh; 31 | ModelParallelStrategy &model_parallel_strategy; 32 | uint64_t time_cost, mem, static_mem; 33 | 34 | RPCExecution(RPC *rpc_ptr, DeviceMesh &device_mesh, 35 | ModelParallelStrategy &model_parallel_strategy, uint64_t time_cost, uint64_t mem, 36 | uint64_t static_mem); 37 | 38 | std::string to_string(); 39 | }; 40 | 41 | class OverlapGroup { 42 | public: 43 | std::unordered_set rpc_executions; 44 | std::unordered_set device_meshes; 45 | uint64_t mem_static; 46 | uint64_t mem_active; 47 | 48 | bool maybe_add(RPCExecution *rpc_exe); 49 | }; 50 | 51 | class DeviceMeshGroup { 52 | public: 53 | // std::string device_mesh_name; 54 | std::vector overlap_groups; 55 | 56 | void add_to_groups(RPCExecution *rpc_exe); 57 | }; 58 | 59 | class GroupedRPCExecutions { 60 | public: 61 | // std::unordered_map dn_to_group; 62 | DeviceMeshGroup group; 63 | 64 | void add(RPCExecution *rpc_exe); 65 | void resolve(RPCExecution *rpc_exe); 66 | void offload(std::string model_name); 67 | uint64_t total_mem_cost(); 68 | }; 69 | 70 | class RPCInstance { 71 | public: 72 | RPC *rpc_ptr; 73 | int id; 74 | std::string name; 75 | std::vector children; 76 | std::vector parents; 77 | std::vector tmp_children; 78 | std::vector tmp_parents; 79 | std::vector tmp_ris; // pointers to tmp rpc instances 80 | std::vector tmp_exes; // pointers to tmp rpc executions 81 | 82 | RPCExecution *rpc_exe_ptr = nullptr; 83 | RPCExecution *param_sync_rpc_exe_ptr = nullptr; 84 | bool param_sync = false; 85 | uint64_t param_sync_size = 0; 86 | bool offload = false; 87 | uint64_t offload_size = 0; 88 | 89 | RPCInstance(RPC *rpc_ptr, int id, std::string name); 90 | 91 | uint64_t ready_time = 0, start_time = 0, end_time = 0; 92 | 93 | void remove_parent(RPCInstance *parent); 94 | void remove_child(RPCInstance *child); 95 | void add_parent(RPCInstance *parent); 96 | void add_child(RPCInstance *child); 97 | 98 | void add_tmp_parent(RPCInstance *parent); 99 | void add_tmp_child(RPCInstance *child); 100 | void remove_tmp_parent(RPCInstance *parent); 101 | void remove_tmp_child(RPCInstance *child); 102 | 103 | void resolve_parameter_sync(std::vector tmp_graph, 104 | std::unordered_map &cost_table); 105 | // void resolve_offload(std::vector tmp_graph, 106 | // CommStats& comm_stats); 107 | }; 108 | 109 | uint64_t parameter_sync_cost(uint64_t param_size_bytes, RPCExecution *src, RPCExecution *dst, 110 | std::unordered_map &cost_table); 111 | 112 | uint64_t remote_param_sync_size(uint64_t size, RPCExecution *src, RPCExecution *dst); 113 | 114 | // class ModelConfig { 115 | // std::string model_name; 116 | // uint64_t param_size_bytes; 117 | 118 | // ModelConfig(std::string model_name, uint64_t param_size_bytes); 119 | // }; 120 | 121 | #endif -------------------------------------------------------------------------------- /csrc/search/simulate.hpp: 
-------------------------------------------------------------------------------- 1 | #ifndef SIMULATE_HPP 2 | #define SIMULATE_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | class SimulateResult { 10 | public: 11 | uint64_t end_time; 12 | bool oom; 13 | uint64_t mem_cost; 14 | std::vector index; 15 | std::vector rpc_exe_list; 16 | double used_time = 0; 17 | 18 | SimulateResult(); 19 | 20 | SimulateResult(uint64_t end_time, bool oom, uint64_t mem_cost, std::vector &index); 21 | 22 | SimulateResult &operator=(const SimulateResult &other); 23 | }; 24 | 25 | SimulateResult simulate( 26 | std::vector &graph, std::unordered_map &cost_table, 27 | std::unordered_map &model_sizes, 28 | std::unordered_map &rpc_table, 29 | std::unordered_map> &rpc_exe_table, 30 | std::unordered_map> &ri_table, 31 | std::unordered_map> &model_name_ri_table, 32 | std::vector &sorted_rpc_names, std::vector &index); 33 | 34 | // Comparator for priority queue 35 | struct CompareEndTime { 36 | bool operator()(SimulateResult const &r1, SimulateResult const &r2) { 37 | // We want largest end_time at the top of the queue, so we reverse the comparison 38 | return r1.end_time < r2.end_time; 39 | } 40 | }; 41 | 42 | class MinEndTimeQueue { 43 | public: 44 | MinEndTimeQueue(int capacity) : k(capacity) {} 45 | 46 | void insert(SimulateResult r) { 47 | if (queue.size() < k) { 48 | // std::cout << "push " << "end_time: " << r.end_time << " qsize " << queue.size() << 49 | // std::endl; 50 | queue.push(r); 51 | } else if (r.end_time < queue.top().end_time) { 52 | // std::cout << "push " << "end_time: " << r.end_time << " qsize " << queue.size() << 53 | // std::endl; 54 | queue.pop(); 55 | queue.push(r); 56 | } 57 | } 58 | 59 | std::priority_queue, CompareEndTime> &getQueue() { 60 | return queue; 61 | } 62 | 63 | private: 64 | std::priority_queue, CompareEndTime> queue; 65 | int k; 66 | }; 67 | 68 | void mergeMinEndTimeQueues(MinEndTimeQueue &target, MinEndTimeQueue &q1); 69 | 70 | class CompareReadyTime { 71 | public: 72 | bool operator()(RPCInstance *r1, RPCInstance *r2) { return r1->ready_time > r2->ready_time; } 73 | }; 74 | 75 | #endif // SIMULATE_HPP -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | web: 4 | build: 5 | context: . 6 | dockerfile: Dockerfile 7 | target: docs 8 | args: 9 | REAL_CPU_BASE_IMAGE: ubuntu:22.04 10 | REAL_GPU_BASE_IMAGE: nvcr.io/nvidia/pytorch:23.10-py3 11 | ports: 12 | - "7780:80" -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/source/_static/custom.css: -------------------------------------------------------------------------------- 1 | table { 2 | width: 100%; 3 | border-collapse: collapse; 4 | } 5 | 6 | table th, table td { 7 | text-align: center; 8 | vertical-align: middle; 9 | } -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # For the full list of built-in configuration values, see the documentation: 4 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 5 | 6 | # -- Project information ----------------------------------------------------- 7 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. 
11 | import pathlib 12 | import sys 13 | 14 | sys.path.insert(0, (pathlib.Path(__file__).parents[2] / "realhf").resolve().as_posix()) 15 | 16 | project = "ReaL" 17 | copyright = "2024, Wei Fu & Zhiyu Mei" 18 | author = "Wei Fu & Zhiyu Mei" 19 | release = "0.3.0" 20 | 21 | # -- General configuration --------------------------------------------------- 22 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration 23 | 24 | extensions = [] 25 | 26 | templates_path = ["_templates"] 27 | exclude_patterns = [] 28 | 29 | # -- Options for HTML output ------------------------------------------------- 30 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output 31 | 32 | html_theme = "sphinx_nefertiti" 33 | html_static_path = ["_static"] 34 | 35 | extensions = [ 36 | "sphinx.ext.duration", 37 | "sphinx.ext.doctest", 38 | "sphinx.ext.autodoc", 39 | "sphinx.ext.autosummary", 40 | "sphinx.ext.githubpages", 41 | ] 42 | 43 | 44 | def setup(app): 45 | app.add_css_file("custom.css") 46 | -------------------------------------------------------------------------------- /docs/source/contributing.rst: -------------------------------------------------------------------------------- 1 | ############## 2 | Contributing 3 | ############## 4 | 5 | .. 6 | This repository is developed and maintained by `Wei Fu `_ 7 | 8 | .. 9 | and `Zhiyu Mei `_, both of whom are 10 | 11 | .. 12 | PhD students at `IIIS, Tsinghua University `_ 13 | 14 | .. 15 | advised by Professor `Yi Wu `_. 16 | 17 | .. 18 | We acknowledge that due to limited time and resources, 19 | 20 | .. 21 | the quality of the documentation and code in this repository is not very high. 22 | 23 | .. 24 | As a result, it can be quite challenging for potential developers to 25 | 26 | .. 27 | read the code and contribute new features. 28 | 29 | If you wish to contribute to this repository or have any questions about 30 | the code, please do not hesitate to raise issues or contact us directly. 31 | We will do our best to assist you. Currently, there is no template for 32 | issues or pull requests. 33 | 34 | We hope the open-source community can help improve this repository and 35 | enable RLHF technology to truly empower the applications of LLM. 36 | 37 | *************** 38 | Documentation 39 | *************** 40 | 41 | The source code is documented using Sphinx in the ``docs`` folder. On a 42 | node with docker-compose installed, run 43 | 44 | .. code:: bash 45 | 46 | make docs 47 | 48 | Then the documentation will be available at ``http://localhost:7780``. 49 | 50 | Every time the documentation files are changed, you should run the above 51 | command to update the documentation. 52 | 53 | The GitHub Pages will be updated automatically after the PR is merged. 54 | 55 | ************ 56 | Formatting 57 | ************ 58 | 59 | .. code:: bash 60 | 61 | # For .py files 62 | docformatter -i ${FILE} && isort ${FILE} && black -q ${FILE} 63 | # For C/C++ files 64 | clang-format -i ${FILE} 65 | # For documentation 66 | rstfmt docs 67 | 68 | ********* 69 | Testing 70 | ********* 71 | 72 | .. code:: bash 73 | 74 | # Run CPU tests 75 | pytest -m "not gpu" 76 | # Run CPU tests and GPU tests that require a single GPU 77 | pytest -m "not distributed" 78 | # On a node with multiple GPUs, run all tests 79 | pytest 80 | 81 | ************************ 82 | Building Docker Images 83 | ************************ 84 | 85 | .. 
code:: bash 86 | 87 | # Build the GPU image 88 | docker build -t real-gpu:24.03-0.3.0 -f Dockerfile --target gpu --build-arg REAL_GPU_BASE_IMAGE=nvcr.io/nvidia/pytorch:24.03-py3 --build-arg REAL_CPU_BASE_IMAGE=ubuntu:22.04 . 89 | # Build the CPU image 90 | docker build -t real-cpu:22.04-0.3.0 -f Dockerfile --target cpu --build-arg REAL_GPU_BASE_IMAGE=nvcr.io/nvidia/pytorch:24.03-py3 --build-arg REAL_CPU_BASE_IMAGE=ubuntu:22.04 . 91 | -------------------------------------------------------------------------------- /docs/source/expconfig.rst: -------------------------------------------------------------------------------- 1 | ################ 2 | Configurations 3 | ################ 4 | 5 | .. note:: 6 | 7 | This page serves as a reference manual for the configuration objects, 8 | i.e., you can check which attributes can be modified and their 9 | default values. You don't need to read through this page before 10 | running experiments! 11 | 12 | Please check the :doc:`quickstart` and :doc:`customization` sections 13 | for concrete examples of running experiments. 14 | 15 | We illustrate configurations for quickstart experiments in this page. 16 | Each type of experiment (e.g., SFT, PPO) corresponds to a specific 17 | configuration object (e.g., :class:`realhf.SFTConfig` for SFT). 18 | 19 | Since ReaL uses `Hydra `_ for configuration 20 | management, users can override these options provided by the class 21 | recursively with command line arguments. 22 | 23 | .. currentmodule:: realhf 24 | 25 | *************************** 26 | Experiment Configurations 27 | *************************** 28 | 29 | .. autoclass:: ExperimentSaveEvalControl 30 | 31 | .. autoclass:: CommonExperimentConfig 32 | 33 | .. autoclass:: SFTConfig 34 | 35 | .. autoclass:: RWConfig 36 | 37 | .. autoclass:: DPOConfig 38 | 39 | .. autoclass:: GenerationHyperparameters 40 | 41 | .. autoclass:: PPOHyperparameters 42 | 43 | .. autoclass:: PPOConfig 44 | 45 | .. autoclass:: GenerationConfig 46 | 47 | ********************** 48 | Model Configurations 49 | ********************** 50 | 51 | .. autoclass:: ModelFamily 52 | 53 | .. autoclass:: ModelTrainEvalConfig 54 | 55 | .. autoclass:: OptimizerConfig 56 | 57 | .. autoclass:: ParallelismConfig 58 | 59 | .. autoclass:: MFCConfig 60 | 61 | .. autoclass:: ReaLModelConfig 62 | 63 | ************************ 64 | Dataset Configurations 65 | ************************ 66 | 67 | .. autoclass:: PromptAnswerDatasetConfig 68 | 69 | .. autoclass:: PairedComparisonDatasetConfig 70 | 71 | .. autoclass:: PromptOnlyDatasetConfig 72 | 73 | ******************************************** 74 | Data Structure for Interfaces and Datasets 75 | ******************************************** 76 | 77 | .. autoclass:: realhf.SequenceSample 78 | :members: 79 | 80 | **************** 81 | Dataflow Graph 82 | **************** 83 | 84 | .. autoclass:: realhf.MFCDef 85 | 86 | ***************************** 87 | System-Level Configurations 88 | ***************************** 89 | 90 | .. note:: 91 | 92 | These configurations are not supposed to be modified by users. They 93 | are used to help understand the code architecture of ReaL. 94 | 95 | .. autoclass:: realhf.ModelShardID 96 | 97 | .. autoclass:: realhf.ModelName 98 | 99 | .. autoclass:: realhf.ModelVersion 100 | 101 | .. autoclass:: realhf.Model 102 | 103 | .. autoclass:: realhf.ModelBackend 104 | :members: 105 | :undoc-members: _initialize 106 | 107 | .. autoclass:: realhf.PipelinableEngine 108 | :members: 109 | :undoc-members: 110 | 111 | .. 
autoclass:: realhf.ModelInterface 112 | :members: 113 | :undoc-members: 114 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. 2 | ReaL documentation master file, created by 3 | sphinx-quickstart on Mon Jun 10 10:57:12 2024. 4 | You can adapt this file completely to your liking, but it should at least 5 | contain the root `toctree` directive. 6 | 7 | ################################## 8 | Welcome to ReaL's documentation! 9 | ################################## 10 | 11 | ***************** 12 | 🚀 Get Started 🚀 13 | ***************** 14 | 15 | For users new to ReaL, we recommend starting with the :doc:`quickstart` 16 | section to learn how to run simple experiments on a local node. If you 17 | have multiple nodes available, please read the :doc:`distributed` 18 | section to learn how to run experiments on a cluster. These tutorials 19 | cover the basic usage of the implemented algorithms in ReaL, including 20 | SFT, Reward Modeling, DPO, and PPO, and do not require understanding the 21 | code structure. 22 | 23 | For advanced users, we recommend proceeding to the :doc:`customization` 24 | section to learn how to customize the algorithms and models in ReaL. 25 | This requires an understanding of how an algorithm and its experiment 26 | configuration are defined in ReaL (i.e., as a dataflow graph), but 27 | understanding the system-wide implementation (e.g., model workers) is 28 | not mandatory. 29 | 30 | For potential developers, please refer to the :doc:`impl` and the 31 | :doc:`arch` sections for a deeper understanding of the system 32 | architecture. 33 | 34 | Besides these illustrations, we present the reference manual of various 35 | configuration objects in the :doc:`expconfig` section, and a brief 36 | overview of the system architecture in the :doc:`intro` section. 37 | 38 | ************** 39 | ⭐ Contents ⭐ 40 | ************** 41 | 42 | .. toctree:: 43 | :maxdepth: 3 44 | 45 | intro 46 | install 47 | expconfig 48 | quickstart 49 | distributed 50 | customization 51 | impl 52 | arch 53 | 54 | contributing 55 | -------------------------------------------------------------------------------- /docs/source/install.rst: -------------------------------------------------------------------------------- 1 | ############## 2 | Installation 3 | ############## 4 | 5 | *************** 6 | Docker Images 7 | *************** 8 | 9 | The easiest way to run ReaL is by using the provided Docker images. We 10 | offer a CPU-only image for launching experiments and a runtime GPU image 11 | for deployment in a cluster. The Dockerfile is also available in the 12 | repository. 13 | 14 | To pull the images, run: 15 | 16 | .. code:: console 17 | 18 | $ docker pull docker.io/garrett4wade/real-cpu:22.04-0.3.0 19 | $ docker pull docker.io/garrett4wade/real-gpu:24.03-py3-0.3.0 20 | 21 | The CPU image is built from "ubuntu:22.04" and the GPU image is built 22 | from "nvcr.io/nvidia/pytorch:24.03-py3". You can check the latest docker 23 | image version `here 24 | `_. 25 | 26 | After pulling the Docker images, run your Docker container locally on a 27 | GPU node with the following command: 28 | 29 | .. 
code:: console 30 | 31 | $ docker run -it --rm --gpus all --mount type=bind,src=/path/outside/container,dst=/realhf garrett4wade/real-gpu:24.03-py3-0.3.0 bash 32 | 33 | There is an editable installation at ``/realhf`` inside the container, 34 | so your changes to the code outside the container should automatically 35 | take effect. 36 | 37 | ***************************** 38 | Install From PyPI or Source 39 | ***************************** 40 | 41 | If you prefer not to use the provided Docker image, you can also start 42 | with an image provided by NVIDIA (e.g., 43 | ``nvcr.io/nvidia/pytorch:24.03-py3``) and install ReaL from PyPI or from 44 | the source. 45 | 46 | .. note:: 47 | 48 | We don't upload a pre-built wheel to PyPI, so the installation will 49 | require compiling the C++ and CUDA extensions. Control whether to 50 | install the extensions with environment variables ``REAL_CUDA`` and 51 | ``REAL_NO_EXT``. 52 | 53 | The CUDA extension will be installed only if ``REAL_CUDA`` is set to 54 | 1. No extension will be installed if ``REAL_NO_EXT`` is set to 1. 55 | 56 | If you don't want to compile the extensions, please use the provided 57 | Docker images. 58 | 59 | First, clone the repository and install all dependencies: 60 | 61 | .. code:: console 62 | 63 | $ pip install -U pip 64 | $ git clone https://github.com/openpsi-project/ReaLHF 65 | $ cd ReaLHF 66 | $ pip install -r requirements.txt 67 | 68 | On a GPU machine, also install the required runtime packages: 69 | 70 | .. code:: console 71 | 72 | $ export MAX_JOBS=8 # Set the number of parallel jobs for compilation. 73 | $ pip install git+https://github.com/NVIDIA/TransformerEngine.git@v1.8 --no-deps --no-build-isolation 74 | $ pip install flash_attn==2.4.2 --no-build-isolation 75 | $ pip3 install git+https://github.com/tgale96/grouped_gemm.git@v0.1.4 --no-build-isolation --no-deps # For MoE 76 | 77 | .. note:: 78 | 79 | ``MAX_JOBS`` sets the number of parallel jobs for compilation. A 80 | larger value will consume more memory (and potentially cause 81 | out-of-memory errors) and CPU resources. Adjust the value according to your 82 | machine's specifications. 83 | 84 | Install ReaLHF from source (recommended, for the latest build): 85 | 86 | .. code:: console 87 | 88 | $ git clone https://github.com/openpsi-project/ReaLHF 89 | $ cd ReaLHF 90 | $ REAL_CUDA=1 pip install -e . --no-build-isolation 91 | 92 | Or install from PyPI (for stable build): 93 | 94 | .. code:: console 95 | 96 | $ REAL_CUDA=1 pip install realhf --no-build-isolation 97 | 98 | The PyPI package allows you to launch existing experiments with the 99 | quickstart command. If you want to modify the code, you must clone the 100 | source code and install it from source. 101 | 102 | Next, check :doc:`quickstart` for instructions on running experiments. 103 | -------------------------------------------------------------------------------- /docs/source/intro.rst: -------------------------------------------------------------------------------- 1 | ############## 2 | Introduction 3 | ############## 4 | 5 | ********************************* 6 | Limitations of Existing Systems 7 | ********************************* 8 | 9 | We observe two major limitations based on our profiling of the previous 10 | RLHF systems, as shown in the :ref:`timeline`. 11 | 12 | .. _timeline: 13 | 14 | .. figure:: images/timeline.svg 15 | :alt: timeline 16 | 17 | Timeline Figure 18 | 19 | Execution timelines of ReaL and existing systems based on profiling.
20 | 21 | First, when models are distributed to every GPU node that applies the 22 | same parallelization strategy, such as in `DeepSpeed-Chat 23 | `_, 24 | it is often over-parallelized. Over-parallelization leads to substantial 25 | synchronization and communication overhead (the light purple bars). 26 | 27 | An alternative way is to assign different models to different GPU nodes, 28 | where models can execute concurrently, such as `OpenRLHF 29 | `_. However, our second 30 | observation is that such asymmetric parallelization often causes 31 | under-utilization of the GPUs (e.g., the gray areas) because of the 32 | dependencies between tasks. 33 | 34 | The key idea of ReaL is to enable dynamic **reallocation of model 35 | parameters** between GPUs to improve the efficiency of the entire RLHF 36 | training process. 37 | 38 | By first choosing a parallelization strategy tailored for each 39 | computation workload (e.g., pipelining for Generation and tensor 40 | parallelism for Training) and then executing these calls concurrently 41 | with a smaller parallelization degree (e.g., Actor and Critic in 42 | Training), we can eliminate redundant communication while maximizing GPU 43 | utilization, effectively addressing the limitations of prior solutions. 44 | 45 | ************************ 46 | Performance Comparison 47 | ************************ 48 | 49 | We show throughput comparison with the state-of-the-art open-source 50 | systems in the following figure. 51 | 52 | (In the following figure, as the number of GPUs increases, the model 53 | size scales up from LLaMA 7B, LLaMA 13B, and CodeLLaMA 34B, to the 54 | largest LLaMA 70B.) 55 | 56 | .. image:: images/vws.svg 57 | 58 | .. _est_time_table: 59 | 60 | +--------------+---------------+---------------+---------------+ 61 | | System | DeepSpeedChat | OpenRLHF | ReaL | 62 | +==============+===============+===============+===============+ 63 | | Time (hours) | 141.5 | 152.8 | **17.0** | 64 | +--------------+---------------+---------------+---------------+ 65 | 66 | We also show the estimated time for completing the entire full-scale 67 | 4*70B RLHF training process, composed of 4 iterations with 400 steps for 68 | each iteration as for LLaMA-2. 69 | 70 | .. 71 | "Scale Actor" maintains the sizes 72 | 73 | .. 74 | of Critic and Reward at 7B while increasing the sizes of Actor and Reference with the number of GPUs. 75 | 76 | .. 77 | "Scale Critic" follows the opposite approach, and 78 | 79 | .. 80 | "Scale Both" increases sizes of all models proportionately. 
81 | -------------------------------------------------------------------------------- /examples/cluster_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "cluster_type": "slurm", 3 | "cluster_name": "my_cluster", 4 | "fileroot": "/path/to/my/file/system", 5 | "default_mount": "/path/to/my/file/system:/path/to/my/file/system,/dev/infiniband:/dev/infiniband,/sys/class/infiniband_verbs:/sys/class/infiniband_verbs", 6 | "node_type_from_node_name": { 7 | "NODE\\d{2}$": "a100" 8 | }, 9 | "gpu_type_from_node_name": { 10 | "NODE\\d{2}$": "tesla" 11 | }, 12 | "cpu_image": "garrett4wade/real-cpu", 13 | "gpu_image": "garrett4wade/real-gpu", 14 | "node_name_prefix": "NODE" 15 | } -------------------------------------------------------------------------------- /examples/customized_exp/ppo_ref_ema.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import dataclasses 3 | import math 4 | import pprint 5 | from typing import * 6 | 7 | from realhf.api.core.dfg import ParamReallocHook 8 | from realhf.api.core.system_api import ExperimentConfig 9 | from realhf.api.quickstart.entrypoint import register_quickstart_exp 10 | from realhf.apps.quickstart import main 11 | from realhf.experiments.common.ppo_exp import PPOConfig 12 | from realhf.experiments.common.utils import resolve_replica_ids, resolve_rpc_hooks 13 | 14 | 15 | @dataclasses.dataclass 16 | class PPORefEMAConfig(PPOConfig): 17 | ref_ema_eta: float = 0.001 18 | 19 | def initial_setup(self) -> ExperimentConfig: 20 | rpc_allocs = self._get_rpc_allocations() 21 | 22 | resolve_replica_ids(rpc_allocs) 23 | resolve_rpc_hooks( 24 | rpc_allocs, self.models 25 | ) # inplace modify MFCDefs in rpc allocations 26 | 27 | pprint.pprint(rpc_allocs) 28 | 29 | ######### The main difference from normal PPO ######### 30 | def _find_rpc(name): 31 | return next(alloc.rpc for alloc in rpc_allocs if alloc.rpc.name == name) 32 | 33 | # Remove the offload hook of ref_inf, because 34 | # we need to receive parameters from peer GPUs and update it immediately. 35 | ref_inf = _find_rpc("ref_inf") 36 | ref_inf._post_hooks = [] 37 | 38 | # Add an unidirectional parameter reallocation hook. 
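        # Conceptually (not shown in this file), the hook asks ReaL to push the
        # freshly updated actor parameters to the reference model after every
        # `actor_train` step. With a non-null `eta`, the receiving side blends the
        # incoming weights rather than overwriting them, roughly
        #     ref_param <- (1 - eta) * ref_param + eta * actor_param,
        # so the reference model tracks an exponential moving average of the actor.
        # The exact update lives in ReaL's parameter-reallocation code; this is only
        # an informal sketch of its effect.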
39 | actor_train = _find_rpc("actor_train") 40 | actor_train.add_post_hook( 41 | ParamReallocHook( 42 | target=ref_inf.model_name, 43 | eta=self.ref_ema_eta, 44 | ) 45 | ) 46 | ######### The main difference from normal PPO ######### 47 | 48 | model_worker = self._get_model_worker_configs(rpc_allocs) 49 | 50 | return ExperimentConfig( 51 | exp_ctrl=self.exp_ctrl, 52 | model_rpcs=[rpc_alloc.rpc for rpc_alloc in rpc_allocs], 53 | model_worker=model_worker, 54 | ) 55 | 56 | 57 | register_quickstart_exp("ppo-ref-ema", PPORefEMAConfig) 58 | 59 | if __name__ == "__main__": 60 | main() 61 | -------------------------------------------------------------------------------- /examples/customized_exp/scripts/run_ppo_ref_ema.sh: -------------------------------------------------------------------------------- 1 | MODEL_FAMILY=gpt2 2 | 3 | SFT_MODEL_PATH=/lustre/aigc/llm/checkpoints/fw/quickstart-sft/$MODEL_FAMILY/default/epoch7epochstep5globalstep50/ 4 | RW_MODEL_PATH=/lustre/aigc/llm/checkpoints/fw/quickstart-rw/$MODEL_FAMILY/default/epoch1epochstep15globalstep15/ 5 | 6 | MODE=local 7 | EXP_NAME=quickstart-ppo 8 | TRIAL_NAME=$MODEL_FAMILY-$MODE-manual 9 | 10 | unset CLUSTER_SPEC_PATH 11 | python3 examples/customized_exp/ppo_ref_ema.py ppo-ref-ema \ 12 | mode=$MODE \ 13 | experiment_name=$EXP_NAME \ 14 | trial_name=$TRIAL_NAME \ 15 | exp_ctrl.total_train_epochs=1 \ 16 | exp_ctrl.save_freq_steps=null \ 17 | actor.type._class=$MODEL_FAMILY \ 18 | actor.path=$SFT_MODEL_PATH \ 19 | actor.optimizer.lr_scheduler_type=constant \ 20 | actor.optimizer.lr=1e-4 \ 21 | actor.optimizer.warmup_steps_proportion=0.0 \ 22 | critic.type._class=$MODEL_FAMILY \ 23 | critic.type.is_critic=True \ 24 | critic.path=$RW_MODEL_PATH \ 25 | ref.type._class=$MODEL_FAMILY \ 26 | ref.path=$SFT_MODEL_PATH \ 27 | rew.type._class=$MODEL_FAMILY \ 28 | rew.type.is_critic=True \ 29 | rew.path=$RW_MODEL_PATH \ 30 | dataset.path=.data/ppo_prompt.jsonl \ 31 | dataset.max_prompt_len=128 \ 32 | dataset.train_bs_n_seqs=128 \ 33 | ppo.gen.max_new_tokens=512 \ 34 | ppo.gen.min_new_tokens=512 \ 35 | ppo.gen.top_p=0.9 ppo.gen.top_k=1000 \ 36 | ppo.ppo_n_minibatches=4 \ 37 | ppo.kl_ctl=0.1 \ 38 | ppo.value_eps_clip=0.2 \ 39 | ppo.reward_output_scaling=10.0 \ 40 | ppo.adv_norm=True ppo.value_norm=True \ 41 | allocation_mode=manual \ 42 | n_nodes=1 \ 43 | nodelist=\'NODE01\' \ 44 | actor_train.device_mesh=\'NODE01:0,1,2,3\' \ 45 | actor_train.parallel.data_parallel_size=2 \ 46 | actor_train.parallel.model_parallel_size=1 \ 47 | actor_train.parallel.pipeline_parallel_size=2 \ 48 | actor_gen.device_mesh=\'NODE01:0,1,2,3,4,5,6,7\' \ 49 | actor_gen.parallel.data_parallel_size=4 \ 50 | actor_gen.parallel.model_parallel_size=1 \ 51 | actor_gen.parallel.pipeline_parallel_size=2 \ 52 | critic_train.device_mesh=\'NODE01:4,5,6,7\' \ 53 | critic_train.parallel.data_parallel_size=2 \ 54 | critic_train.parallel.model_parallel_size=1 \ 55 | critic_train.parallel.pipeline_parallel_size=2 \ 56 | critic_inf.device_mesh=\'NODE01:0,1\' \ 57 | critic_inf.parallel.data_parallel_size=2 \ 58 | critic_inf.parallel.model_parallel_size=1 \ 59 | critic_inf.parallel.pipeline_parallel_size=1 \ 60 | rew_inf.device_mesh=\'NODE01:2,3\' \ 61 | rew_inf.parallel.data_parallel_size=1 \ 62 | rew_inf.parallel.model_parallel_size=1 \ 63 | rew_inf.parallel.pipeline_parallel_size=2 \ 64 | ref_inf.device_mesh=\'NODE01:4,5,6,7\' \ 65 | ref_inf.parallel.data_parallel_size=1 \ 66 | ref_inf.parallel.model_parallel_size=1 \ 67 | ref_inf.parallel.pipeline_parallel_size=4 68 | 
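# Note: `ref_ema_eta` (default 0.001) is the extra field defined by PPORefEMAConfig.
# Like the other fields above, it should be overridable from the command line by
# adding another `ref_ema_eta=...` override to this command; this script simply
# relies on the default value.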
-------------------------------------------------------------------------------- /examples/customized_exp/scripts/run_ppo_sentiment.sh: -------------------------------------------------------------------------------- 1 | MODEL_FAMILY=gpt2 2 | 3 | SFT_MODEL_PATH=/lustre/aigc/llm/checkpoints/fw/quickstart-sft/$MODEL_FAMILY/default/epoch7epochstep5globalstep50/ 4 | RW_MODEL_PATH=/lustre/aigc/llm/checkpoints/fw/quickstart-rw/$MODEL_FAMILY/default/epoch1epochstep15globalstep15/ 5 | 6 | MODE=local 7 | EXP_NAME=quickstart-ppo 8 | TRIAL_NAME=$MODEL_FAMILY-$MODE-manual 9 | 10 | unset CLUSTER_SPEC_PATH 11 | python3 examples/customized_exp/ppo_sentiment.py my-ppo \ 12 | mode=$MODE \ 13 | experiment_name=$EXP_NAME \ 14 | trial_name=$TRIAL_NAME \ 15 | exp_ctrl.total_train_epochs=1 \ 16 | exp_ctrl.save_freq_steps=null \ 17 | actor.type._class=$MODEL_FAMILY \ 18 | actor.path=$SFT_MODEL_PATH \ 19 | actor.optimizer.lr_scheduler_type=constant \ 20 | actor.optimizer.lr=1e-4 \ 21 | actor.optimizer.warmup_steps_proportion=0.0 \ 22 | critic.type._class=$MODEL_FAMILY \ 23 | critic.type.is_critic=True \ 24 | critic.path=$RW_MODEL_PATH \ 25 | ref.type._class=$MODEL_FAMILY \ 26 | ref.path=$SFT_MODEL_PATH \ 27 | rew.type._class=$MODEL_FAMILY \ 28 | rew.type.is_critic=True \ 29 | rew.path=$RW_MODEL_PATH \ 30 | dataset.path=.data/ppo_prompt.jsonl \ 31 | dataset.max_prompt_len=128 \ 32 | dataset.train_bs_n_seqs=128 \ 33 | ppo.gen.max_new_tokens=512 \ 34 | ppo.gen.min_new_tokens=512 \ 35 | ppo.gen.top_p=0.9 ppo.gen.top_k=1000 \ 36 | ppo.ppo_n_minibatches=4 \ 37 | ppo.kl_ctl=0.1 \ 38 | ppo.value_eps_clip=0.2 \ 39 | ppo.reward_output_scaling=10.0 \ 40 | ppo.adv_norm=True ppo.value_norm=True \ 41 | allocation_mode=manual \ 42 | n_nodes=1 \ 43 | nodelist=\'NODE01\' \ 44 | actor_train.device_mesh=\'NODE01:0,1,2,3\' \ 45 | actor_train.parallel.data_parallel_size=2 \ 46 | actor_train.parallel.model_parallel_size=1 \ 47 | actor_train.parallel.pipeline_parallel_size=2 \ 48 | actor_gen.device_mesh=\'NODE01:0,1,2,3,4,5,6,7\' \ 49 | actor_gen.parallel.data_parallel_size=4 \ 50 | actor_gen.parallel.model_parallel_size=1 \ 51 | actor_gen.parallel.pipeline_parallel_size=2 \ 52 | critic_train.device_mesh=\'NODE01:4,5,6,7\' \ 53 | critic_train.parallel.data_parallel_size=2 \ 54 | critic_train.parallel.model_parallel_size=1 \ 55 | critic_train.parallel.pipeline_parallel_size=2 \ 56 | critic_inf.device_mesh=\'NODE01:0,1\' \ 57 | critic_inf.parallel.data_parallel_size=2 \ 58 | critic_inf.parallel.model_parallel_size=1 \ 59 | critic_inf.parallel.pipeline_parallel_size=1 \ 60 | rew_inf.device_mesh=\'NODE01:2,3\' \ 61 | rew_inf.parallel.data_parallel_size=2 \ 62 | rew_inf.parallel.model_parallel_size=1 \ 63 | rew_inf.parallel.pipeline_parallel_size=1 \ 64 | ref_inf.device_mesh=\'NODE01:4,5,6,7\' \ 65 | ref_inf.parallel.data_parallel_size=1 \ 66 | ref_inf.parallel.model_parallel_size=1 \ 67 | ref_inf.parallel.pipeline_parallel_size=4 68 | -------------------------------------------------------------------------------- /examples/load_and_eval_rw.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.distributed as dist 3 | import transformers 4 | 5 | from realhf.api.core.config import ModelName 6 | from realhf.api.core.model_api import ReaLModelConfig 7 | from realhf.base import constants 8 | from realhf.base.testing import init_global_constants 9 | 10 | 11 | def load_and_use_single_process(path: str, model_family_name: str): 12 | # Initialize distributed environment. 
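    # ReaL's model implementation relies on a torch.distributed process group and
    # on ReaL's global parallelism constants, so even this single-process example
    # first creates a one-rank NCCL group and sets all parallel degrees
    # (data/model/pipeline) to 1 below.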
13 | dist.init_process_group( 14 | "nccl", rank=0, world_size=1, init_method="tcp://localhost:7777" 15 | ) 16 | model_name = ModelName("default", 0) 17 | init_global_constants( 18 | num_dp=1, 19 | num_mp=1, 20 | num_pp=1, 21 | sequence_parallel=False, 22 | model_name=model_name, 23 | ) 24 | 25 | # NOTE: import here to avoid CUDA re-initialization 26 | from realhf.impl.model.nn.real_llm_api import ReaLModel, add_helper_functions 27 | 28 | # Call a method like `config_from_llama` to get the config. 29 | mconfig: ReaLModelConfig = getattr(ReaLModel, f"config_from_{model_family_name}")( 30 | transformers.AutoConfig.from_pretrained(path) 31 | ) 32 | # IMPORTANT: Set the critic flag to True. 33 | # Since the output head and the token embedding no longer have the same shape, 34 | # we set tied_embedding to False. 35 | mconfig.is_critic = True 36 | mconfig.tied_embedding = False 37 | 38 | with constants.model_scope(model_name): 39 | # Construct the model. 40 | model = ReaLModel(mconfig, dtype=torch.float16, device="cuda") 41 | model.instantiate() 42 | 43 | # Load the reward checkpoint 44 | # Since the checkpoint is already a critic model, we set 45 | # init_critic_from_actor to False. 46 | model = getattr(model, f"from_{model_family_name}")( 47 | path, init_critic_from_actor=False 48 | ) 49 | # Add helper functions to make the model behave like HuggingFace models. 50 | add_helper_functions(model) 51 | 52 | # Use the model. 53 | bs = 10 54 | seqlen = 256 55 | input_ids = torch.randint( 56 | 0, mconfig.vocab_size, (bs, seqlen), dtype=torch.long, device="cuda" 57 | ) 58 | attention_mask = torch.ones_like(input_ids, dtype=torch.bool) 59 | 60 | # The final dimension of the output scores is 1. 61 | scores = model(input_ids, attention_mask).logits 62 | assert scores.shape == (bs, seqlen, 1), scores.shape 63 | 64 | 65 | if __name__ == "__main__": 66 | path = "/lustre/aigc/llm/checkpoints/fw/quickstart-rw/llama-ray-manual/default/epoch1epochstep10globalstep10/" 67 | model_family_name = "llama" 68 | load_and_use_single_process(path, model_family_name) 69 | -------------------------------------------------------------------------------- /examples/new_algorithms/grpo/grpo.sh: -------------------------------------------------------------------------------- 1 | MODEL_FAMILY=llama 2 | 3 | SFT_MODEL_PATH=/lustre/aigc/llm/checkpoints/fw/quickstart-sft/$MODEL_FAMILY-local-manual/default/epoch7epochstep5globalstep50/ 4 | RW_MODEL_PATH=/lustre/aigc/llm/checkpoints/fw/quickstart-rw/$MODEL_FAMILY-ray-manual/default/epoch1epochstep10globalstep10/ 5 | 6 | MODE=local 7 | 8 | EXP_NAME=quickstart-grpo 9 | TRIAL_NAME=$MODEL_FAMILY-$MODE-manual 10 | 11 | python3 examples/new_algorithms/grpo/grpo_exp.py grpo \ 12 | mode=$MODE \ 13 | experiment_name=$EXP_NAME \ 14 | trial_name=$TRIAL_NAME \ 15 | exp_ctrl.total_train_epochs=8 \ 16 | exp_ctrl.save_freq_steps=null \ 17 | actor.type._class=$MODEL_FAMILY \ 18 | actor.path=$SFT_MODEL_PATH \ 19 | actor.optimizer.lr=1e-4 \ 20 | actor.optimizer.lr_scheduler_type=constant \ 21 | rew.type._class=$MODEL_FAMILY \ 22 | rew.type.is_critic=True \ 23 | rew.path=$RW_MODEL_PATH \ 24 | ref.type._class=$MODEL_FAMILY \ 25 | ref.path=$SFT_MODEL_PATH \ 26 | dataset.path=.data/ppo_prompt.jsonl \ 27 | dataset.max_prompt_len=128 \ 28 | dataset.train_bs_n_seqs=32 \ 29 | allocation_mode=heuristic \ 30 | n_nodes=1 \ 31 | ppo.gen.max_new_tokens=512 \ 32 | ppo.gen.min_new_tokens=512 \ 33 | ppo.gen.use_cuda_graph=True \ 34 | ppo.gen.top_p=0.9 ppo.gen.top_k=1000 \ 35 | ppo.ppo_n_minibatches=4 \ 36 |
ppo.reward_output_scaling=1.0 ppo.adv_norm=False 37 | -------------------------------------------------------------------------------- /examples/new_algorithms/reinforce/reinforce.sh: -------------------------------------------------------------------------------- 1 | # MODEL_FAMILY specifies how the pretrained checkpoint is loaded, e.g., as a LLaMA model or a GPT model. 2 | # You can specify different model families for the SFT and the RW model, but you need to 3 | # re-tokenize the sequences if necessary. 4 | MODEL_FAMILY=gpt2 5 | 6 | # SFT_MODEL_PATH and RW_MODEL_PATH are the saved SFT and RW checkpoints. 7 | # ReaL saves checkpoints with the same format as HuggingFace, 8 | # so you don't need to convert or split checkpoints explicitly. 9 | # You can also directly use the pre-trained HuggingFace checkpoint, but this 10 | # will not ensure the optimal algorithm performance. 11 | SFT_MODEL_PATH=/lustre/aigc/llm/checkpoints/fw/quickstart-sft/$MODEL_FAMILY/default/epoch7epochstep5globalstep50/ 12 | RW_MODEL_PATH=/lustre/aigc/llm/checkpoints/fw/quickstart-rw/$MODEL_FAMILY/default/epoch1epochstep15globalstep15/ 13 | 14 | # Option 1: The experiment runs locally with subprocesses. 15 | MODE=local 16 | # Option 2: The experiment runs in a Ray cluster 17 | # MODE=ray 18 | # Option 3: The experiment runs in a SLURM + pyxis cluster 19 | # Using the slurm mode requires a cluster spec file 20 | # and setting CLUSTER_SPEC_PATH to the path of it. 21 | # MODE=slurm 22 | 23 | # `experiment_name` and `trial_name` can be arbitrary. 24 | # Logs and saved checkpoints will be indexed by them. 25 | EXP_NAME=quickstart-reinforce 26 | TRIAL_NAME=$MODEL_FAMILY-$MODE-manual 27 | 28 | # When using the "manual" allocation mode, the user should specify the device allocation 29 | # and parallel strategies for each model function call. 30 | # The number of GPUs is `n_nodes` * `n_gpus_per_node` (not set explicitly here, defaults to 8). 31 | # We provide a template in the following command and the user can modify it according to 32 | # the specific model and the available GPUs. 33 | 34 | # The following command shows an example of manual allocation on a single node, 35 | # but it can be modified according to the specific model and the available GPUs.
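# As a sanity check when editing the template: for each model function call below
# (actor_train, sample_gen, sample_rew_inf, greedy_gen, greedy_rew_inf), the product
# data_parallel_size * model_parallel_size * pipeline_parallel_size should match the
# number of GPUs in its device_mesh, e.g. sample_gen uses NODE01:0,1,2,3 (4 GPUs)
# with 2 * 1 * 2 = 4.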
36 | unset CLUSTER_SPEC_PATH 37 | python3 examples/new_algorithms/reinforce/reinforce_exp.py reinforce \ 38 | mode=$MODE \ 39 | experiment_name=$EXP_NAME \ 40 | trial_name=$TRIAL_NAME \ 41 | exp_ctrl.total_train_epochs=8 \ 42 | exp_ctrl.save_freq_steps=null \ 43 | actor.type._class=$MODEL_FAMILY \ 44 | actor.path=$SFT_MODEL_PATH \ 45 | actor.optimizer.lr=1e-4 \ 46 | actor.optimizer.lr_scheduler_type=constant \ 47 | rew.type._class=$MODEL_FAMILY \ 48 | rew.type.is_critic=True \ 49 | rew.path=$RW_MODEL_PATH \ 50 | dataset.path=.data/ppo_prompt.jsonl \ 51 | dataset.max_prompt_len=128 \ 52 | dataset.train_bs_n_seqs=512 \ 53 | gen.max_new_tokens=512 \ 54 | gen.min_new_tokens=512 \ 55 | gen.use_cuda_graph=True \ 56 | gen.top_p=0.9 gen.top_k=5000 \ 57 | allocation_mode=manual \ 58 | n_nodes=1 \ 59 | nodelist=\'NODE01\' \ 60 | actor_train.device_mesh=\'NODE01:0,1,2,3,4,5,6,7\' \ 61 | actor_train.parallel.data_parallel_size=4 \ 62 | actor_train.parallel.model_parallel_size=1 \ 63 | actor_train.parallel.pipeline_parallel_size=2 \ 64 | sample_gen.device_mesh=\'NODE01:0,1,2,3\' \ 65 | sample_gen.parallel.data_parallel_size=2 \ 66 | sample_gen.parallel.model_parallel_size=1 \ 67 | sample_gen.parallel.pipeline_parallel_size=2 \ 68 | sample_rew_inf.device_mesh=\'NODE01:0,1,2,3\' \ 69 | sample_rew_inf.parallel.data_parallel_size=4 \ 70 | sample_rew_inf.parallel.model_parallel_size=1 \ 71 | sample_rew_inf.parallel.pipeline_parallel_size=1 \ 72 | greedy_gen.device_mesh=\'NODE01:4,5,6,7\' \ 73 | greedy_gen.parallel.data_parallel_size=2 \ 74 | greedy_gen.parallel.model_parallel_size=1 \ 75 | greedy_gen.parallel.pipeline_parallel_size=2 \ 76 | greedy_rew_inf.device_mesh=\'NODE01:4,5,6,7\' \ 77 | greedy_rew_inf.parallel.data_parallel_size=4 \ 78 | greedy_rew_inf.parallel.model_parallel_size=1 \ 79 | greedy_rew_inf.parallel.pipeline_parallel_size=1 80 | -------------------------------------------------------------------------------- /examples/profiling/allocations.jsonl: -------------------------------------------------------------------------------- 1 | {"data_parallel_size": 2, "model_parallel_size": 4, "pipeline_parallel_size": 1, "use_sequence_parallel": true} -------------------------------------------------------------------------------- /examples/profiling/datasets.jsonl: -------------------------------------------------------------------------------- 1 | {"type_": "prompt", "args": {"max_length": 1024, "pad_to_max_length": true, "dataset_path": "/lustre/fw/datasets/imdb/rl/ppo_prompt.jsonl"}} -------------------------------------------------------------------------------- /examples/profiling/interfaces.jsonl: -------------------------------------------------------------------------------- 1 | {"type_": "ppo_actor", "args": {"generation_config": {"max_new_tokens": 1024,"min_new_tokens": 1024,"use_cuda_graph": true,"force_no_logits_mask": true,"force_cudagraph_recapture": true,"top_p": 1.0,"top_k": 1000000},"enable_save": false,"n_minibatches": 8}} 2 | -------------------------------------------------------------------------------- /examples/profiling/models.jsonl: -------------------------------------------------------------------------------- 1 | {"type": {"_class": "llama"}, "path": "/lustre/public/pretrained_model_weights/Llama-2-7b-hf"} -------------------------------------------------------------------------------- /examples/profiling/profile.sh: -------------------------------------------------------------------------------- 1 | # The model to profile and its path. 
2 | MODEL_FAMILY=llama 3 | SFT_MODEL_PATH=/lustre/public/pretrained_model_weights/Llama-2-7b-hf 4 | 5 | EXP_NAME=profile-example 6 | TRIAL_NAME=test 7 | 8 | export CLUSTER_SPEC_PATH="/lustre/aigc/llm/cluster/qh.json" 9 | 10 | # Setting REAL_DUMP_TRACE=1 enables execution trace provided by PyTorch. 11 | 12 | # Setting REAL_DUMP_MEMORY=1 enables memory profiling provided by PyTorch. 13 | 14 | # The dataset content doesn't matter, as long as it is a prompt-only dataset. 15 | # Each entry in the dataset should contain two keys "id" and "prompt". 16 | # By default we pad the prompt to the maximum length in the batch for accurate system-wise benchmark. 17 | # The loaded data will be processed by the "_mock_${handle_name}" method in the interface 18 | # to create mock data suited for the exact interface handle. 19 | 20 | # "handle_name" can be "inference", "generate", or "train_step", 21 | # and the "interface_impl" specifies which registered interface implementation to run. 22 | # "interface_kwargs_json" is a JSON configuration of the interface. 23 | 24 | # "allocations_jsonl" is a JSONL file that specifies the parallel strategies to profile. 25 | # If not specified, all parallel strategies under the given world size will be profiled. 26 | 27 | # "n_mbs" specifies the number of micro-batches to profile. 28 | 29 | # The total number of runs will be the product of the number of micro-batches and the number of parallel strategies, 30 | # all within the same experiment_name and trial_name. Instead of re-launching the whole experiment, workers will 31 | # be paused and reconfigured to run the next experiment setup. 32 | 33 | REAL_DUMP_TRACE=1 REAL_DUMP_MEMORY=1 \ 34 | python3 -m realhf.apps.quickstart profile \ 35 | mode=local \ 36 | experiment_name=$EXP_NAME \ 37 | trial_name=$TRIAL_NAME \ 38 | exp_ctrl.benchmark_steps=3 \ 39 | exp_ctrl.save_freq_steps=null \ 40 | exp_ctrl.eval_freq_steps=null \ 41 | n_nodes=1 \ 42 | 'handle_names=[train_step]' \ 43 | interfaces_jsonl=./examples/profiling/interfaces.jsonl \ 44 | models_jsonl=./examples/profiling/models.jsonl \ 45 | datasets_jsonl=./examples/profiling/datasets.jsonl \ 46 | allocations_jsonl=./examples/profiling/allocations.jsonl \ 47 | 'n_mbs=[1, 2, 4]' \ 48 | 'batch_sizes=[512]' 49 | -------------------------------------------------------------------------------- /examples/scripts/distributed_ray/dpo.sh: -------------------------------------------------------------------------------- 1 | # MODEL_FAMILY specifies how the pretrained checkpoint is loaded, e.g., as a LLaMA model or a GPT model. 2 | MODEL_FAMILY=gpt2 3 | 4 | # PRETRAINED_PATH is the HuggingFace checkpoint or the saved SFT checkpoint. 5 | # The latter is the common practice. 6 | # ReaL saves checkpoints with the same format as HuggingFace, 7 | # so you don't need to convert or split checkpoints explicitly. 8 | PRETRAINED_PATH=/lustre/aigc/llm/checkpoints/fw/quickstart-sft/$MODEL_FAMILY/default/epoch7epochstep5globalstep50/ 9 | 10 | # Option 1: The experiment runs locally with subprocesses. 11 | # MODE=local 12 | # Option 2: The experiment runs in a Ray cluster 13 | MODE=ray 14 | # Option 3: The experiment runs in a SLURM + pyxis cluster 15 | # Using the slurm mode requires a cluster spec file 16 | # and setting CLUSTER_SPEC_PATH to the path of it. 17 | # MODE=slurm 18 | 19 | # `experiment_name` and `trial_name` can be arbitrary. 20 | # Logs and saved checkpoints will be indexed by them. 
21 | EXP_NAME=quickstart-dpo 22 | TRIAL_NAME=$MODEL_FAMILY-$MODE-manual 23 | 24 | # We use the "manual" allocation mode here to manually specify the parallelism strategy of training 25 | # and inference. The parallel strategy for training prefers tensor-model parallelism while the 26 | # inference prefers pipeline parallelism, which are more efficient for their corresponding workloads. 27 | 28 | # The `dpo` subcommand specifies that this is a DPO experiment. 29 | # The `save_freq_steps` is set to `null` to disable saving checkpoints. 30 | # Enable it if you want to save checkpoints. 31 | python3 -m realhf.apps.quickstart dpo \ 32 | mode=$MODE \ 33 | experiment_name=$EXP_NAME \ 34 | trial_name=$TRIAL_NAME \ 35 | exp_ctrl.total_train_epochs=2 \ 36 | exp_ctrl.save_freq_steps=null \ 37 | n_nodes=2 \ 38 | allocation_mode=manual \ 39 | actor.type._class=$MODEL_FAMILY \ 40 | actor.path=$PRETRAINED_PATH \ 41 | actor_train.parallel.pipeline_parallel_size=4 \ 42 | actor_train.parallel.model_parallel_size=1 \ 43 | actor_train.parallel.data_parallel_size=4 \ 44 | actor_train.parallel.use_sequence_parallel=True \ 45 | ref.type._class=$MODEL_FAMILY \ 46 | ref.path=$PRETRAINED_PATH \ 47 | ref_inf.parallel.pipeline_parallel_size=4 \ 48 | ref_inf.parallel.model_parallel_size=1 \ 49 | ref_inf.parallel.data_parallel_size=4 \ 50 | ref_inf.parallel.use_sequence_parallel=True \ 51 | dataset.train_path=.data/rm_paired-train.jsonl \ 52 | dataset.max_pairs_per_prompt=2 \ 53 | dataset.max_seqlen=1024 \ 54 | dataset.train_bs_n_seqs=512 \ 55 | dataset.valid_bs_n_seqs=512 -------------------------------------------------------------------------------- /examples/scripts/distributed_ray/ppo.sh: -------------------------------------------------------------------------------- 1 | # MODEL_FAMILY specifies how the pretrained checkpoint is loaded, e.g., as a LLaMA model or a GPT model. 2 | # You can specify different model families for the SFT and the RW model, but you need to 3 | # re-tokenize the sequences if necessary. 4 | MODEL_FAMILY=llama 5 | 6 | # SFT_MODEL_PATH and RW_MODEL_PATH are the saved SFT and RW checkpoints. 7 | # ReaL saves checkpoints with the same format as HuggingFace, 8 | # so you don't need to convert or split checkpoints explicitly. 9 | # You can also directly use the pre-trained HuggingFace checkpoint, but this 10 | # will not ensure the optimal algorithm performance. 11 | SFT_MODEL_PATH=/lustre/aigc/llm/checkpoints/fw/quickstart-sft/$MODEL_FAMILY-local-manual/default/epoch7epochstep5globalstep50/ 12 | RW_MODEL_PATH=/lustre/aigc/llm/checkpoints/fw/quickstart-rw/$MODEL_FAMILY-ray-manual/default/epoch1epochstep10globalstep10/ 13 | 14 | # Option 1: The experiment runs locally with subprocesses. 15 | # MODE=local 16 | # Option 2: The experiment runs in a Ray cluster 17 | MODE=ray 18 | # Option 3: The experiment runs in a SLURM + pyxis cluster 19 | # Using the slurm mode requires a cluster spec file 20 | # and setting CLUSTER_SPEC_PATH to the path of it. 21 | # MODE=slurm 22 | 23 | # `experiment_name` and `trial_name` can be arbitrary. 24 | # Logs and saved checkpoints will be indexed by them. 25 | EXP_NAME=quickstart-ppo 26 | TRIAL_NAME=$MODEL_FAMILY-$MODE-heuristic 27 | 28 | # We use the "heuristic" allocation mode here to automatically determine the parallelism strategy 29 | # for each model function call, i.e., actor generation, critic inference, actor train, etc. 30 | # The number of GPUs is `n_nodes` * `n_gpus_per_node` (not set explictly here, defaults to 8). 
31 | # ReaL will make full use of these available GPUs to design allocations. 32 | # This does not ensure the optimal throughput, but it is a good starting point. 33 | 34 | # The `heuristic` allocation mode is not ensured to run with every model configurations. 35 | # For example, if the vocabulary size is an odd number, the model parallelism may not work. 36 | # In these cases, you can use the `ppo_manual.sh` to specify the parallelism strategy manually. 37 | 38 | # The `ppo` subcommand specifies that this is a PPO experiment. 39 | # The `save_freq_steps` is set to `null` to disable saving checkpoints. 40 | # Enable it if you want to save checkpoints. 41 | # The `ppo` option is used to control the generation and PPO algorithm hyperparameters. 42 | # Note that the performance of PPO is sensitive to the the pre-trained model and hyperparameters. 43 | # It's the user's responsibility to tune them appropriately. 44 | python3 -m realhf.apps.quickstart ppo \ 45 | mode=$MODE \ 46 | experiment_name=$EXP_NAME \ 47 | trial_name=$TRIAL_NAME \ 48 | exp_ctrl.total_train_epochs=1 \ 49 | exp_ctrl.save_freq_steps=null \ 50 | n_nodes=4 \ 51 | allocation_mode=heuristic \ 52 | actor.type._class=$MODEL_FAMILY \ 53 | actor.path=$SFT_MODEL_PATH \ 54 | critic.type._class=$MODEL_FAMILY \ 55 | critic.type.is_critic=True \ 56 | critic.path=$RW_MODEL_PATH \ 57 | ref.type._class=$MODEL_FAMILY \ 58 | ref.path=$SFT_MODEL_PATH \ 59 | rew.type._class=$MODEL_FAMILY \ 60 | rew.type.is_critic=True \ 61 | rew.path=$RW_MODEL_PATH \ 62 | dataset.path=.data/ppo_prompt.jsonl \ 63 | dataset.max_prompt_len=128 \ 64 | dataset.train_bs_n_seqs=128 \ 65 | ppo.gen.max_new_tokens=512 \ 66 | ppo.gen.min_new_tokens=512 \ 67 | ppo.gen.use_cuda_graph=True \ 68 | ppo.gen.top_p=0.9 ppo.gen.top_k=1000 \ 69 | ppo.ppo_n_minibatches=4 \ 70 | ppo.kl_ctl=0.1 \ 71 | ppo.value_eps_clip=0.2 \ 72 | ppo.reward_output_scaling=1.0 \ 73 | ppo.adv_norm=True ppo.value_norm=True -------------------------------------------------------------------------------- /examples/scripts/distributed_ray/rw.sh: -------------------------------------------------------------------------------- 1 | # MODEL_FAMILY specifies how the pretrained checkpoint is loaded, e.g., as a LLaMA model or a GPT model. 2 | MODEL_FAMILY=llama 3 | 4 | # PRETRAINED_PATH is the HuggingFace checkpoint or the saved SFT checkpoint. 5 | # The latter is the common practice. 6 | # ReaL saves checkpoints with the same format as HuggingFace, 7 | # so you don't need to convert or split checkpoints explicitly. 8 | # HF pretrained checkpoint 9 | PRETRAINED_PATH=/lustre/public/pretrained_model_weights/Llama-2-7b-hf 10 | # or SFT checkpoint 11 | PRETRAINED_PATH=/lustre/aigc/llm/checkpoints/fw/quickstart-sft/llama-local-manual/default/epoch7epochstep5globalstep50/ 12 | 13 | # Option 1: The experiment runs locally with subprocesses. 14 | # MODE=local 15 | # Option 2: The experiment runs in a Ray cluster 16 | MODE=ray 17 | # Option 3: The experiment runs in a SLURM + pyxis cluster 18 | # Using the slurm mode requires a cluster spec file 19 | # and setting CLUSTER_SPEC_PATH to the path of it. 20 | # MODE=slurm 21 | 22 | # `experiment_name` and `trial_name` can be arbitrary. 23 | # Logs and saved checkpoints will be indexed by them. 24 | EXP_NAME=quickstart-rw 25 | TRIAL_NAME=$MODEL_FAMILY-$MODE-manual 26 | 27 | # We use the "manual" allocation mode here to manually specify the parallelism strategy, 28 | # which is pipeline=2, tensor-model=2, and data=2, using in total of 8 GPUs. 
29 | 30 | # The `rw` subcommand specifies that this is a reward modeling experiment. 31 | # The reward modeling experiment converges very fast, so we set a smaller 32 | # `total_train_epochs` and `save_freq_steps` for demonstration. 33 | # Note that we set `model.type.is_critic=True` to initialize a reward model from the LLM 34 | # by re-initializing the LM head. 35 | python3 -m realhf.apps.quickstart rw \ 36 | mode=$MODE \ 37 | experiment_name=$EXP_NAME \ 38 | trial_name=$TRIAL_NAME \ 39 | exp_ctrl.total_train_epochs=2 \ 40 | exp_ctrl.save_freq_steps=10 \ 41 | exp_ctrl.eval_freq_epochs=1 \ 42 | model.optimizer.type=adam \ 43 | model.optimizer.lr_scheduler_type=cosine \ 44 | model.optimizer.lr=1e-5 \ 45 | model.optimizer.warmup_steps_proportion=0.02 \ 46 | model.type._class=$MODEL_FAMILY \ 47 | model.type.is_critic=True \ 48 | model.path=$PRETRAINED_PATH \ 49 | dataset.train_path=.data/rm_paired-train.jsonl \ 50 | dataset.valid_path=.data/rm_paired-valid.jsonl \ 51 | dataset.max_seqlen=1024 \ 52 | dataset.train_bs_n_seqs=512 \ 53 | dataset.valid_bs_n_seqs=512 \ 54 | allocation_mode=manual \ 55 | n_nodes=2 \ 56 | allocation.parallel.pipeline_parallel_size=2 \ 57 | allocation.parallel.model_parallel_size=2 \ 58 | allocation.parallel.data_parallel_size=4 \ 59 | allocation.parallel.use_sequence_parallel=True -------------------------------------------------------------------------------- /examples/scripts/distributed_ray/sft.sh: -------------------------------------------------------------------------------- 1 | # MODEL_FAMILY specifies how the pretrained checkpoint is loaded, e.g., as a LLaMA model or a GPT model. 2 | MODEL_FAMILY=llama 3 | 4 | # PRETRAINED_PATH is the HuggingFace checkpoint. 5 | PRETRAINED_PATH=/lustre/public/pretrained_model_weights/Llama-2-7b-hf 6 | 7 | # Option 1: The experiment runs locally with subprocesses. 8 | # MODE=local 9 | # Option 2: The experiment runs in a Ray cluster 10 | MODE=ray 11 | # Option 3: The experiment runs in a SLURM + pyxis cluster 12 | # Using the slurm mode requires a cluster spec file 13 | # and setting CLUSTER_SPEC_PATH to the path of it. 14 | # MODE=slurm 15 | 16 | # `experiment_name` and `trial_name` can be arbitrary. 17 | # Logs and saved checkpoints will be indexed by them. 18 | EXP_NAME=quickstart-sft 19 | TRIAL_NAME=$MODEL_FAMILY-$MODE-manual 20 | 21 | # We use the "manual" allocation mode here to manually specify the parallelism strategy, 22 | # which is pipeline=2, tensor-model=2, and data=2, using in total of 8 GPUs. 23 | 24 | # The `sft` subcommand specifies that this is a supervised fine-tuning experiment. 
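# Note: with n_nodes=4 and the allocation below (pipeline=2, tensor-model=4, data=4),
# this run spans 2 * 4 * 4 = 32 GPUs in total, i.e. 4 nodes with 8 GPUs each.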
25 | python3 -m realhf.apps.quickstart sft \ 26 | mode=$MODE \ 27 | experiment_name=$EXP_NAME \ 28 | trial_name=$TRIAL_NAME \ 29 | exp_ctrl.total_train_epochs=8 \ 30 | exp_ctrl.save_freq_steps=50 \ 31 | exp_ctrl.eval_freq_epochs=1 \ 32 | model.optimizer.type=adam \ 33 | model.optimizer.lr_scheduler_type=cosine \ 34 | model.optimizer.lr=1e-5 \ 35 | model.optimizer.warmup_steps_proportion=0.02 \ 36 | model.type._class=$MODEL_FAMILY \ 37 | model.path=$PRETRAINED_PATH \ 38 | dataset.train_path=.data/sft_pos-train.jsonl \ 39 | dataset.valid_path=.data/sft_pos-train.jsonl \ 40 | dataset.max_seqlen=1024 \ 41 | dataset.train_bs_n_seqs=512 \ 42 | dataset.valid_bs_n_seqs=512 \ 43 | allocation_mode=manual \ 44 | n_nodes=4 \ 45 | allocation.parallel.pipeline_parallel_size=2 \ 46 | allocation.parallel.model_parallel_size=4 \ 47 | allocation.parallel.data_parallel_size=4 \ 48 | allocation.parallel.use_sequence_parallel=True -------------------------------------------------------------------------------- /examples/scripts/distributed_slurm/dpo.sh: -------------------------------------------------------------------------------- 1 | # MODEL_FAMILY specifies how the pretrained checkpoint is loaded, e.g., as a LLaMA model or a GPT model. 2 | MODEL_FAMILY=gpt2 3 | 4 | # PRETRAINED_PATH is the HuggingFace checkpoint or the saved SFT checkpoint. 5 | # The latter is the common practice. 6 | # ReaL saves checkpoints with the same format as HuggingFace, 7 | # so you don't need to convert or split checkpoints explicitly. 8 | PRETRAINED_PATH=/lustre/aigc/llm/checkpoints/fw/quickstart-sft/$MODEL_FAMILY/default/epoch7epochstep5globalstep50/ 9 | 10 | # Option 1: The experiment runs locally with subprocesses. 11 | # MODE=local 12 | # Option 2: The experiment runs in a Ray cluster 13 | # MODE=ray 14 | # Option 3: The experiment runs in a SLURM + pyxis cluster 15 | # Using the slurm mode requires a cluster spec file 16 | # and setting CLUSTER_SPEC_PATH to the path of it. 17 | MODE=slurm 18 | 19 | # `experiment_name` and `trial_name` can be arbitrary. 20 | # Logs and saved checkpoints will be indexed by them. 21 | EXP_NAME=quickstart-dpo 22 | TRIAL_NAME=$MODEL_FAMILY-$MODE-manual 23 | 24 | # We use the "manual" allocation mode here to manually specify the parallelism strategy of training 25 | # and inference. The parallel strategy for training prefers tensor-model parallelism while the 26 | # inference prefers pipeline parallelism, which are more efficient for their corresponding workloads. 27 | 28 | # The `dpo` subcommand specifies that this is a DPO experiment. 29 | # The `save_freq_steps` is set to `null` to disable saving checkpoints. 30 | # Enable it if you want to save checkpoints. 
31 | export CLUSTER_SPEC_PATH="/lustre/aigc/llm/cluster/qh.json" 32 | python3 -m realhf.apps.quickstart dpo \ 33 | mode=$MODE \ 34 | experiment_name=$EXP_NAME \ 35 | trial_name=$TRIAL_NAME \ 36 | exp_ctrl.total_train_epochs=2 \ 37 | exp_ctrl.save_freq_steps=null \ 38 | n_nodes=2 \ 39 | allocation_mode=manual \ 40 | actor.type._class=$MODEL_FAMILY \ 41 | actor.path=$PRETRAINED_PATH \ 42 | actor_train.parallel.pipeline_parallel_size=4 \ 43 | actor_train.parallel.model_parallel_size=1 \ 44 | actor_train.parallel.data_parallel_size=4 \ 45 | actor_train.parallel.use_sequence_parallel=True \ 46 | ref.type._class=$MODEL_FAMILY \ 47 | ref.path=$PRETRAINED_PATH \ 48 | ref_inf.parallel.pipeline_parallel_size=4 \ 49 | ref_inf.parallel.model_parallel_size=1 \ 50 | ref_inf.parallel.data_parallel_size=4 \ 51 | ref_inf.parallel.use_sequence_parallel=True \ 52 | dataset.train_path=.data/rm_paired-train.jsonl \ 53 | dataset.max_pairs_per_prompt=2 \ 54 | dataset.max_seqlen=1024 \ 55 | dataset.train_bs_n_seqs=512 \ 56 | dataset.valid_bs_n_seqs=512 -------------------------------------------------------------------------------- /examples/scripts/distributed_slurm/ppo.sh: -------------------------------------------------------------------------------- 1 | # MODEL_FAMILY specifies how the pretrained checkpoint is loaded, e.g., as a LLaMA model or a GPT model. 2 | # You can specify different model families for the SFT and the RW model, but you need to 3 | # re-tokenize the sequences if necessary. 4 | MODEL_FAMILY=llama 5 | 6 | # SFT_MODEL_PATH and RW_MODEL_PATH are the saved SFT and RW checkpoints. 7 | # ReaL saves checkpoints with the same format as HuggingFace, 8 | # so you don't need to convert or split checkpoints explicitly. 9 | # You can also directly use the pre-trained HuggingFace checkpoint, but this 10 | # will not ensure the optimal algorithm performance. 11 | SFT_MODEL_PATH=/lustre/aigc/llm/checkpoints/fw/quickstart-sft/$MODEL_FAMILY-local-manual/default/epoch7epochstep5globalstep50/ 12 | RW_MODEL_PATH=/lustre/aigc/llm/checkpoints/fw/quickstart-rw/$MODEL_FAMILY-ray-manual/default/epoch1epochstep10globalstep10/ 13 | 14 | # Option 1: The experiment runs locally with subprocesses. 15 | # MODE=local 16 | # Option 2: The experiment runs in a Ray cluster 17 | # MODE=ray 18 | # Option 3: The experiment runs in a SLURM + pyxis cluster 19 | # Using the slurm mode requires a cluster spec file 20 | # and setting CLUSTER_SPEC_PATH to the path of it. 21 | MODE=slurm 22 | 23 | # `experiment_name` and `trial_name` can be arbitrary. 24 | # Logs and saved checkpoints will be indexed by them. 25 | EXP_NAME=quickstart-ppo 26 | TRIAL_NAME=$MODEL_FAMILY-$MODE-heuristic 27 | 28 | # We use the "heuristic" allocation mode here to automatically determine the parallelism strategy 29 | # for each model function call, i.e., actor generation, critic inference, actor train, etc. 30 | # The number of GPUs is `n_nodes` * `n_gpus_per_node` (not set explictly here, defaults to 8). 31 | # ReaL will make full use of these available GPUs to design allocations. 32 | # This does not ensure the optimal throughput, but it is a good starting point. 33 | 34 | # The `heuristic` allocation mode is not ensured to run with every model configurations. 35 | # For example, if the vocabulary size is an odd number, the model parallelism may not work. 36 | # In these cases, you can use the `ppo_manual.sh` to specify the parallelism strategy manually. 37 | 38 | # The `ppo` subcommand specifies that this is a PPO experiment. 
39 | # The `save_freq_steps` is set to `null` to disable saving checkpoints. 40 | # Enable it if you want to save checkpoints. 41 | # The `ppo` option is used to control the generation and PPO algorithm hyperparameters. 42 | # Note that the performance of PPO is sensitive to the the pre-trained model and hyperparameters. 43 | # It's the user's responsibility to tune them appropriately. 44 | export CLUSTER_SPEC_PATH="/lustre/aigc/llm/cluster/qh.json" 45 | python3 -m realhf.apps.quickstart ppo \ 46 | mode=$MODE \ 47 | experiment_name=$EXP_NAME \ 48 | trial_name=$TRIAL_NAME \ 49 | exp_ctrl.total_train_epochs=1 \ 50 | exp_ctrl.save_freq_steps=null \ 51 | n_nodes=4 \ 52 | allocation_mode=heuristic \ 53 | actor.type._class=$MODEL_FAMILY \ 54 | actor.path=$SFT_MODEL_PATH \ 55 | critic.type._class=$MODEL_FAMILY \ 56 | critic.type.is_critic=True \ 57 | critic.path=$RW_MODEL_PATH \ 58 | ref.type._class=$MODEL_FAMILY \ 59 | ref.path=$SFT_MODEL_PATH \ 60 | rew.type._class=$MODEL_FAMILY \ 61 | rew.type.is_critic=True \ 62 | rew.path=$RW_MODEL_PATH \ 63 | dataset.path=.data/ppo_prompt.jsonl \ 64 | dataset.max_prompt_len=128 \ 65 | dataset.train_bs_n_seqs=128 \ 66 | ppo.gen.max_new_tokens=512 \ 67 | ppo.gen.min_new_tokens=512 \ 68 | ppo.gen.use_cuda_graph=True \ 69 | ppo.gen.top_p=0.9 ppo.gen.top_k=1000 \ 70 | ppo.ppo_n_minibatches=4 \ 71 | ppo.kl_ctl=0.1 \ 72 | ppo.value_eps_clip=0.2 \ 73 | ppo.reward_output_scaling=1.0 \ 74 | ppo.adv_norm=True ppo.value_norm=True -------------------------------------------------------------------------------- /examples/scripts/distributed_slurm/rw.sh: -------------------------------------------------------------------------------- 1 | # MODEL_FAMILY specifies how the pretrained checkpoint is loaded, e.g., as a LLaMA model or a GPT model. 2 | MODEL_FAMILY=llama 3 | 4 | # PRETRAINED_PATH is the HuggingFace checkpoint or the saved SFT checkpoint. 5 | # The latter is the common practice. 6 | # ReaL saves checkpoints with the same format as HuggingFace, 7 | # so you don't need to convert or split checkpoints explicitly. 8 | # HF pretrained checkpoint 9 | PRETRAINED_PATH=/lustre/public/pretrained_model_weights/Llama-2-7b-hf 10 | # or SFT checkpoint 11 | PRETRAINED_PATH=/lustre/aigc/llm/checkpoints/fw/quickstart-sft/llama-local-manual/default/epoch7epochstep5globalstep50/ 12 | 13 | # Option 1: The experiment runs locally with subprocesses. 14 | # MODE=local 15 | # Option 2: The experiment runs in a Ray cluster 16 | # MODE=ray 17 | # Option 3: The experiment runs in a SLURM + pyxis cluster 18 | # Using the slurm mode requires a cluster spec file 19 | # and setting CLUSTER_SPEC_PATH to the path of it. 20 | MODE=slurm 21 | 22 | # `experiment_name` and `trial_name` can be arbitrary. 23 | # Logs and saved checkpoints will be indexed by them. 24 | EXP_NAME=quickstart-rw 25 | TRIAL_NAME=$MODEL_FAMILY-$MODE-manual 26 | 27 | # We use the "manual" allocation mode here to manually specify the parallelism strategy, 28 | # which is pipeline=2, tensor-model=2, and data=2, using in total of 8 GPUs. 29 | 30 | # The `rw` subcommand specifies that this is a reward modeling experiment. 31 | # The reward modeling experiment converges very fast, so we set a smaller 32 | # `total_train_epochs` and `save_freq_steps` for demonstration. 33 | # Note that we set `model.type.is_critic=True` to initialize a reward model from the LLM 34 | # by re-initializing the LM head. 
35 | export CLUSTER_SPEC_PATH="/lustre/aigc/llm/cluster/qh.json" 36 | python3 -m realhf.apps.quickstart rw \ 37 | mode=$MODE \ 38 | experiment_name=$EXP_NAME \ 39 | trial_name=$TRIAL_NAME \ 40 | exp_ctrl.total_train_epochs=2 \ 41 | exp_ctrl.save_freq_steps=10 \ 42 | exp_ctrl.eval_freq_epochs=1 \ 43 | model.optimizer.type=adam \ 44 | model.optimizer.lr_scheduler_type=cosine \ 45 | model.optimizer.lr=1e-5 \ 46 | model.optimizer.warmup_steps_proportion=0.02 \ 47 | model.type._class=$MODEL_FAMILY \ 48 | model.type.is_critic=True \ 49 | model.path=$PRETRAINED_PATH \ 50 | dataset.train_path=.data/rm_paired-train.jsonl \ 51 | dataset.valid_path=.data/rm_paired-valid.jsonl \ 52 | dataset.max_seqlen=1024 \ 53 | dataset.train_bs_n_seqs=512 \ 54 | dataset.valid_bs_n_seqs=512 \ 55 | allocation_mode=manual \ 56 | n_nodes=2 \ 57 | allocation.parallel.pipeline_parallel_size=2 \ 58 | allocation.parallel.model_parallel_size=2 \ 59 | allocation.parallel.data_parallel_size=4 \ 60 | allocation.parallel.use_sequence_parallel=True -------------------------------------------------------------------------------- /examples/scripts/distributed_slurm/sft.sh: -------------------------------------------------------------------------------- 1 | # MODEL_FAMILY specifies how the pretrained checkpoint is loaded, e.g., as a LLaMA model or a GPT model. 2 | MODEL_FAMILY=llama 3 | 4 | # PRETRAINED_PATH is the HuggingFace checkpoint. 5 | PRETRAINED_PATH=/lustre/public/pretrained_model_weights/Llama-2-7b-hf 6 | 7 | # Option 1: The experiment runs locally with subprocesses. 8 | # MODE=local 9 | # Option 2: The experiment runs in a Ray cluster 10 | # MODE=ray 11 | # Option 3: The experiment runs in a SLURM + pyxis cluster 12 | # Using the slurm mode requires a cluster spec file 13 | # and setting CLUSTER_SPEC_PATH to the path of it. 14 | MODE=slurm 15 | 16 | # `experiment_name` and `trial_name` can be arbitrary. 17 | # Logs and saved checkpoints will be indexed by them. 18 | EXP_NAME=quickstart-sft 19 | TRIAL_NAME=$MODEL_FAMILY-$MODE-manual 20 | 21 | # We use the "manual" allocation mode here to manually specify the parallelism strategy, 22 | # which is pipeline=2, tensor-model=2, and data=2, using in total of 8 GPUs. 23 | 24 | # The `sft` subcommand specifies that this is a supervised fine-tuning experiment. 
25 | export CLUSTER_SPEC_PATH="/lustre/aigc/llm/cluster/qh.json" 26 | python3 -m realhf.apps.quickstart sft \ 27 | mode=$MODE \ 28 | experiment_name=$EXP_NAME \ 29 | trial_name=$TRIAL_NAME \ 30 | exp_ctrl.total_train_epochs=8 \ 31 | exp_ctrl.save_freq_steps=50 \ 32 | exp_ctrl.eval_freq_epochs=1 \ 33 | model.optimizer.type=adam \ 34 | model.optimizer.lr_scheduler_type=cosine \ 35 | model.optimizer.lr=1e-5 \ 36 | model.optimizer.warmup_steps_proportion=0.02 \ 37 | model.type._class=$MODEL_FAMILY \ 38 | model.path=$PRETRAINED_PATH \ 39 | dataset.train_path=.data/sft_pos-train.jsonl \ 40 | dataset.valid_path=.data/sft_pos-train.jsonl \ 41 | dataset.max_seqlen=1024 \ 42 | dataset.train_bs_n_seqs=512 \ 43 | dataset.valid_bs_n_seqs=512 \ 44 | allocation_mode=manual \ 45 | n_nodes=4 \ 46 | allocation.parallel.pipeline_parallel_size=2 \ 47 | allocation.parallel.model_parallel_size=4 \ 48 | allocation.parallel.data_parallel_size=4 \ 49 | allocation.parallel.use_sequence_parallel=True -------------------------------------------------------------------------------- /examples/scripts/local/dpo.sh: -------------------------------------------------------------------------------- 1 | # MODEL_FAMILY specifies how the pretrained checkpoint is loaded, e.g., as a LLaMA model or a GPT model. 2 | MODEL_FAMILY=gpt2 3 | 4 | # PRETRAINED_PATH is the HuggingFace checkpoint or the saved SFT checkpoint. 5 | # The latter is the common practice. 6 | # ReaL saves checkpoints with the same format as HuggingFace, 7 | # so you don't need to convert or split checkpoints explicitly. 8 | PRETRAINED_PATH=/lustre/aigc/llm/checkpoints/fw/quickstart-sft/$MODEL_FAMILY/default/epoch7epochstep5globalstep50/ 9 | 10 | # Option 1: The experiment runs locally with subprocesses. 11 | MODE=local 12 | # Option 2: The experiment runs in a Ray cluster 13 | # MODE=ray 14 | # Option 3: The experiment runs in a SLURM + pyxis cluster 15 | # Using the slurm mode requires a cluster spec file 16 | # and setting CLUSTER_SPEC_PATH to the path of it. 17 | # MODE=slurm 18 | 19 | # `experiment_name` and `trial_name` can be arbitrary. 20 | # Logs and saved checkpoints will be indexed by them. 21 | EXP_NAME=quickstart-dpo 22 | TRIAL_NAME=$MODEL_FAMILY-$MODE-manual 23 | 24 | # We use the "manual" allocation mode here to manually specify the parallelism strategy of training 25 | # and inference. The parallel strategy for training prefers tensor-model parallelism while the 26 | # inference prefers pipeline parallelism, which are more efficient for their corresponding workloads. 27 | 28 | # The `dpo` subcommand specifies that this is a DPO experiment. 29 | # The `save_freq_steps` is set to `null` to disable saving checkpoints. 30 | # Enable it if you want to save checkpoints. 
31 | python3 -m realhf.apps.quickstart dpo \ 32 | mode=$MODE \ 33 | experiment_name=$EXP_NAME \ 34 | trial_name=$TRIAL_NAME \ 35 | exp_ctrl.total_train_epochs=2 \ 36 | exp_ctrl.save_freq_steps=null \ 37 | n_nodes=1 \ 38 | allocation_mode=manual \ 39 | actor.type._class=$MODEL_FAMILY \ 40 | actor.path=$PRETRAINED_PATH \ 41 | actor_train.parallel.pipeline_parallel_size=2 \ 42 | actor_train.parallel.model_parallel_size=1 \ 43 | actor_train.parallel.data_parallel_size=4 \ 44 | actor_train.parallel.use_sequence_parallel=True \ 45 | ref.type._class=$MODEL_FAMILY \ 46 | ref.path=$PRETRAINED_PATH \ 47 | ref_inf.parallel.pipeline_parallel_size=4 \ 48 | ref_inf.parallel.model_parallel_size=1 \ 49 | ref_inf.parallel.data_parallel_size=2 \ 50 | ref_inf.parallel.use_sequence_parallel=True \ 51 | dataset.train_path=.data/rm_paired-train.jsonl \ 52 | dataset.max_pairs_per_prompt=2 \ 53 | dataset.max_seqlen=1024 \ 54 | dataset.train_bs_n_seqs=512 \ 55 | dataset.valid_bs_n_seqs=512 -------------------------------------------------------------------------------- /examples/scripts/local/gen.sh: -------------------------------------------------------------------------------- 1 | MODEL_FAMILY=llama 2 | SFT_MODEL_PATH=/lustre/aigc/llm/checkpoints/fw/quickstart-sft/$MODEL_FAMILY-local-manual/default/epoch7epochstep5globalstep50/ 3 | 4 | MODE=local 5 | 6 | EXP_NAME=quickstart-gen 7 | TRIAL_NAME=$MODEL_FAMILY-$MODE 8 | 9 | python3 -m realhf.apps.quickstart gen \ 10 | mode=$MODE \ 11 | experiment_name=$EXP_NAME \ 12 | trial_name=$TRIAL_NAME \ 13 | exp_ctrl.total_train_epochs=1 \ 14 | exp_ctrl.save_freq_steps=null \ 15 | n_nodes=1 \ 16 | allocation_mode=manual \ 17 | model.type._class=$MODEL_FAMILY \ 18 | model.path=$SFT_MODEL_PATH \ 19 | dataset.path=.data/ppo_prompt.jsonl \ 20 | dataset.max_prompt_len=1024 \ 21 | dataset.train_bs_n_seqs=100 \ 22 | allocation.parallel.pipeline_parallel_size=1 \ 23 | allocation.parallel.model_parallel_size=2 \ 24 | allocation.parallel.data_parallel_size=4 \ 25 | gen.max_new_tokens=1024 \ 26 | gen.min_new_tokens=1024 \ 27 | gen.use_cuda_graph=True \ 28 | gen.top_p=0.9 gen.top_k=1000 -------------------------------------------------------------------------------- /examples/scripts/local/ppo.sh: -------------------------------------------------------------------------------- 1 | # MODEL_FAMILY specifies how the pretrained checkpoint is loaded, e.g., as a LLaMA model or a GPT model. 2 | # You can specify different model families for the SFT and the RW model, but you need to 3 | # re-tokenize the sequences if necessary. 4 | MODEL_FAMILY=llama 5 | 6 | # SFT_MODEL_PATH and RW_MODEL_PATH are the saved SFT and RW checkpoints. 7 | # ReaL saves checkpoints with the same format as HuggingFace, 8 | # so you don't need to convert or split checkpoints explicitly. 9 | # You can also directly use the pre-trained HuggingFace checkpoint, but this 10 | # will not ensure the optimal algorithm performance. 11 | SFT_MODEL_PATH=/lustre/aigc/llm/checkpoints/fw/quickstart-sft/$MODEL_FAMILY-local-manual/default/epoch7epochstep5globalstep50/ 12 | RW_MODEL_PATH=/lustre/aigc/llm/checkpoints/fw/quickstart-rw/$MODEL_FAMILY-ray-manual/default/epoch1epochstep10globalstep10/ 13 | 14 | # Option 1: The experiment runs locally with subprocesses. 15 | MODE=local 16 | # Option 2: The experiment runs in a Ray cluster 17 | # MODE=ray 18 | # Option 3: The experiment runs in a SLURM + pyxis cluster 19 | # Using the slurm mode requires a cluster spec file 20 | # and setting CLUSTER_SPEC_PATH to the path of it. 
21 | # MODE=slurm 22 | 23 | # `experiment_name` and `trial_name` can be arbitrary. 24 | # Logs and saved checkpoints will be indexed by them. 25 | EXP_NAME=quickstart-ppo 26 | TRIAL_NAME=$MODEL_FAMILY-$MODE-heuristic 27 | 28 | # We use the "heuristic" allocation mode here to automatically determine the parallelism strategy 29 | # for each model function call, i.e., actor generation, critic inference, actor train, etc. 30 | # The number of GPUs is `n_nodes` * `n_gpus_per_node` (not set explictly here, defaults to 8). 31 | # ReaL will make full use of these available GPUs to design allocations. 32 | # This does not ensure the optimal throughput, but it is a good starting point. 33 | 34 | # The `heuristic` allocation mode is not ensured to run with every model configurations. 35 | # For example, if the vocabulary size is an odd number, the model parallelism may not work. 36 | # In these cases, you can use the `ppo_manual.sh` to specify the parallelism strategy manually. 37 | 38 | # The `ppo` subcommand specifies that this is a PPO experiment. 39 | # The `save_freq_steps` is set to `null` to disable saving checkpoints. 40 | # Enable it if you want to save checkpoints. 41 | # The `ppo` option is used to control the generation and PPO algorithm hyperparameters. 42 | # Note that the performance of PPO is sensitive to the the pre-trained model and hyperparameters. 43 | # It's the user's responsibility to tune them appropriately. 44 | python3 -m realhf.apps.quickstart ppo \ 45 | mode=$MODE \ 46 | experiment_name=$EXP_NAME \ 47 | trial_name=$TRIAL_NAME \ 48 | exp_ctrl.total_train_epochs=1 \ 49 | exp_ctrl.save_freq_steps=null \ 50 | n_nodes=1 \ 51 | allocation_mode=heuristic \ 52 | actor.type._class=$MODEL_FAMILY \ 53 | actor.path=$SFT_MODEL_PATH \ 54 | critic.type._class=$MODEL_FAMILY \ 55 | critic.type.is_critic=True \ 56 | critic.path=$RW_MODEL_PATH \ 57 | ref.type._class=$MODEL_FAMILY \ 58 | ref.path=$SFT_MODEL_PATH \ 59 | rew.type._class=$MODEL_FAMILY \ 60 | rew.type.is_critic=True \ 61 | rew.path=$RW_MODEL_PATH \ 62 | dataset.path=.data/ppo_prompt.jsonl \ 63 | dataset.max_prompt_len=128 \ 64 | dataset.train_bs_n_seqs=128 \ 65 | ppo.gen.max_new_tokens=512 \ 66 | ppo.gen.min_new_tokens=512 \ 67 | ppo.gen.use_cuda_graph=True \ 68 | ppo.gen.top_p=0.9 ppo.gen.top_k=1000 \ 69 | ppo.ppo_n_minibatches=4 \ 70 | ppo.kl_ctl=0.1 \ 71 | ppo.value_eps_clip=0.2 \ 72 | ppo.reward_output_scaling=1.0 \ 73 | ppo.adv_norm=True ppo.value_norm=True -------------------------------------------------------------------------------- /examples/scripts/local/ppo_minibatched.sh: -------------------------------------------------------------------------------- 1 | # MODEL_FAMILY specifies how the pretrained checkpoint is loaded, e.g., as a LLaMA model or a GPT model. 2 | # You can specify different model families for the SFT and the RW model, but you need to 3 | # re-tokenize the sequences if necessary. 4 | MODEL_FAMILY=llama 5 | 6 | # SFT_MODEL_PATH and RW_MODEL_PATH are the saved SFT and RW checkpoints. 7 | # ReaL saves checkpoints with the same format as HuggingFace, 8 | # so you don't need to convert or split checkpoints explicitly. 9 | # You can also directly use the pre-trained HuggingFace checkpoint, but this 10 | # will not ensure the optimal algorithm performance. 
11 | SFT_MODEL_PATH=/lustre/aigc/llm/checkpoints/fw/quickstart-sft/$MODEL_FAMILY-local-manual/default/epoch7epochstep5globalstep50/ 12 | RW_MODEL_PATH=/lustre/aigc/llm/checkpoints/fw/quickstart-rw/$MODEL_FAMILY-ray-manual/default/epoch1epochstep10globalstep10/ 13 | 14 | # Option 1: The experiment runs locally with subprocesses. 15 | MODE=local 16 | # Option 2: The experiment runs in a Ray cluster 17 | # MODE=ray 18 | # Option 3: The experiment runs in a SLURM + pyxis cluster 19 | # Using the slurm mode requires a cluster spec file 20 | # and setting CLUSTER_SPEC_PATH to the path of it. 21 | # MODE=slurm 22 | 23 | # `experiment_name` and `trial_name` can be arbitrary. 24 | # Logs and saved checkpoints will be indexed by them. 25 | EXP_NAME=quickstart-ppo 26 | TRIAL_NAME=$MODEL_FAMILY-$MODE-heuristic 27 | 28 | # We use the "heuristic" allocation mode here to automatically determine the parallelism strategy 29 | # for each model function call, i.e., actor generation, critic inference, actor train, etc. 30 | # The number of GPUs is `n_nodes` * `n_gpus_per_node` (not set explicitly here, defaults to 8). 31 | # ReaL will make full use of these available GPUs to design allocations. 32 | # This does not ensure the optimal throughput, but it is a good starting point. 33 | 34 | # The `heuristic` allocation mode is not guaranteed to run with every model configuration. 35 | # For example, if the vocabulary size is an odd number, the model parallelism may not work. 36 | # In these cases, you can use `ppo_manual.sh` to specify the parallelism strategy manually. 37 | 38 | # The `ppo` subcommand specifies that this is a PPO experiment. 39 | # The `save_freq_steps` is set to `null` to disable saving checkpoints. 40 | # Enable it if you want to save checkpoints. 41 | # The `ppo` option is used to control the generation and PPO algorithm hyperparameters. 42 | # Note that the performance of PPO is sensitive to the pre-trained model and hyperparameters. 43 | # It's the user's responsibility to tune them appropriately.
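# Compared with `ppo.sh`, the command below also sets the number of micro-batches
# (`*_n_mbs`) for each model function call, so a large batch is processed in several
# smaller pieces. As a rough, hypothetical sketch of the arithmetic: with
# `dataset.train_bs_n_seqs=1024` and `actor_train.n_mbs=4`, each actor training step
# handles its batch in 4 chunks of roughly 1024 / 4 = 256 sequences (ignoring any
# data-parallel splitting). Increasing the `n_mbs` values lowers peak GPU memory per
# function call at some cost in throughput; tune them for your hardware.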
44 | python3 -m realhf.apps.quickstart ppo \ 45 | mode=$MODE \ 46 | experiment_name=$EXP_NAME \ 47 | trial_name=$TRIAL_NAME \ 48 | exp_ctrl.total_train_epochs=1 \ 49 | exp_ctrl.save_freq_steps=null \ 50 | n_nodes=1 \ 51 | allocation_mode=heuristic \ 52 | actor.type._class=$MODEL_FAMILY \ 53 | actor.path=$SFT_MODEL_PATH \ 54 | critic.type._class=$MODEL_FAMILY \ 55 | critic.type.is_critic=True \ 56 | critic.path=$RW_MODEL_PATH \ 57 | ref.type._class=$MODEL_FAMILY \ 58 | ref.path=$SFT_MODEL_PATH \ 59 | rew.type._class=$MODEL_FAMILY \ 60 | rew.type.is_critic=True \ 61 | rew.path=$RW_MODEL_PATH \ 62 | dataset.path=.data/ppo_prompt.jsonl \ 63 | dataset.max_prompt_len=128 \ 64 | dataset.train_bs_n_seqs=1024 \ 65 | ppo.gen.max_new_tokens=512 \ 66 | ppo.gen.min_new_tokens=512 \ 67 | ppo.gen.use_cuda_graph=True \ 68 | ppo.gen.force_no_logits_mask=True \ 69 | ppo.gen.top_p=0.9 ppo.gen.top_k=1000 \ 70 | ppo.ppo_n_minibatches=4 \ 71 | ppo.kl_ctl=0.1 \ 72 | ppo.value_eps_clip=0.2 \ 73 | ppo.reward_output_scaling=1.0 \ 74 | ppo.adv_norm=True ppo.value_norm=True \ 75 | actor_gen.n_mbs=2 \ 76 | actor_train.n_mbs=4 \ 77 | critic_inf.n_mbs=4 \ 78 | critic_train.n_mbs=4 \ 79 | rew_inf.n_mbs=2 \ 80 | ref_inf.n_mbs=8 -------------------------------------------------------------------------------- /examples/scripts/local/ppo_symm.sh: -------------------------------------------------------------------------------- 1 | # MODEL_FAMILY specifies how the pretrained checkpoint is loaded, e.g., as a LLaMA model or a GPT model. 2 | # You can specify different model families for the SFT and the RW model, but you need to 3 | # re-tokenize the sequences if necessary. 4 | MODEL_FAMILY=llama 5 | 6 | # SFT_MODEL_PATH and RW_MODEL_PATH are the saved SFT and RW checkpoints. 7 | # ReaL saves checkpoints with the same format as HuggingFace, 8 | # so you don't need to convert or split checkpoints explicitly. 9 | # You can also directly use the pre-trained HuggingFace checkpoint, but this 10 | # will not ensure the optimal algorithm performance. 11 | SFT_MODEL_PATH=/lustre/aigc/llm/checkpoints/fw/quickstart-sft/$MODEL_FAMILY-local-manual/default/epoch7epochstep5globalstep50/ 12 | RW_MODEL_PATH=/lustre/aigc/llm/checkpoints/fw/quickstart-rw/$MODEL_FAMILY-ray-manual/default/epoch1epochstep10globalstep10/ 13 | 14 | # Option 1: The experiment runs locally with subprocesses. 15 | MODE=local 16 | # Option 2: The experiment runs in a Ray cluster 17 | # MODE=ray 18 | # Option 3: The experiment runs in a SLURM + pyxis cluster 19 | # Using the slurm mode requires a cluster spec file 20 | # and setting CLUSTER_SPEC_PATH to the path of it. 21 | # MODE=slurm 22 | 23 | # `experiment_name` and `trial_name` can be arbitrary. 24 | # Logs and saved checkpoints will be indexed by them. 25 | EXP_NAME=quickstart-ppo 26 | TRIAL_NAME=$MODEL_FAMILY-$MODE-manual 27 | 28 | # When using the "manual" allocation mode, the user should specify the device allocation 29 | # and parallel strategies for each model function call. 30 | # The number of GPUs is `n_nodes` * `n_gpus_per_node` (not set explicitly here, defaults to 8). 31 | # We provide a template in the following command and the user can modify it according to 32 | # the specific model and the available GPUs. 33 | 34 | # The `ppo` subcommand specifies that this is a PPO experiment. 35 | # The `save_freq_steps` is set to `null` to disable saving checkpoints. 36 | # Enable it if you want to save checkpoints. 37 | # The `ppo` option is used to control the generation and PPO algorithm hyperparameters.
38 | # Note that the performance of PPO is sensitive to the pre-trained model and hyperparameters. 39 | # It's the user's responsibility to tune them appropriately. 40 | # The allocation of model function calls is specified by a pattern `hostname:gpu_id1,gpu_id2,...` 41 | # for slicing GPUs of a single node, and `hostname1,hostname2` for multiple nodes. 42 | # Only 1, 2, 4, 8 GPUs on a single node or multiple complete nodes (e.g., 16, 24) are supported. 43 | # If the CLUSTER_SPEC_PATH is not set, `hostname`s are NODE01, NODE02, etc., otherwise it's the 44 | # hostname specified in this file. The `gpu_id`s are the GPU indices on the host, 45 | # from 0 to `n_gpus_per_node` (defaults to 8, can be changed) - 1. 46 | # Once allocations are all set, parallel strategies can be specified as long as the world size 47 | # equals the number of GPUs in the allocation. 48 | 49 | # The following command shows an example of manual allocation on two nodes, 50 | # but it can be modified according to the specific model and the available GPUs. 51 | unset CLUSTER_SPEC_PATH 52 | python3 -m realhf.apps.quickstart ppo \ 53 | mode=$MODE \ 54 | experiment_name=$EXP_NAME \ 55 | trial_name=$TRIAL_NAME \ 56 | exp_ctrl.total_train_epochs=1 \ 57 | exp_ctrl.save_freq_steps=null \ 58 | actor.type._class=$MODEL_FAMILY \ 59 | actor.path=$SFT_MODEL_PATH \ 60 | actor.optimizer.lr_scheduler_type=constant \ 61 | actor.optimizer.lr=1e-4 \ 62 | actor.optimizer.warmup_steps_proportion=0.0 \ 63 | critic.type._class=$MODEL_FAMILY \ 64 | critic.type.is_critic=True \ 65 | critic.path=$RW_MODEL_PATH \ 66 | ref.type._class=$MODEL_FAMILY \ 67 | ref.path=$SFT_MODEL_PATH \ 68 | rew.type._class=$MODEL_FAMILY \ 69 | rew.type.is_critic=True \ 70 | rew.path=$RW_MODEL_PATH \ 71 | dataset.path=.data/ppo_prompt.jsonl \ 72 | dataset.max_prompt_len=128 \ 73 | dataset.train_bs_n_seqs=128 \ 74 | ppo.gen.max_new_tokens=512 \ 75 | ppo.gen.min_new_tokens=512 \ 76 | ppo.gen.top_p=0.9 ppo.gen.top_k=1000 \ 77 | ppo.ppo_n_minibatches=4 \ 78 | ppo.kl_ctl=0.1 \ 79 | ppo.value_eps_clip=0.2 \ 80 | ppo.reward_output_scaling=10.0 \ 81 | ppo.adv_norm=True ppo.value_norm=True \ 82 | allocation_mode=m2d2p2 \ 83 | actor_gen.n_mbs=2 \ 84 | actor_train.n_mbs=4 \ 85 | ref_inf.n_mbs=2 86 | -------------------------------------------------------------------------------- /examples/scripts/local/rw.sh: -------------------------------------------------------------------------------- 1 | # MODEL_FAMILY specifies how the pretrained checkpoint is loaded, e.g., as a LLaMA model or a GPT model. 2 | MODEL_FAMILY=llama 3 | 4 | # PRETRAINED_PATH is the HuggingFace checkpoint or the saved SFT checkpoint. 5 | # The latter is the common practice. 6 | # ReaL saves checkpoints with the same format as HuggingFace, 7 | # so you don't need to convert or split checkpoints explicitly. 8 | # HF pretrained checkpoint 9 | PRETRAINED_PATH=/lustre/public/pretrained_model_weights/Llama-2-7b-hf 10 | # or SFT checkpoint 11 | PRETRAINED_PATH=/lustre/aigc/llm/checkpoints/fw/quickstart-sft/llama-local-manual/default/epoch7epochstep5globalstep50/ 12 | 13 | # Option 1: The experiment runs locally with subprocesses. 14 | MODE=local 15 | # Option 2: The experiment runs in a Ray cluster 16 | # MODE=ray 17 | # Option 3: The experiment runs in a SLURM + pyxis cluster 18 | # Using the slurm mode requires a cluster spec file 19 | # and setting CLUSTER_SPEC_PATH to the path of it. 20 | # MODE=slurm 21 | 22 | # `experiment_name` and `trial_name` can be arbitrary.
23 | # Logs and saved checkpoints will be indexed by them. 24 | EXP_NAME=quickstart-rw 25 | TRIAL_NAME=$MODEL_FAMILY-$MODE-manual 26 | 27 | # We use the "manual" allocation mode here to manually specify the parallelism strategy, 28 | # which is pipeline=2, tensor-model=2, and data=2, using 8 GPUs in total. 29 | 30 | # The `rw` subcommand specifies that this is a reward modeling experiment. 31 | # The reward modeling experiment converges very fast, so we set a smaller 32 | # `total_train_epochs` and `save_freq_steps` for demonstration. 33 | # Note that we set `model.type.is_critic=True` to initialize a reward model from the LLM 34 | # by re-initializing the LM head. 35 | python3 -m realhf.apps.quickstart rw \ 36 | mode=$MODE \ 37 | experiment_name=$EXP_NAME \ 38 | trial_name=$TRIAL_NAME \ 39 | exp_ctrl.total_train_epochs=2 \ 40 | exp_ctrl.save_freq_steps=10 \ 41 | exp_ctrl.eval_freq_epochs=1 \ 42 | model.optimizer.type=adam \ 43 | model.optimizer.lr_scheduler_type=cosine \ 44 | model.optimizer.lr=1e-5 \ 45 | model.optimizer.warmup_steps_proportion=0.02 \ 46 | model.type._class=$MODEL_FAMILY \ 47 | model.type.is_critic=True \ 48 | model.path=$PRETRAINED_PATH \ 49 | dataset.train_path=.data/rm_paired-train.jsonl \ 50 | dataset.valid_path=.data/rm_paired-valid.jsonl \ 51 | dataset.max_seqlen=1024 \ 52 | dataset.train_bs_n_seqs=512 \ 53 | dataset.valid_bs_n_seqs=512 \ 54 | allocation_mode=manual \ 55 | n_nodes=1 \ 56 | allocation.parallel.pipeline_parallel_size=2 \ 57 | allocation.parallel.model_parallel_size=2 \ 58 | allocation.parallel.data_parallel_size=2 \ 59 | allocation.parallel.use_sequence_parallel=True -------------------------------------------------------------------------------- /examples/scripts/local/sft.sh: -------------------------------------------------------------------------------- 1 | # MODEL_FAMILY specifies how the pretrained checkpoint is loaded, e.g., as a LLaMA model or a GPT model. 2 | MODEL_FAMILY=llama 3 | 4 | # PRETRAINED_PATH is the HuggingFace checkpoint. 5 | PRETRAINED_PATH=/lustre/public/pretrained_model_weights/Llama-2-7b-hf 6 | 7 | # Option 1: The experiment runs locally with subprocesses. 8 | MODE=local 9 | # Option 2: The experiment runs in a Ray cluster 10 | # MODE=ray 11 | # Option 3: The experiment runs in a SLURM + pyxis cluster 12 | # Using the slurm mode requires a cluster spec file 13 | # and setting CLUSTER_SPEC_PATH to the path of it. 14 | # MODE=slurm 15 | 16 | # `experiment_name` and `trial_name` can be arbitrary. 17 | # Logs and saved checkpoints will be indexed by them. 18 | EXP_NAME=quickstart-sft 19 | TRIAL_NAME=$MODEL_FAMILY-$MODE-manual 20 | 21 | # We use the "manual" allocation mode here to manually specify the parallelism strategy, 22 | # which is pipeline=2, tensor-model=2, and data=2, using 8 GPUs in total. 23 | 24 | # The `sft` subcommand specifies that this is a supervised fine-tuning experiment.
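# As a quick sanity check on the numbers used below (a sketch assuming the default
# n_gpus_per_node=8): the product of the parallel degrees must match the number of GPUs
# in the allocation, i.e.
#   pipeline_parallel_size * model_parallel_size * data_parallel_size = 2 * 2 * 2 = 8
#   = n_nodes * n_gpus_per_node = 1 * 8.
# If you change one of the degrees, adjust the others (or the GPU count) so that the
# product still equals the world size.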
25 | python3 -m realhf.apps.quickstart sft \ 26 | mode=$MODE \ 27 | experiment_name=$EXP_NAME \ 28 | trial_name=$TRIAL_NAME \ 29 | exp_ctrl.total_train_epochs=8 \ 30 | exp_ctrl.save_freq_steps=50 \ 31 | exp_ctrl.eval_freq_epochs=1 \ 32 | model.optimizer.type=adam \ 33 | model.optimizer.lr_scheduler_type=cosine \ 34 | model.optimizer.lr=1e-5 \ 35 | model.optimizer.warmup_steps_proportion=0.02 \ 36 | model.type._class=$MODEL_FAMILY \ 37 | model.path=$PRETRAINED_PATH \ 38 | dataset.train_path=.data/sft_pos-train.jsonl \ 39 | dataset.valid_path=.data/sft_pos-train.jsonl \ 40 | dataset.max_seqlen=1024 \ 41 | dataset.train_bs_n_seqs=2048 \ 42 | dataset.valid_bs_n_seqs=512 \ 43 | allocation_mode=manual \ 44 | n_nodes=1 \ 45 | allocation.parallel.pipeline_parallel_size=2 \ 46 | allocation.parallel.model_parallel_size=2 \ 47 | allocation.parallel.data_parallel_size=2 \ 48 | allocation.parallel.use_sequence_parallel=True -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0", "packaging", "torch", "pybind11>=2.10.0", "build>=1.2.1"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "realhf" 7 | description = "ReaL: Efficient RLHF Training of Large Language Models with Parameter Reallocation" 8 | readme = "README.md" 9 | requires-python = ">=3.10,<3.12" 10 | dynamic = ["version"] 11 | authors = [ 12 | { name = "Zhiyu Mei", email = "meizy20@mails.tsinghua.edu.cn" }, 13 | { name = "Wei Fu", email = "fuwth17@gmail.com" }, 14 | ] 15 | maintainers = [ 16 | { name = "Zhiyu Mei", email = "meizy20@mails.tsinghua.edu.cn" }, 17 | { name = "Wei Fu", email = "fuwth17@gmail.com" }, 18 | ] 19 | keywords = [ 20 | "distributed-systems", 21 | "reinforcement-learning-from-human-feedback", 22 | "large-language-models", 23 | "llm-training", 24 | ] 25 | classifiers = [ 26 | # 3 - Alpha 27 | # 4 - Beta 28 | # 5 - Production/Stable 29 | "Development Status :: 2 - Pre-Alpha", 30 | "Environment :: GPU :: NVIDIA CUDA :: 12 :: 12.2", 31 | "Intended Audience :: Developers", 32 | "Programming Language :: Python :: 3", 33 | "Programming Language :: Python :: 3.10", 34 | ] 35 | 36 | [project.urls] 37 | Homepage = "https://github.com/openpsi-project/ReaLHF" 38 | Issues = "https://github.com/openpsi-project/ReaLHF/issues" 39 | Documentation = "https://openpsi-project.github.io/ReaLHF/" 40 | Repository = "https://github.com/openpsi-project/ReaLHF" 41 | 42 | [tool.setuptools.dynamic] 43 | version = {attr = "realhf.__version__"} 44 | 45 | [tool.setuptools.packages.find] 46 | where = ["."] # ["."] by default 47 | # include = ["csrc/*", "realhf/*"] # ["*"] by default 48 | # exclude = ["tests", "docker"] # empty by default 49 | # namespaces = false # true by default 50 | 51 | [tool.isort] 52 | profile = "black" 53 | 54 | [tool.pytest.ini_options] 55 | pythonpath = ["."] 56 | 57 | [tool.black] 58 | line-length = 88 59 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | filterwarnings = 3 | ignore::DeprecationWarning 4 | ignore::UserWarning -------------------------------------------------------------------------------- /realhf/__init__.py: -------------------------------------------------------------------------------- 1 | # Re-import these classes for clear documentation, 2 | # otherwise the name will 
have a long prefix like 3 | # realhf.api.quickstart.model.ModelTrainEvalConfig. 4 | from .api.core.config import ModelFamily, ModelName, ModelShardID 5 | from .api.core.data_api import SequenceSample 6 | from .api.core.dfg import MFCDef 7 | from .api.core.model_api import ( 8 | FinetuneSpec, 9 | GenerationHyperparameters, 10 | Model, 11 | ModelBackend, 12 | ModelInterface, 13 | ModelVersion, 14 | PipelinableEngine, 15 | ReaLModelConfig, 16 | ) 17 | from .api.quickstart.dataset import ( 18 | PairedComparisonDatasetConfig, 19 | PromptAnswerDatasetConfig, 20 | PromptOnlyDatasetConfig, 21 | ) 22 | from .api.quickstart.device_mesh import MFCConfig 23 | from .api.quickstart.model import ( 24 | ModelTrainEvalConfig, 25 | OptimizerConfig, 26 | ParallelismConfig, 27 | ) 28 | from .experiments.common.common import CommonExperimentConfig, ExperimentSaveEvalControl 29 | from .experiments.common.dpo_exp import DPOConfig 30 | from .experiments.common.gen_exp import GenerationConfig 31 | from .experiments.common.ppo_exp import PPOConfig, PPOHyperparameters 32 | from .experiments.common.rw_exp import RWConfig 33 | from .experiments.common.sft_exp import SFTConfig 34 | 35 | __version__ = "0.3.0" 36 | -------------------------------------------------------------------------------- /realhf/api/from_hf/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | 4 | from realhf.base.importing import import_module 5 | 6 | import_module(os.path.dirname(__file__), re.compile(r"^(?!.*__init__).*\.py$")) 7 | -------------------------------------------------------------------------------- /realhf/api/from_hf/gemma.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import * 3 | 4 | import torch 5 | import transformers 6 | 7 | from realhf.api.core.model_api import ReaLModelConfig, register_hf_family 8 | from realhf.base.testing import ( 9 | TESTING_MODEL_HEAD_DIM, 10 | TESTING_MODEL_HIDDEN_SIZE, 11 | TESTING_MODEL_INTERMEDIATE_SIZE, 12 | TESTING_MODEL_N_HEADS, 13 | TESTING_MODEL_N_LAYERS, 14 | TESTING_MODEL_N_POSITIONS, 15 | TESTING_MODEL_VOCAB_SIZE, 16 | ) 17 | 18 | from .llama import ( 19 | convert_state_dict_llama, 20 | llama_embedding_layer_names, 21 | llama_output_head_param_name, 22 | llama_transformer_block_param_name, 23 | to_llama_state_dict, 24 | ) 25 | 26 | 27 | def convert_config_gemma( 28 | hf_config: transformers.GemmaConfig, 29 | ) -> ReaLModelConfig: 30 | if hf_config.hidden_activation is None: 31 | act = "gelu_pytorch_tanh" 32 | else: 33 | act = hf_config.hidden_activation 34 | return ReaLModelConfig( 35 | n_layers=hf_config.num_hidden_layers, 36 | n_kv_heads=hf_config.num_key_value_heads, 37 | hidden_dim=hf_config.hidden_size, 38 | n_q_heads=hf_config.num_attention_heads, 39 | head_dim=hf_config.head_dim, 40 | intermediate_dim=hf_config.intermediate_size, 41 | vocab_size=hf_config.vocab_size, 42 | n_positions=hf_config.max_position_embeddings, 43 | embd_pdrop=0.0, 44 | attn_pdrop=( 45 | hf_config.attention_dropout 46 | if hasattr(hf_config, "attention_dropout") 47 | else 0.1 48 | ), 49 | layer_norm_epsilon=hf_config.rms_norm_eps, 50 | activation_function=act, # NOTE: here is different than LLaMA 51 | use_attention_bias=hf_config.attention_bias, 52 | use_attn_proj_bias=hf_config.attention_bias, 53 | scale_attn_by_inverse_layer_idx=False, 54 | layer_norm_type="gemma", 55 | mlp_type="llama", 56 | apply_rotary=True, 57 | rotary_base=hf_config.rope_theta, 58 | 
rotary_interleaved=False, 59 | tied_embedding=hf_config.tie_word_embeddings, 60 | normalize_embed=True, 61 | ) 62 | 63 | 64 | def convert_config_back_gemma( 65 | config: ReaLModelConfig, 66 | ) -> transformers.GemmaConfig: 67 | return transformers.GemmaConfig( 68 | vocab_size=config.vocab_size, 69 | hidden_size=config.hidden_dim, 70 | intermediate_size=config.intermediate_dim, 71 | num_hidden_layers=config.n_layers, 72 | num_key_value_heads=config.n_kv_heads, 73 | num_attention_heads=config.n_q_heads, 74 | head_dim=config.head_dim, 75 | max_position_embeddings=config.n_positions, 76 | rms_norm_eps=config.layer_norm_epsilon, 77 | hidden_act=config.activation_function, 78 | hidden_activation=config.activation_function, 79 | attention_bias=config.use_attention_bias, 80 | attention_dropout=config.attn_pdrop, 81 | rope_theta=config.rotary_base, 82 | tie_word_embeddings=config.tied_embedding, 83 | architectures=["GemmaForCausalLM"], 84 | ) 85 | 86 | 87 | def gemma_config_maker() -> ReaLModelConfig: 88 | hf_config = transformers.GemmaConfig( 89 | attention_bias=False, 90 | hidden_act="gelu", 91 | hidden_size=TESTING_MODEL_HIDDEN_SIZE, 92 | intermediate_size=TESTING_MODEL_INTERMEDIATE_SIZE, 93 | max_position_embeddings=TESTING_MODEL_N_POSITIONS, 94 | num_attention_heads=TESTING_MODEL_N_HEADS, 95 | num_hidden_layers=TESTING_MODEL_N_LAYERS, 96 | num_key_value_heads=4, 97 | head_dim=TESTING_MODEL_HEAD_DIM, 98 | rms_norm_eps=1e-06, 99 | rope_theta=10000.0, 100 | vocab_size=TESTING_MODEL_VOCAB_SIZE, 101 | ) 102 | return convert_config_gemma(hf_config) 103 | 104 | 105 | register_hf_family( 106 | name="gemma", 107 | hf_cls_name="GemmaForCausalLM", 108 | config_from_hf_converter=convert_config_gemma, 109 | config_to_hf_converter=convert_config_back_gemma, 110 | sd_from_hf_converter=convert_state_dict_llama, 111 | sd_to_hf_converter=to_llama_state_dict, 112 | embedding_param_names=llama_embedding_layer_names, 113 | tblock_param_names=llama_transformer_block_param_name, 114 | head_param_names=llama_output_head_param_name, 115 | real_config_maker=gemma_config_maker, 116 | ) 117 | -------------------------------------------------------------------------------- /realhf/api/from_hf/mistral.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import transformers 3 | 4 | from realhf.api.core.model_api import ReaLModelConfig, register_hf_family 5 | from realhf.base.testing import ( 6 | TESTING_MODEL_HEAD_DIM, 7 | TESTING_MODEL_HIDDEN_SIZE, 8 | TESTING_MODEL_INTERMEDIATE_SIZE, 9 | TESTING_MODEL_N_HEADS, 10 | TESTING_MODEL_N_LAYERS, 11 | TESTING_MODEL_N_POSITIONS, 12 | TESTING_MODEL_VOCAB_SIZE, 13 | ) 14 | 15 | from .llama import ( 16 | convert_state_dict_llama, 17 | llama_embedding_layer_names, 18 | llama_output_head_param_name, 19 | llama_transformer_block_param_name, 20 | to_llama_state_dict, 21 | ) 22 | 23 | 24 | def config_from_mistral(hf_config: transformers.MistralConfig) -> ReaLModelConfig: 25 | return ReaLModelConfig( 26 | n_layers=hf_config.num_hidden_layers, 27 | vocab_size=hf_config.vocab_size, 28 | hidden_dim=hf_config.hidden_size, 29 | n_q_heads=hf_config.num_attention_heads, 30 | n_kv_heads=hf_config.num_key_value_heads, 31 | head_dim=hf_config.hidden_size // hf_config.num_attention_heads, 32 | intermediate_dim=hf_config.intermediate_size, 33 | activation_function=hf_config.hidden_act, 34 | n_positions=hf_config.max_position_embeddings, 35 | layer_norm_epsilon=hf_config.rms_norm_eps, 36 | layer_norm_type="rms", 37 | 
tied_embedding=hf_config.tie_word_embeddings, 38 | mlp_type="llama", 39 | rotary_base=hf_config.rope_theta, 40 | apply_rotary=True, 41 | attn_pdrop=hf_config.attention_dropout, 42 | resid_pdrop=0.0, 43 | use_attention_bias=False, 44 | use_attn_proj_bias=False, 45 | embd_pdrop=0.0, 46 | sliding_window=hf_config.sliding_window, 47 | scale_attn_by_inverse_layer_idx=False, 48 | ) 49 | 50 | 51 | def config_to_mistral(config: ReaLModelConfig) -> transformers.MistralConfig: 52 | return transformers.MistralConfig( 53 | num_hidden_layers=config.n_layers, 54 | vocab_size=config.vocab_size, 55 | hidden_size=config.hidden_dim, 56 | num_attention_heads=config.n_q_heads, 57 | num_key_value_heads=config.n_kv_heads, 58 | intermediate_size=config.intermediate_dim, 59 | hidden_act=config.activation_function, 60 | max_position_embeddings=config.n_positions, 61 | rms_norm_eps=config.layer_norm_epsilon, 62 | tie_word_embeddings=False, 63 | rope_theta=config.rotary_base, 64 | attention_dropout=config.attn_pdrop, 65 | sliding_window=config.sliding_window, 66 | architectures=["MistralForCausalLM"], 67 | ) 68 | 69 | 70 | def get_real_config_mistral() -> ReaLModelConfig: 71 | hf_config = transformers.MistralConfig( 72 | vocab_size=TESTING_MODEL_VOCAB_SIZE, 73 | max_position_embeddings=TESTING_MODEL_N_POSITIONS, 74 | hidden_size=TESTING_MODEL_HIDDEN_SIZE, 75 | intermediate_size=TESTING_MODEL_INTERMEDIATE_SIZE, 76 | num_hidden_layers=TESTING_MODEL_N_LAYERS, 77 | num_attention_heads=TESTING_MODEL_N_HEADS, 78 | num_key_value_heads=2, 79 | ) 80 | return config_from_mistral(hf_config) 81 | 82 | 83 | register_hf_family( 84 | "mistral", 85 | "MistralForCausalLM", 86 | config_from_hf_converter=config_from_mistral, 87 | config_to_hf_converter=config_to_mistral, 88 | sd_from_hf_converter=convert_state_dict_llama, 89 | sd_to_hf_converter=to_llama_state_dict, 90 | embedding_param_names=llama_embedding_layer_names, 91 | tblock_param_names=llama_transformer_block_param_name, 92 | head_param_names=llama_output_head_param_name, 93 | real_config_maker=get_real_config_mistral, 94 | ) 95 | -------------------------------------------------------------------------------- /realhf/api/from_hf/qwen2.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import * 3 | 4 | import torch 5 | import transformers 6 | 7 | from realhf.api.core.model_api import ReaLModelConfig, register_hf_family 8 | from realhf.base.testing import ( 9 | TESTING_MODEL_HEAD_DIM, 10 | TESTING_MODEL_HIDDEN_SIZE, 11 | TESTING_MODEL_INTERMEDIATE_SIZE, 12 | TESTING_MODEL_N_HEADS, 13 | TESTING_MODEL_N_LAYERS, 14 | TESTING_MODEL_N_POSITIONS, 15 | TESTING_MODEL_VOCAB_SIZE, 16 | ) 17 | 18 | from .llama import ( 19 | convert_state_dict_llama, 20 | llama_embedding_layer_names, 21 | llama_output_head_param_name, 22 | llama_transformer_block_param_name, 23 | to_llama_state_dict, 24 | ) 25 | 26 | 27 | def convert_config_qwen2( 28 | hf_config: transformers.Qwen2Config, 29 | ) -> ReaLModelConfig: 30 | return ReaLModelConfig( 31 | n_layers=hf_config.num_hidden_layers, 32 | n_kv_heads=hf_config.num_key_value_heads, 33 | hidden_dim=hf_config.hidden_size, 34 | n_q_heads=hf_config.num_attention_heads, 35 | head_dim=hf_config.hidden_size // hf_config.num_attention_heads, 36 | intermediate_dim=hf_config.intermediate_size, 37 | vocab_size=hf_config.vocab_size, 38 | n_positions=hf_config.max_position_embeddings, 39 | embd_pdrop=0.0, 40 | attn_pdrop=( 41 | hf_config.attention_dropout 42 | if hasattr(hf_config, "attention_dropout") 43 
| else 0.1 44 | ), 45 | layer_norm_epsilon=hf_config.rms_norm_eps, 46 | activation_function=hf_config.hidden_act, 47 | use_attention_bias=True, 48 | use_attn_proj_bias=False, 49 | scale_attn_by_inverse_layer_idx=False, 50 | layer_norm_type="rms", 51 | mlp_type="llama", 52 | apply_rotary=True, 53 | rotary_base=hf_config.rope_theta, 54 | rotary_interleaved=False, 55 | tied_embedding=hf_config.tie_word_embeddings, 56 | ) 57 | 58 | 59 | def convert_config_back_qwen2( 60 | config: ReaLModelConfig, 61 | ) -> transformers.Qwen2Config: 62 | return transformers.Qwen2Config( 63 | vocab_size=config.vocab_size, 64 | hidden_size=config.hidden_dim, 65 | intermediate_size=config.intermediate_dim, 66 | num_hidden_layers=config.n_layers, 67 | num_key_value_heads=config.n_kv_heads, 68 | num_attention_heads=config.n_q_heads, 69 | max_position_embeddings=config.n_positions, 70 | rms_norm_eps=config.layer_norm_epsilon, 71 | hidden_act=config.activation_function, 72 | attention_dropout=config.attn_pdrop, 73 | rope_theta=config.rotary_base, 74 | tie_word_embeddings=config.tied_embedding, 75 | architectures=["Qwen2ForCausalLM"], 76 | ) 77 | 78 | 79 | def qwen2_config_maker(): 80 | hf_config = transformers.Qwen2Config( 81 | vocab_size=TESTING_MODEL_VOCAB_SIZE, 82 | max_position_embeddings=TESTING_MODEL_N_POSITIONS, 83 | hidden_size=TESTING_MODEL_HIDDEN_SIZE, 84 | intermediate_size=TESTING_MODEL_INTERMEDIATE_SIZE, 85 | num_hidden_layers=TESTING_MODEL_N_LAYERS, 86 | num_attention_heads=TESTING_MODEL_N_HEADS, 87 | num_key_value_heads=8, 88 | hidden_act="silu", 89 | rms_norm_eps=1e-5, 90 | ) 91 | return convert_config_qwen2(hf_config) 92 | 93 | 94 | register_hf_family( 95 | name="qwen2", 96 | hf_cls_name="Qwen2ForCausalLM", 97 | config_from_hf_converter=convert_config_qwen2, 98 | config_to_hf_converter=convert_config_back_qwen2, 99 | sd_from_hf_converter=convert_state_dict_llama, 100 | sd_to_hf_converter=to_llama_state_dict, 101 | embedding_param_names=llama_embedding_layer_names, 102 | tblock_param_names=llama_transformer_block_param_name, 103 | head_param_names=llama_output_head_param_name, 104 | real_config_maker=qwen2_config_maker, 105 | ) 106 | -------------------------------------------------------------------------------- /realhf/api/quickstart/__init__.py: -------------------------------------------------------------------------------- 1 | # NOTE: required by hydra 2 | -------------------------------------------------------------------------------- /realhf/api/quickstart/dataset.py: -------------------------------------------------------------------------------- 1 | import dataclasses 2 | 3 | 4 | @dataclasses.dataclass 5 | class PromptAnswerDatasetConfig: 6 | """Configuration for datasets used in Supervised Fine-Tuning (SFT). 7 | 8 | The raw data must be in a JSON or JSONL file format, where each entry is a dictionary 9 | with the keys `prompt` and `answer`. Both `prompt` and `answer` must be strings. 10 | 11 | :param train_path: Path to the training dataset. 12 | :type train_path: str 13 | :param valid_path: Path to the validation dataset. 14 | :type valid_path: str 15 | :param max_seqlen: Maximum sequence length (prompt + answer). Sequences longer than 16 | this will be truncated. 17 | :type max_seqlen: int 18 | :param train_bs_n_seqs: Number of sequences in each batch during training. 19 | :type train_bs_n_seqs: int 20 | :param valid_bs_n_seqs: Number of sequences in each batch during validation. 21 | :type valid_bs_n_seqs: int 22 | :param pad_to_max_length: Whether to pad sequences to the maximum length. 
If True, 23 | all mini-batches created by the DP balanced partitioning algorithm will have 24 | the same number of tokens, making MFC time predictable. This option is used 25 | only for benchmarking purposes. 26 | :type pad_to_max_length: bool 27 | """ 28 | 29 | train_path: str = "" 30 | valid_path: str = "" 31 | max_seqlen: int = 1024 32 | train_bs_n_seqs: int = 256 33 | valid_bs_n_seqs: int = 256 34 | pad_to_max_length: bool = False 35 | 36 | 37 | @dataclasses.dataclass 38 | class PairedComparisonDatasetConfig: 39 | """Configuration for datasets used in paired-comparison reward modeling, 40 | DPO, and SimPO. 41 | 42 | The raw data must be in a JSON or JSONL file format, where each entry is a dictionary 43 | with the keys `prompt`, `pos_answers`, and `neg_answers`. `prompt` is a string, while 44 | `pos_answers` and `neg_answers` are lists of strings. The lists must have the same length. 45 | 46 | The raw dataset may contain multiple answer pairs for each prompt. In each epoch, we will 47 | randomly sample `max_pairs_per_prompt` answer pairs for each prompt, so the maximum batch 48 | size (in terms of the number of sequences) per step is `train_bs_n_seqs` multiplied by 49 | `max_pairs_per_prompt`. 50 | 51 | :param train_path: Path to the training dataset. 52 | :type train_path: str 53 | :param valid_path: Path to the evaluation dataset. 54 | :type valid_path: str 55 | :param max_pairs_per_prompt: Maximum number of answer pairs per prompt. 56 | :type max_pairs_per_prompt: int 57 | :param max_seqlen: Maximum sequence length (prompt + answers). Sequences longer than 58 | this will be truncated. 59 | :type max_seqlen: int 60 | :param train_bs_n_seqs: Number of sequences in each batch during training. 61 | :type train_bs_n_seqs: int 62 | :param valid_bs_n_seqs: Number of sequences in each batch during validation. 63 | :type valid_bs_n_seqs: int 64 | """ 65 | 66 | train_path: str = "" 67 | valid_path: str = "" 68 | max_pairs_per_prompt: int = 2 69 | max_seqlen: int = 1024 70 | train_bs_n_seqs: int = 256 71 | valid_bs_n_seqs: int = 256 72 | 73 | 74 | @dataclasses.dataclass 75 | class PromptOnlyDatasetConfig: 76 | """Configuration for datasets used in PPO RLHF. 77 | 78 | The raw data must be in a JSON or JSONL file format, where each entry is a dictionary 79 | with a single key called `prompt`, which is a string. 80 | 81 | :param path: Path to the dataset. 82 | :type path: str 83 | :param max_prompt_len: Maximum length of the prompt. Prompts longer than this will 84 | be truncated. 85 | :type max_prompt_len: int 86 | :param train_bs_n_seqs: Number of prompts in each batch. 87 | :type train_bs_n_seqs: int 88 | :param pad_to_max_length: Whether to pad prompts to the maximum length. If True, 89 | all mini-batches created by the DP balanced partitioning algorithm will have 90 | the same number of tokens, making MFC time predictable. This option is used 91 | only for benchmarking purposes. 
92 | :type pad_to_max_length: bool 93 | """ 94 | 95 | path: str = "" 96 | max_prompt_len: int = 256 97 | train_bs_n_seqs: int = 256 98 | pad_to_max_length: bool = False 99 | -------------------------------------------------------------------------------- /realhf/api/quickstart/entrypoint.py: -------------------------------------------------------------------------------- 1 | import dataclasses 2 | import datetime 3 | import functools 4 | import inspect 5 | import json 6 | import os 7 | import pickle 8 | import subprocess 9 | from typing import Callable, Optional 10 | 11 | import hydra 12 | import omegaconf 13 | from hydra.core.config_store import ConfigStore 14 | from omegaconf import MISSING, OmegaConf 15 | 16 | import realhf.api.core.system_api as system_api 17 | from realhf.base.constants import LOG_ROOT, MODEL_SAVE_ROOT, QUICKSTART_EXPR_CACHE_PATH 18 | from realhf.base.ray_utils import check_ray_availability 19 | from realhf.base.slurm_utils import check_slurm_availability 20 | 21 | 22 | def kind_reminder(config_name, logger, args): 23 | logger.info(f"Running {config_name} experiment.") 24 | logger.info( 25 | f"Logs will be dumped to {os.path.join(LOG_ROOT, args.experiment_name, args.trial_name)}" 26 | ) 27 | logger.info( 28 | f"Model checkpoints will be saved to {os.path.join(MODEL_SAVE_ROOT, args.experiment_name, args.trial_name)}" 29 | ) 30 | 31 | if args.mode == "slurm": 32 | slurm_available = check_slurm_availability() 33 | if slurm_available: 34 | logger.info("Launching experiments with SLURM...") 35 | else: 36 | logger.warning("Slurm is not available. Using local mode.") 37 | args.mode = "local" 38 | elif args.mode == "ray": 39 | ray_available = check_ray_availability() 40 | if ray_available: 41 | logger.info("Launching experiments with RAY...") 42 | else: 43 | logger.warning("Ray is not available. 
Using local mode.") 44 | args.mode = "local" 45 | elif args.mode == "local": 46 | logger.info("Launching experiments locally.") 47 | else: 48 | raise ValueError(f"Invalid mode {args.mode}") 49 | 50 | 51 | cs = ConfigStore.instance() 52 | QUICKSTART_CONFIG_CLASSES = {} 53 | QUICKSTART_USERCODE_PATHS = {} 54 | QUICKSTART_FN = {} 55 | 56 | 57 | def register_quickstart_exp(config_name: str, exp_cls: Callable): 58 | usercode_path = os.path.abspath(inspect.getfile(inspect.currentframe().f_back)) 59 | 60 | @hydra.main(version_base=None, config_name=config_name) 61 | def run(args): 62 | # NOTE: we import logging here to avoid hydra logging overwrite 63 | import realhf.base.logging as logging 64 | 65 | logger = logging.getLogger("quickstart", "colored") 66 | 67 | exp_name = args.experiment_name 68 | if args.trial_name == MISSING: 69 | args.trial_name = trial_name = ( 70 | f"run{datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}" 71 | ) 72 | else: 73 | trial_name = args.trial_name 74 | from realhf.apps.main import main_start, main_stop 75 | 76 | kind_reminder(config_name, logger, args) 77 | 78 | exp_fn = functools.partial(exp_cls, **args) 79 | 80 | os.makedirs(os.path.dirname(QUICKSTART_EXPR_CACHE_PATH), exist_ok=True) 81 | cache_file = os.path.join( 82 | QUICKSTART_EXPR_CACHE_PATH, f"{exp_name}_{trial_name}.json" 83 | ) 84 | with open(cache_file, "w") as f: 85 | dict_args = OmegaConf.to_container(args) 86 | json.dump( 87 | dict( 88 | args=dict_args, 89 | usercode_path=usercode_path, 90 | config_name=config_name, 91 | ), 92 | f, 93 | indent=4, 94 | ensure_ascii=False, 95 | ) 96 | 97 | system_api.register_experiment(exp_name, exp_fn) 98 | 99 | try: 100 | main_start(args) 101 | except Exception as e: 102 | main_stop(args) 103 | logger.warning("Exception occurred. 
Stopping all workers.") 104 | raise e 105 | 106 | cs.store(name=config_name, node=exp_cls) 107 | 108 | # assert config_name not in QUICKSTART_CONFIG_CLASSES 109 | QUICKSTART_CONFIG_CLASSES[config_name] = exp_cls 110 | # assert config_name not in QUICKSTART_USERCODE_PATHS 111 | QUICKSTART_USERCODE_PATHS[config_name] = usercode_path 112 | # assert config_name not in QUICKSTART_FN 113 | QUICKSTART_FN[config_name] = run 114 | return run 115 | -------------------------------------------------------------------------------- /realhf/api/quickstart/search.py: -------------------------------------------------------------------------------- 1 | import dataclasses 2 | from typing import List, Optional 3 | 4 | from realhf.api.core.dfg import MFCDef 5 | from realhf.api.quickstart.device_mesh import DeviceMesh 6 | from realhf.api.quickstart.model import ParallelismConfig 7 | 8 | 9 | @dataclasses.dataclass 10 | class RPCExecution: 11 | rpc: MFCDef 12 | device_mesh: DeviceMesh 13 | parallel_strategy: ParallelismConfig 14 | time_cost: Optional[int] = None 15 | mem: Optional[int] = None 16 | static_mem: Optional[int] = None 17 | 18 | def __repr__(self): 19 | return f"RPCExecution({self.rpc}, {self.device_mesh}, {self.parallel_strategy})" 20 | 21 | def __hash__(self): 22 | return hash( 23 | ( 24 | self.rpc.name, 25 | self.device_mesh.cluster_mesh, 26 | self.device_mesh.device_mesh_name, 27 | str(self.parallel_strategy), 28 | ) 29 | ) 30 | 31 | 32 | @dataclasses.dataclass 33 | class RPCInstance: 34 | rpc: MFCDef 35 | iteration_id: int 36 | parents: List[MFCDef] 37 | children: List[MFCDef] 38 | 39 | @property 40 | def name(self): 41 | return f"{self.rpc.name}:{self.iteration_id}" 42 | 43 | def __repr__(self): 44 | if len(self.parents) == 0 and len(self.children) == 0: 45 | return f"RPCInstance({self.rpc.name}, {self.iteration_id})" 46 | else: 47 | return ( 48 | f"RPCInstance({self.rpc.name}, {self.iteration_id}, " 49 | f"{self.parents}, {self.children})" 50 | ) 51 | 52 | def __hash__(self): 53 | return hash((self.rpc.name, self.iteration_id)) 54 | -------------------------------------------------------------------------------- /realhf/apps/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openpsi-project/ReaLHF/be75fce9931acb9298270fdda08fdca46b6ee8ba/realhf/apps/__init__.py -------------------------------------------------------------------------------- /realhf/apps/profile_layers.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import itertools 3 | import time 4 | 5 | import realhf.base.testing as testing 6 | 7 | BATCH_SIZE_RANGE = [1, 2, 4, 8, 16, 32, 64, 128] 8 | SEQ_LEN_RANGE = [128, 256, 512] 9 | 10 | 11 | def profile_layer_func( 12 | world_size, 13 | model_path, 14 | model_name, 15 | warm_up_rounds, 16 | profile_rounds, 17 | batch_size_range, 18 | seq_len_range, 19 | use_sequence_parallel=False, 20 | use_gradient_checkpointing=False, 21 | ): 22 | # FIXME: use_sequence_parallel=True and use_gradient_checkpointing=True will cause bugs 23 | import torch 24 | 25 | import realhf.base.constants as constants 26 | 27 | testing.init_global_constants( 28 | 1, world_size, 1, sequence_parallel=False, gradient_checkpointing=False 29 | ) 30 | device = torch.device("cuda") 31 | with constants.model_scope(testing.MODEL_NAME): 32 | from realhf.search_engine.layers import make_profile_layers 33 | 34 | profile_layers = make_profile_layers(device, model_path, model_name) 35 | 36 | st = 
time.monotonic_ns() 37 | for i in range(warm_up_rounds + profile_rounds): 38 | for bs, seq_len in itertools.product(batch_size_range, seq_len_range): 39 | profile_layers.fwd_gen(bs, seq_len) 40 | profile_layers.fwd_bwd_opt(bs, seq_len) 41 | 42 | if i < warm_up_rounds: 43 | profile_layers.reset_stats() 44 | profile_layers.make_dataframe_and_print() 45 | profile_layers.dump_stats(world_size) 46 | t = (time.monotonic_ns() - st) / int(1e9) 47 | print(f"profile world size {world_size} cost {t:4f} seconds") 48 | 49 | 50 | if __name__ == "__main__": 51 | st = time.monotonic_ns() 52 | parser = argparse.ArgumentParser(prog="profile_layers") 53 | parser.add_argument( 54 | "--model_path", 55 | type=str, 56 | required=True, 57 | ) 58 | parser.add_argument("--expr_name", type=str, default="profile") 59 | parser.add_argument("--trial_name", type=str, default="profile") 60 | parser.add_argument("--model_name", type=str, default="Llama-2-70b") 61 | parser.add_argument("--warm_up_rounds", type=int, default=1) 62 | parser.add_argument("--profile_rounds", type=int, default=3) 63 | # parser.add_argument("--use_sequence_parallel", action="store_true") 64 | # parser.add_argument("--use_gradient_checkpointing", action="store_true") 65 | args = parser.parse_args() 66 | 67 | world_sizes = [1, 2, 4, 8] 68 | 69 | for world_size in world_sizes: 70 | testing.clear_name_resolve(args.expr_name, args.trial_name) 71 | mp = testing.LocalMultiProcessTest( 72 | world_size, 73 | profile_layer_func, 74 | world_size, 75 | args.model_path, 76 | args.model_name, 77 | args.warm_up_rounds, 78 | args.profile_rounds, 79 | BATCH_SIZE_RANGE, 80 | SEQ_LEN_RANGE, 81 | expr_name=args.expr_name, 82 | trial_name=args.trial_name, 83 | ) 84 | mp.launch() 85 | 86 | t = (time.monotonic_ns() - st) / int(1e9) 87 | print(f"profile model {args.model_name} time cost {t:4f} seconds") 88 | -------------------------------------------------------------------------------- /realhf/apps/quickstart.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | import getpass 4 | import pathlib 5 | import re 6 | import sys 7 | 8 | import hydra 9 | 10 | from realhf.api.quickstart.entrypoint import QUICKSTART_FN 11 | from realhf.base.cluster import spec as cluster_spec 12 | from realhf.base.importing import import_module 13 | 14 | # NOTE: Register all implemented experiments inside ReaL. 15 | import_module( 16 | str(pathlib.Path(__file__).resolve().parent.parent / "experiments" / "common"), 17 | re.compile(r".*_exp\.py$"), 18 | ) 19 | import realhf.experiments.benchmark.profile_exp 20 | 21 | 22 | def main(): 23 | parser = argparse.ArgumentParser(prog="ReaL Quickstart") 24 | subparsers = parser.add_subparsers(dest="cmd", help="sub-command help") 25 | subparsers.required = True 26 | for k, v in QUICKSTART_FN.items(): 27 | subparser = subparsers.add_parser(k) 28 | subparser.set_defaults(func=v) 29 | args = parser.parse_known_args()[0] 30 | 31 | launch_hydra_task(args.cmd, QUICKSTART_FN[args.cmd]) 32 | 33 | 34 | def launch_hydra_task(name: str, func: hydra.TaskFunction): 35 | # Disable hydra logging. 
36 | if not any("hydra/job_logging=disabled" in x for x in sys.argv): 37 | sys.argv += ["hydra/job_logging=disabled"] 38 | 39 | if any("experiment_name=" in x for x in sys.argv): 40 | experiment_name = next(x for x in sys.argv if "experiment_name=" in x).split( 41 | "=" 42 | )[1] 43 | if "_" in experiment_name: 44 | raise RuntimeError("experiment_name should not contain `_`.") 45 | else: 46 | experiment_name = f"quickstart-{name}" 47 | print(f"Experiment name not manually set. Default to {experiment_name}.") 48 | sys.argv += [f"experiment_name={experiment_name}"] 49 | 50 | if ( 51 | "--multirun" in sys.argv 52 | or "hydra.mode=MULTIRUN" in sys.argv 53 | or "-m" in sys.argv 54 | ): 55 | raise NotImplementedError("Hydra multi-run is not supported.") 56 | # non-multirun mode, add trial_name and hydra run dir 57 | if any("trial_name=" in x for x in sys.argv): 58 | trial_name = next(x for x in sys.argv if "trial_name=" in x).split("=")[1] 59 | else: 60 | trial_name = f"run{datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}" 61 | sys.argv += [f"trial_name={trial_name}"] 62 | if "_" in trial_name: 63 | raise RuntimeError("trial_name should not contain `_`.") 64 | sys.argv += [ 65 | f"hydra.run.dir={cluster_spec.fileroot}/logs/{getpass.getuser()}/" 66 | f"{experiment_name}/{trial_name}/hydra-outputs/" 67 | ] 68 | 69 | sys.argv.pop(1) 70 | 71 | func() 72 | 73 | 74 | if __name__ == "__main__": 75 | main() 76 | -------------------------------------------------------------------------------- /realhf/base/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openpsi-project/ReaLHF/be75fce9931acb9298270fdda08fdca46b6ee8ba/realhf/base/__init__.py -------------------------------------------------------------------------------- /realhf/base/asyncio_utils.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import dataclasses 3 | import sys 4 | import threading 5 | from asyncio.base_events import _run_until_complete_cb 6 | 7 | 8 | @dataclasses.dataclass 9 | class AsyncRunUntilCompleteContext: 10 | loop: asyncio.BaseEventLoop 11 | future: asyncio.Future 12 | new_task: bool 13 | 14 | 15 | def setup_run_until_complete( 16 | loop: asyncio.BaseEventLoop, 17 | future: asyncio.Future, 18 | ) -> AsyncRunUntilCompleteContext: 19 | loop._check_closed() 20 | loop._check_running() 21 | 22 | new_task = not asyncio.futures.isfuture(future) 23 | future = asyncio.tasks.ensure_future(future, loop=loop) 24 | if new_task: 25 | # An exception is raised if the future didn't complete, so there 26 | # is no need to log the "destroy pending task" message 27 | future._log_destroy_pending = False 28 | 29 | future.add_done_callback(_run_until_complete_cb) 30 | 31 | # set up run forever 32 | loop._set_coroutine_origin_tracking(loop._debug) 33 | 34 | loop._old_agen_hooks = sys.get_asyncgen_hooks() 35 | loop._thread_id = threading.get_ident() 36 | sys.set_asyncgen_hooks( 37 | firstiter=loop._asyncgen_firstiter_hook, 38 | finalizer=loop._asyncgen_finalizer_hook, 39 | ) 40 | asyncio.events._set_running_loop(loop) 41 | return AsyncRunUntilCompleteContext(loop=loop, future=future, new_task=new_task) 42 | 43 | 44 | def teardown_run_util_complete(ctx: AsyncRunUntilCompleteContext): 45 | ctx.loop._stopping = False 46 | ctx.loop._thread_id = None 47 | asyncio.events._set_running_loop(None) 48 | ctx.loop._set_coroutine_origin_tracking(False) 49 | # Restore any pre-existing async generator hooks. 
50 | if ctx.loop._old_agen_hooks is not None: 51 | sys.set_asyncgen_hooks(*ctx.loop._old_agen_hooks) 52 | ctx.loop._old_agen_hooks = None 53 | 54 | ctx.future.remove_done_callback(_run_until_complete_cb) 55 | 56 | if not ctx.future.done(): 57 | raise RuntimeError("Event loop stopped before Future completed.") 58 | 59 | 60 | def raise_asyncio_exception( 61 | ctx: AsyncRunUntilCompleteContext, raise_error: bool = True 62 | ): 63 | if ctx.new_task and ctx.future.done() and not ctx.future.cancelled(): 64 | # The coroutine raised a BaseException. Consume the exception 65 | # to not log a warning, the caller doesn't have access to the 66 | # local task. 67 | ctx.future.exception() 68 | 69 | try: 70 | teardown_run_util_complete(ctx) 71 | except RuntimeError as e: 72 | if raise_error: 73 | raise e 74 | 75 | if raise_error: 76 | raise 77 | -------------------------------------------------------------------------------- /realhf/base/cluster.py: -------------------------------------------------------------------------------- 1 | import getpass 2 | import json 3 | import os 4 | import re 5 | from typing import Dict, List, Optional, Union 6 | 7 | CLUSTER_SPEC_PATH = os.environ.get("CLUSTER_SPEC_PATH", "") 8 | 9 | 10 | def get_user_tmp(): 11 | user = getpass.getuser() 12 | user_tmp = os.path.join("/home", user, ".cache", "realhf") 13 | os.makedirs(user_tmp, exist_ok=True) 14 | return user_tmp 15 | 16 | 17 | class ClusterSpec: 18 | def __init__(self): 19 | self.__loaded = False 20 | 21 | def load_spec_from_file(self, file_path: str): 22 | try: 23 | with open(file_path, "r") as f: 24 | spec: Dict = json.load(f) 25 | except FileNotFoundError: 26 | if file_path == "": 27 | spec = dict( 28 | cluster_type="local", 29 | cluster_name="local", 30 | fileroot=get_user_tmp(), 31 | ) 32 | else: 33 | raise FileNotFoundError(f"Cluster spec file not found: {file_path}") 34 | 35 | self.__cluster_type = spec["cluster_type"] 36 | self.__cluster_name = spec["cluster_name"] 37 | self.__fileroot = spec["fileroot"] 38 | self.__node_type_from_node_name_re = spec.get("node_type_from_node_name", None) 39 | self.__gpu_type_from_node_name_re = spec.get("gpu_type_from_node_name", None) 40 | self.__default_mount = spec.get("default_mount", None) 41 | self.__gpu_image = spec.get("gpu_image", None) 42 | self.__cpu_image = spec.get("cpu_image", None) 43 | self.__node_name_prefix = spec.get("node_name_prefix", "NODE") 44 | 45 | self.__loaded = True 46 | 47 | @property 48 | def name(self): 49 | assert self.__loaded 50 | return self.__cluster_name 51 | 52 | def node_type_from_node_name(self, node_name: str) -> str: 53 | """Mapping nodename to slurm node type, including "g1", "g2", "g8", 54 | "a100".""" 55 | if self.__cluster_type != "slurm": 56 | raise NotImplementedError( 57 | "Only slurm cluster uses node_type_from_node_name." 58 | ) 59 | assert self.__loaded 60 | for regex, node_type in self.__node_type_from_node_name_re.items(): 61 | if re.match(regex, node_name): 62 | return node_type 63 | raise NotImplementedError() 64 | 65 | def gpu_type_from_node_name(self, node_name: str) -> str: 66 | """Mapping nodename to slurm GPU type, including "geforce" and 67 | "tesla".""" 68 | if self.__cluster_type != "slurm": 69 | raise NotImplementedError( 70 | "Only slurm cluster uses gpu_type_from_node_name." 
71 | ) 72 | assert self.__loaded 73 | for regex, gpu_type in self.__gpu_type_from_node_name_re.items(): 74 | if re.match(regex, node_name): 75 | return gpu_type 76 | raise NotImplementedError() 77 | 78 | @property 79 | def fileroot(self) -> str: 80 | """Return the root directory of the file system in the cluster. 81 | 82 | When running experiments, files such as logs, checkpoints, 83 | caches will be saved under this directory. 84 | """ 85 | assert self.__loaded 86 | return self.__fileroot 87 | 88 | @property 89 | def default_mount(self) -> str: 90 | """Directories that should be mounted to container that runs 91 | workers.""" 92 | assert self.__loaded 93 | return self.__default_mount 94 | 95 | @property 96 | def gpu_image(self) -> str: 97 | """Return the default image for containers of GPU workers.""" 98 | assert self.__loaded 99 | return self.__gpu_image 100 | 101 | @property 102 | def cpu_image(self) -> str: 103 | """Return the default image for containers of CPU workers.""" 104 | assert self.__loaded 105 | return self.__cpu_image 106 | 107 | @property 108 | def node_name_prefix(self) -> str: 109 | """Return the prefix of node names in slurm format.""" 110 | assert self.__loaded 111 | return self.__node_name_prefix 112 | 113 | 114 | def node_name_is_node_type( 115 | node_name: str, node_type: Optional[Union[List[str], str]] = None 116 | ) -> bool: 117 | assert spec is not None 118 | if node_type is None: 119 | return True 120 | if not isinstance(node_type, list): 121 | node_type = [node_type] 122 | nt_condition = [] 123 | for nt in node_type: 124 | if nt not in ["g1", "g2", "g8", "a100"]: 125 | raise ValueError(f"Unknown node type {nt}.") 126 | else: 127 | cond = spec.node_type_from_node_name(node_name) == nt 128 | nt_condition.append(cond) 129 | return any(nt_condition) 130 | 131 | 132 | spec = ClusterSpec() 133 | spec.load_spec_from_file(CLUSTER_SPEC_PATH) 134 | -------------------------------------------------------------------------------- /realhf/base/importing.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import importlib.util 3 | import os 4 | import re 5 | import sys 6 | from pathlib import Path 7 | 8 | from .logging import getLogger 9 | 10 | logger = getLogger("importing") 11 | 12 | 13 | def import_module(path: str, pattern: re.Pattern): 14 | dirname = Path(path) 15 | for x in os.listdir(dirname.absolute()): 16 | if not pattern.match(x): 17 | continue 18 | module_path = os.path.splitext(os.path.join(dirname, x))[0] 19 | assert "realhf" in module_path 20 | start_idx = path.rindex("realhf") 21 | module_path = module_path[start_idx:] 22 | module_path = "realhf." 
+ module_path.replace(os.sep, ".").replace( 23 | "realhf.", "" 24 | ) 25 | # logger.info(f"Automatically importing module {module_path}.") 26 | importlib.import_module(module_path) 27 | 28 | 29 | def import_usercode(module_path: str, module_name: str): 30 | # Create a module spec 31 | spec = importlib.util.spec_from_file_location(module_name, module_path) 32 | # Create a module object 33 | module = importlib.util.module_from_spec(spec) 34 | # Add the module to sys.modules 35 | sys.modules[module_name] = module 36 | # Execute the module in its own namespace 37 | spec.loader.exec_module(module) 38 | -------------------------------------------------------------------------------- /realhf/base/names.py: -------------------------------------------------------------------------------- 1 | # This file standardizes the name-resolve names used by different components of the system. 2 | import getpass 3 | 4 | USER_NAMESPACE = getpass.getuser() 5 | 6 | 7 | def registry_root(user): 8 | return f"trial_registry/{user}" 9 | 10 | 11 | def trial_registry(experiment_name, trial_name): 12 | return f"trial_registry/{USER_NAMESPACE}/{experiment_name}/{trial_name}" 13 | 14 | 15 | def trial_root(experiment_name, trial_name): 16 | return f"{USER_NAMESPACE}/{experiment_name}/{trial_name}" 17 | 18 | 19 | def worker_status(experiment_name, trial_name, worker_name): 20 | return f"{USER_NAMESPACE}/{experiment_name}/{trial_name}/status/{worker_name}" 21 | 22 | 23 | def worker_root(experiment_name, trial_name): 24 | return f"{USER_NAMESPACE}/{experiment_name}/{trial_name}/worker/" 25 | 26 | 27 | def worker(experiment_name, trial_name, worker_name): 28 | return f"{USER_NAMESPACE}/{experiment_name}/{trial_name}/worker/{worker_name}" 29 | 30 | 31 | def worker_key(experiment_name, trial_name, key): 32 | return f"{USER_NAMESPACE}/{experiment_name}/{trial_name}/worker_key/{key}" 33 | 34 | 35 | def request_reply_stream(experiment_name, trial_name, stream_name): 36 | return f"{USER_NAMESPACE}/{experiment_name}/{trial_name}/request_reply_stream/{stream_name}" 37 | 38 | 39 | def request_reply_stream_root(experiment_name, trial_name): 40 | return f"{USER_NAMESPACE}/{experiment_name}/{trial_name}/request_reply_stream/" 41 | 42 | 43 | def distributed_root(experiment_name, trial_name): 44 | return f"{USER_NAMESPACE}/{experiment_name}/{trial_name}/distributed/" 45 | 46 | 47 | def distributed_peer(experiment_name, trial_name, model_name): 48 | return ( 49 | f"{USER_NAMESPACE}/{experiment_name}/{trial_name}/distributed/peer/{model_name}" 50 | ) 51 | 52 | 53 | def distributed_local_peer(experiment_name, trial_name, host_name, model_name): 54 | return f"{USER_NAMESPACE}/{experiment_name}/{trial_name}/distributed/local_peer/{host_name}/{model_name}" 55 | 56 | 57 | def distributed_master(experiment_name, trial_name, model_name): 58 | return f"{USER_NAMESPACE}/{experiment_name}/{trial_name}/distributed/master/{model_name}" 59 | -------------------------------------------------------------------------------- /realhf/base/network.py: -------------------------------------------------------------------------------- 1 | import socket 2 | from contextlib import closing 3 | 4 | 5 | def find_free_port(): 6 | """From, stackoverflow Issue 1365265.""" 7 | with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s: 8 | s.bind(("", 0)) 9 | s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) 10 | return s.getsockname()[1] 11 | 12 | 13 | def gethostname(): 14 | return socket.gethostname() 15 | 16 | 17 | def gethostip(): 18 | return 
socket.gethostbyname(socket.gethostname()) 19 | -------------------------------------------------------------------------------- /realhf/base/numpy_utils.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List, Tuple 2 | 3 | import numpy as np 4 | 5 | 6 | def shape_leq(shape1: Tuple, shape2: Tuple) -> bool: 7 | assert len(shape1) == len(shape2) 8 | return all(x1 <= x2 for x1, x2 in zip(shape1, shape2)) 9 | 10 | 11 | def shape_union(*shapes: List[Tuple]) -> Tuple: 12 | if len(shapes) == 1: 13 | return shapes[0] 14 | for s in shapes: 15 | assert len(s) == len(shapes[0]) 16 | return tuple(max(*dims) for dims in zip(*shapes)) 17 | 18 | 19 | def split_to_shapes(x: np.ndarray, shapes: Dict, axis: int = -1): 20 | """Split an array and reshape to desired shapes. 21 | 22 | Args: 23 | x (np.ndarray): The array to be splitted 24 | shapes (Dict): Dict of shapes (tuples) specifying how to split. 25 | axis (int): Split dimension. 26 | 27 | Returns: 28 | List: Splitted observations. 29 | """ 30 | axis = len(x.shape) + axis if axis < 0 else axis 31 | split_lengths = [np.prod(shape) for shape in shapes.values()] 32 | assert x.shape[axis] == sum(split_lengths) 33 | accum_split_lengths = [sum(split_lengths[:i]) for i in range(1, len(split_lengths))] 34 | splitted_x = np.split(x, accum_split_lengths, axis) 35 | return { 36 | k: x.reshape(*x.shape[:axis], *shape, *x.shape[axis + 1 :]) 37 | for x, (k, shape) in zip(splitted_x, shapes.items()) 38 | } 39 | -------------------------------------------------------------------------------- /realhf/base/ray_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | 4 | 5 | def check_ray_availability(): 6 | return ( 7 | int( 8 | subprocess.run( 9 | ["ray", "--help"], 10 | stdout=open(os.devnull, "wb"), 11 | stderr=open(os.devnull, "wb"), 12 | ).returncode 13 | ) 14 | == 0 15 | ) 16 | -------------------------------------------------------------------------------- /realhf/base/recover.py: -------------------------------------------------------------------------------- 1 | import dataclasses 2 | import os 3 | import pickle 4 | from typing import Optional, Set 5 | 6 | import realhf.base.constants as constants 7 | 8 | RECOVER_INFO_PATH = None 9 | 10 | 11 | @dataclasses.dataclass 12 | class StepInfo: 13 | epoch: int 14 | epoch_step: int 15 | global_step: int 16 | 17 | 18 | @dataclasses.dataclass 19 | class RecoverInfo: 20 | recover_start: StepInfo 21 | last_step_info: StepInfo 22 | hash_vals_to_ignore: Set[int] = dataclasses.field(default_factory=set) 23 | 24 | 25 | def dump_recover_info(recover_info: RecoverInfo): 26 | global RECOVER_INFO_PATH 27 | if RECOVER_INFO_PATH is None: 28 | RECOVER_INFO_PATH = os.path.join( 29 | constants.RECOVER_ROOT, 30 | constants.experiment_name(), 31 | constants.trial_name(), 32 | "recover_info.pkl", 33 | ) 34 | with open(RECOVER_INFO_PATH, "wb") as f: 35 | pickle.dump(recover_info, f) 36 | 37 | 38 | def load_recover_info() -> Optional[RecoverInfo]: 39 | global RECOVER_INFO_PATH 40 | if RECOVER_INFO_PATH is None: 41 | RECOVER_INFO_PATH = os.path.join( 42 | constants.RECOVER_ROOT, 43 | constants.experiment_name(), 44 | constants.trial_name(), 45 | "recover_info.pkl", 46 | ) 47 | try: 48 | with open(RECOVER_INFO_PATH, "rb") as f: 49 | return pickle.load(f) 50 | except FileNotFoundError: 51 | raise FileNotFoundError( 52 | f"Resume info not found at {RECOVER_INFO_PATH}. 
" 53 | f"This should not be a resumed experiment!" 54 | ) 55 | -------------------------------------------------------------------------------- /realhf/base/saveload_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | from typing import Dict 4 | 5 | import torch 6 | import tqdm 7 | from safetensors import safe_open 8 | 9 | from realhf.base import logging 10 | 11 | logger = logging.getLogger("SaveLoad") 12 | 13 | 14 | def split_state_dict_into_shards(state_dict: Dict, n_shards: int) -> Dict: 15 | if n_shards == 1: 16 | return [state_dict] 17 | 18 | keys = list(state_dict.keys()) 19 | if len(keys) < n_shards: 20 | raise ValueError(f"state_dict has {len(keys)} keys, but n_shards={n_shards}") 21 | 22 | shard_size = len(keys) // n_shards 23 | extra = len(keys) % n_shards 24 | shard_size_list = [shard_size for _ in range(n_shards)] 25 | shard_size_list[-1] = shard_size + extra 26 | start, shards = 0, [] 27 | for i, size in enumerate( 28 | tqdm.tqdm( 29 | shard_size_list, 30 | desc=f"Splitting state dict into {len(shard_size_list)} shards...", 31 | ) 32 | ): 33 | shard = {} 34 | for j in range(start, start + size): 35 | shard[keys[j]] = state_dict[keys[j]] 36 | start += size 37 | shards.append(shard) 38 | return shards 39 | 40 | 41 | HF_MODEL_CONFIG_FILES = [ 42 | "config.json", 43 | "generation_config.json", 44 | "tokenizer_config.json", 45 | "vocab.json", 46 | "merges.txt", 47 | "special_tokens_map.json", 48 | "tokenizer.json", 49 | ] 50 | 51 | 52 | def copy_hf_configs(src_model_dir, dst_model_dir): 53 | for file in HF_MODEL_CONFIG_FILES: 54 | try: 55 | shutil.copy( 56 | os.path.join(src_model_dir, file), 57 | os.path.join(dst_model_dir, file), 58 | ) 59 | logger.info(f"copied {file} from {src_model_dir} to {dst_model_dir}") 60 | except FileNotFoundError: 61 | logger.info(f"{file} not exist in {src_model_dir} skipping.") 62 | 63 | 64 | def load_safetensor(fn: str) -> Dict[str, torch.Tensor]: 65 | assert fn.endswith(".safetensors") 66 | state_dict = {} 67 | with safe_open(fn, framework="pt", device="cpu") as f: 68 | for key in f.keys(): 69 | state_dict[key] = f.get_tensor(key) 70 | return state_dict 71 | -------------------------------------------------------------------------------- /realhf/base/security.py: -------------------------------------------------------------------------------- 1 | def read_key(service, name="default"): 2 | with open(f"/data/marl/keys/{service}/{name}", "r") as f: 3 | return f.read().strip() 4 | -------------------------------------------------------------------------------- /realhf/base/seeding.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import numpy as np 4 | import torch 5 | import transformers 6 | 7 | 8 | def set_random_seed(seed): 9 | transformers.set_seed(seed) 10 | random.seed(seed) 11 | np.random.seed(seed) 12 | torch.manual_seed(seed) 13 | if torch.cuda.is_available(): 14 | torch.cuda.manual_seed_all(seed) 15 | -------------------------------------------------------------------------------- /realhf/base/slurm_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import subprocess 4 | from typing import List 5 | 6 | import numpy as np 7 | 8 | 9 | def parse_node_id(node_name: str, prefix: str) -> int: 10 | return int(node_name.split(prefix)[-1]) 11 | 12 | 13 | def parse_nodelist(nodelist: str, prefix: str) -> List[str]: 14 | if not 
nodelist.startswith(prefix): 15 | raise ValueError( 16 | f"Node list `{nodelist}` does not start with hostname prefix `{prefix}`." 17 | ) 18 | nodelist = nodelist.replace(prefix, "") 19 | if "[" not in nodelist: 20 | return [prefix + nodelist] 21 | else: 22 | nodelist = nodelist.strip("[]") 23 | node_ids = [] 24 | nodelist = nodelist.split(",") 25 | for node_repr in nodelist: 26 | if "-" not in node_repr: 27 | node_ids.append(int(node_repr)) 28 | else: 29 | start, end = map(int, node_repr.split("-")) 30 | node_ids += list(range(start, end + 1)) 31 | return [f"{prefix}{node_id:02d}" for node_id in node_ids] 32 | 33 | 34 | def nodelist_from_nodes(nodes: List[str], prefix: str) -> str: 35 | node_ids = sorted([parse_node_id(node, prefix) for node in nodes]) 36 | assert len(node_ids) > 0 37 | if len(node_ids) == 1: 38 | return f"{prefix}{node_ids[0]:02d}" 39 | else: 40 | node_reprs = [] 41 | start, end = node_ids[0], node_ids[0] 42 | for i in range(len(node_ids)): 43 | node_id = node_ids[i] 44 | next_node_id = node_ids[i + 1] if i + 1 < len(node_ids) else -1 45 | if node_id + 1 == next_node_id: 46 | end = next_node_id 47 | else: 48 | if start == end: 49 | node_reprs.append(f"{start:02d}") 50 | else: 51 | node_reprs.append(f"{start:02d}-{end:02d}") 52 | start = next_node_id 53 | end = next_node_id 54 | return f"{prefix}[{','.join(node_reprs)}]" 55 | 56 | 57 | def are_ones_contiguous(binary_array: np.ndarray): 58 | one_indices = np.where(binary_array == 1)[0] 59 | if len(one_indices) == 0: 60 | return False 61 | return np.all(np.diff(one_indices) == 1) 62 | 63 | 64 | def slurm_hostname_key(hostname): 65 | """Custom sorting key function to sort Slurm hostnames.""" 66 | # Extract node number from hostname 67 | match = re.match(r"(\D+)(\d+)", hostname) 68 | if match: 69 | prefix, number = match.groups() 70 | return (prefix, int(number)) 71 | else: 72 | return (hostname,) 73 | 74 | 75 | def check_slurm_availability(): 76 | 77 | slurm_available = ( 78 | int( 79 | subprocess.run( 80 | "squeue", 81 | shell=True, 82 | stdout=open(os.devnull, "wb"), 83 | stderr=open(os.devnull, "wb"), 84 | ).returncode 85 | ) 86 | == 0 87 | ) 88 | return slurm_available 89 | -------------------------------------------------------------------------------- /realhf/experiments/common/check.py: -------------------------------------------------------------------------------- 1 | import realhf.api.core.model_api as model_api 2 | 3 | 4 | def check_is_realhf_native_impl(_cls): 5 | return _cls.__module__.startswith("realhf") 6 | 7 | 8 | def check_is_realhf_native_model_interface(name): 9 | # NOTE: we should not use auto-importing here, 10 | # because the user may write customized interfaces under this folder. 
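# Importing these modules has the side effect of registering the built-in
# interface implementations, which is what lets the ALL_INTERFACE_CLASSES
# lookup below resolve them by name.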
11 | import realhf.impl.model.interface.dpo_interface 12 | import realhf.impl.model.interface.gen_interface 13 | import realhf.impl.model.interface.ppo_interface 14 | import realhf.impl.model.interface.rw_interface 15 | import realhf.impl.model.interface.sft_interface 16 | 17 | _cls = model_api.ALL_INTERFACE_CLASSES.get(name) 18 | return _cls and check_is_realhf_native_impl(_cls) 19 | -------------------------------------------------------------------------------- /realhf/experiments/common/rw_exp.py: -------------------------------------------------------------------------------- 1 | import dataclasses 2 | from typing import List, Optional 3 | 4 | from realhf.api.core.config import ( 5 | DataLoaderAbstraction, 6 | DatasetAbstraction, 7 | ModelInterfaceAbstraction, 8 | ModelInterfaceType, 9 | ModelName, 10 | ) 11 | from realhf.api.core.dfg import MFCDef 12 | from realhf.api.quickstart.dataset import PairedComparisonDatasetConfig 13 | from realhf.api.quickstart.device_mesh import MFCConfig 14 | from realhf.api.quickstart.entrypoint import register_quickstart_exp 15 | from realhf.api.quickstart.model import ModelTrainEvalConfig 16 | from realhf.experiments.common.common import CommonExperimentConfig 17 | 18 | 19 | @dataclasses.dataclass 20 | class RWConfig(CommonExperimentConfig): 21 | """Configuration for pairwise reward modeling experiments. 22 | 23 | This class is a subclass of :class:`CommonExperimentConfig`, 24 | so all CLI options from the base class are available. 25 | 26 | :param is_sft_lora: Whether LoRA was used for SFT. 27 | If LoRA was used, the saved SFT model should only contain LoRA parameters. 28 | Since LoRA is currently not supported for SFT, this option is not utilized at present. 29 | :type is_sft_lora: bool 30 | :param sft_lora_path: Path to the LoRA model for SFT. 31 | Since LoRA is currently not supported for SFT, this option is not utilized at present. 32 | :type sft_lora_path: str or None 33 | :param model: Configuration for model runtime. 34 | :type model: ModelTrainEvalConfig 35 | :param allocation: Configuration for device allocation and parallelism. 36 | :type allocation: MFCConfig 37 | :param dataset: Configuration for the dataset. 38 | :type dataset: PairedComparisonDatasetConfig 39 | """ 40 | 41 | is_sft_lora: bool = False 42 | sft_lora_path: Optional[str] = None 43 | model: ModelTrainEvalConfig = dataclasses.field( 44 | default_factory=ModelTrainEvalConfig 45 | ) 46 | allocation: MFCConfig = dataclasses.field(default_factory=MFCConfig) 47 | 48 | dataset: PairedComparisonDatasetConfig = dataclasses.field( 49 | default_factory=PairedComparisonDatasetConfig 50 | ) 51 | 52 | def __post_init__(self): 53 | assert ( 54 | not self.is_sft_lora and self.sft_lora_path is None 55 | ), "LoRA is not supported for now." 
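# The pairwise reward model is a critic-style model with a scalar output head,
# so its transformer backbone is initialized from the actor (SFT) checkpoint: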
56 | self.model.init_critic_from_actor = True 57 | 58 | @property 59 | def models(self): 60 | return { 61 | "default": self.model, 62 | } 63 | 64 | @property 65 | def rpcs(self): 66 | interface = ModelInterfaceAbstraction("paired_rw") 67 | rpc = MFCDef( 68 | name="rwTrain", 69 | n_mbs=self.allocation.n_mbs, 70 | model_name=ModelName("default", 0), 71 | interface_type=ModelInterfaceType.TRAIN_STEP, 72 | interface_impl=interface, 73 | model_type=self.model.type, 74 | model_path=self.model.path, 75 | input_keys=["packed_input_ids"], 76 | log_return_value=True, 77 | n_seqs=self.dataset.train_bs_n_seqs, 78 | ) 79 | return {"rwTrain": rpc} 80 | 81 | @property 82 | def allocations(self): 83 | return {"rwTrain": self.allocation} 84 | 85 | @property 86 | def datasets(self): 87 | return [ 88 | DatasetAbstraction( 89 | "rw_pair", 90 | args=dict( 91 | max_length=self.dataset.max_seqlen, 92 | max_pairs_per_prompt=self.dataset.max_pairs_per_prompt, 93 | dataset_path=self.dataset.train_path, 94 | ), 95 | ) 96 | ] 97 | 98 | @property 99 | def eval_datasets(self): 100 | return [ 101 | DatasetAbstraction( 102 | "rw_pair", 103 | args=dict( 104 | max_length=self.dataset.max_seqlen, 105 | max_pairs_per_prompt=self.dataset.max_pairs_per_prompt, 106 | dataset_path=self.dataset.valid_path, 107 | ), 108 | ) 109 | ] 110 | 111 | @property 112 | def eval_dataloader(self): 113 | return DataLoaderAbstraction( 114 | "packed_eval", args=dict(batch_size=self.dataset.valid_bs_n_seqs) 115 | ) 116 | 117 | @property 118 | def tokenizer_name_or_path(self): 119 | return self.model.path 120 | 121 | 122 | register_quickstart_exp("rw", RWConfig) 123 | -------------------------------------------------------------------------------- /realhf/experiments/common/sft_exp.py: -------------------------------------------------------------------------------- 1 | import dataclasses 2 | 3 | from realhf.api.core.config import ( 4 | DataLoaderAbstraction, 5 | DatasetAbstraction, 6 | ModelInterfaceAbstraction, 7 | ModelInterfaceType, 8 | ModelName, 9 | ) 10 | from realhf.api.core.dfg import MFCDef 11 | from realhf.api.quickstart.dataset import PromptAnswerDatasetConfig 12 | from realhf.api.quickstart.device_mesh import MFCConfig 13 | from realhf.api.quickstart.entrypoint import register_quickstart_exp 14 | from realhf.api.quickstart.model import ModelTrainEvalConfig 15 | from realhf.experiments.common.common import CommonExperimentConfig 16 | 17 | 18 | @dataclasses.dataclass 19 | class SFTConfig(CommonExperimentConfig): 20 | """Configuration for SFT experiments. 21 | 22 | This class is a subclass of :class:`CommonExperimentConfig`, 23 | so all CLI options from the base class are available. 24 | 25 | :param model: Configuration for model runtime. 26 | :type model: ModelTrainEvalConfig 27 | :param allocation: Configuration for device allocation and parallelism. 28 | :type allocation: MFCConfig 29 | :param dataset: Configuration for the dataset. 
30 | :type dataset: PromptAnswerDatasetConfig 31 | """ 32 | 33 | model: ModelTrainEvalConfig = dataclasses.field( 34 | default_factory=ModelTrainEvalConfig 35 | ) 36 | allocation: MFCConfig = dataclasses.field(default_factory=MFCConfig) 37 | dataset: PromptAnswerDatasetConfig = dataclasses.field( 38 | default_factory=PromptAnswerDatasetConfig 39 | ) 40 | 41 | @property 42 | def models(self): 43 | return { 44 | "default": self.model, 45 | } 46 | 47 | @property 48 | def rpcs(self): 49 | rpc = MFCDef( 50 | n_seqs=self.dataset.train_bs_n_seqs, 51 | name="trainDefault", 52 | n_mbs=self.allocation.n_mbs, 53 | interface_type=ModelInterfaceType.TRAIN_STEP, 54 | interface_impl=ModelInterfaceAbstraction("sft"), 55 | model_name="default", 56 | input_keys=["packed_input_ids", "prompt_mask"], 57 | log_return_value=True, 58 | model_type=self.model.type, 59 | model_path=self.model.path, 60 | ) 61 | return {"trainDefault": rpc} 62 | 63 | @property 64 | def allocations(self): 65 | return {"trainDefault": self.allocation} 66 | 67 | @property 68 | def datasets(self): 69 | return [ 70 | DatasetAbstraction( 71 | "prompt_answer", 72 | args=dict( 73 | max_length=self.dataset.max_seqlen, 74 | dataset_path=self.dataset.train_path, 75 | pad_to_max_length=self.dataset.pad_to_max_length, 76 | ), 77 | ) 78 | ] 79 | 80 | @property 81 | def eval_datasets(self): 82 | return [ 83 | DatasetAbstraction( 84 | "prompt_answer", 85 | args=dict( 86 | max_length=self.dataset.max_seqlen, 87 | dataset_path=self.dataset.valid_path, 88 | ), 89 | ) 90 | ] 91 | 92 | @property 93 | def eval_dataloader(self): 94 | return DataLoaderAbstraction( 95 | "packed_eval", args=dict(batch_size=self.dataset.valid_bs_n_seqs) 96 | ) 97 | 98 | @property 99 | def tokenizer_name_or_path(self): 100 | return self.model.path 101 | 102 | 103 | register_quickstart_exp("sft", SFTConfig) 104 | -------------------------------------------------------------------------------- /realhf/impl/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | 4 | from realhf.base.importing import import_module 5 | 6 | # Import all dataset implementations. 7 | _p = re.compile(r"^(?!.*__init__).*\.py$") 8 | _filepath = os.path.dirname(__file__) 9 | import_module(_filepath, _p) 10 | -------------------------------------------------------------------------------- /realhf/impl/dataset/prompt_answer_dataset.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import Callable, Dict, List, Optional 3 | 4 | import numpy as np 5 | import torch 6 | import torch.utils.data 7 | 8 | from realhf.api.core import data_api 9 | from realhf.base import logging 10 | 11 | logger = logging.getLogger("Prompt Answer Dataset") 12 | 13 | 14 | class PromptAnswerDataset(torch.utils.data.Dataset): 15 | 16 | def __init__( 17 | self, 18 | util: data_api.DatasetUtility, 19 | max_length: int, 20 | dataset_path: Optional[str] = None, 21 | dataset_builder: Optional[Callable[[], List[Dict]]] = None, 22 | pad_to_max_length: bool = False, 23 | ): 24 | """A dataset with prompts and corresponding answers. Usually used for 25 | SFT. 26 | 27 | Args: 28 | util (api.data.DatasetUtility): . 29 | max_length (Optional[int], optional): The maximum length of each sequence in the batch. 30 | dataset_path (Optional[str], optional): Path to the dataset json/jsonl file. 31 | The json/jsonl file should be a list of dictionary. 
Each element in the list should have 32 | a key "prompt" and a key "answer". Defaults to None. 33 | dataset_builder (Optional[Callable[[], List[Dict]]], optional): Alternative to dataset_path. 34 | A callable that returns a list of dictionary. Defaults to None. 35 | pad_to_max_length (bool): Whether to pad sequences to the maximum length. 36 | Used only for benchmarking. If True, all mini-batches created by the DP balanced partition 37 | algorithm will have the same number of tokens, making MFC time predictable. Defaults to False. 38 | """ 39 | self._util = util 40 | tokenizer = self.util.tokenizer 41 | 42 | data = data_api.load_shuffle_split_dataset(util, dataset_path, dataset_builder) 43 | 44 | seqs = [x["prompt"] + x["answer"] + tokenizer.eos_token for x in data] 45 | self.ids = [x["id"] for x in data] 46 | prompts = [x["prompt"] for x in data] 47 | 48 | self.tokens = tokenizer( 49 | seqs, 50 | truncation=True, 51 | max_length=max_length, 52 | return_length=True, 53 | return_attention_mask=False, 54 | padding="max_length" if pad_to_max_length else False, 55 | ) 56 | prompt_tokens = tokenizer( 57 | prompts, 58 | padding=False, 59 | truncation=True, 60 | return_length=True, 61 | max_length=max_length, 62 | return_attention_mask=False, 63 | ) 64 | 65 | prompt_lengths = prompt_tokens["length"] 66 | seq_lengths = self.tokens["length"] 67 | prompt_masks = [] 68 | for i in range(len(self)): 69 | prompt_len = prompt_lengths[i] 70 | seqlen = self.tokens["length"][i] 71 | # seq = self.tokens["input_ids"][i] 72 | # prompt = prompt_tokens["input_ids"][i] 73 | # assert seq[:prompt_len] == prompt, (seq, prompt, prompt_len, seqlen) 74 | assert seqlen >= prompt_len, (seqlen, prompt_len) 75 | prompt_mask = [1] * prompt_len + [0] * (seqlen - prompt_len) 76 | prompt_masks.append(prompt_mask) 77 | 78 | self.prompt_masks = prompt_masks 79 | 80 | logger.info( 81 | f"Loaded Prompt Answer Dataset with INFO: " 82 | f"#seqs={len(self)}, " 83 | f"truncation length={max_length}, " 84 | f"avg prompt length={np.mean(prompt_lengths):.1f}, " 85 | f"avg answer length={np.mean(seq_lengths) - np.mean(prompt_lengths):.1f}", 86 | ) 87 | 88 | @property 89 | def util(self): 90 | return self._util 91 | 92 | def __len__(self): 93 | return len(self.tokens["input_ids"]) 94 | 95 | def __getitem__(self, idx): 96 | d = { 97 | "packed_input_ids": torch.tensor( 98 | self.tokens["input_ids"][idx], dtype=torch.long 99 | ), 100 | "prompt_mask": torch.tensor(self.prompt_masks[idx], dtype=torch.bool), 101 | } 102 | assert len(d["packed_input_ids"]) == len(d["prompt_mask"]) 103 | seqlen = [len(d["packed_input_ids"])] 104 | x = data_api.SequenceSample.from_default( 105 | ids=[self.ids[idx]], 106 | seqlens=seqlen, 107 | data=d, 108 | ) 109 | return x 110 | 111 | 112 | data_api.register_dataset("prompt_answer", PromptAnswerDataset) 113 | -------------------------------------------------------------------------------- /realhf/impl/dataset/prompt_dataset.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | from typing import Callable, Dict, List, Optional 3 | 4 | import torch.utils.data 5 | 6 | from realhf.api.core import data_api 7 | from realhf.base import logging 8 | 9 | logger = logging.getLogger("Prompt Dataset") 10 | 11 | 12 | class PromptDataset(torch.utils.data.Dataset): 13 | 14 | def __init__( 15 | self, 16 | util: data_api.DatasetUtility, 17 | max_length: Optional[int] = None, 18 | dataset_path: Optional[str] = None, 19 | dataset_builder: Optional[Callable[[], List[Dict]]] = None, 20 
| pad_to_max_length: bool = False, 21 | ): 22 | """A dataset with prompts. Usually used for PPO. 23 | 24 | Args: 25 | util (api.data.DatasetUtility): . 26 | max_length (Optional[int], optional): The maximum length of each sequence in the batch. 27 | dataset_path (Optional[str], optional): Path to the dataset json/jsonl file. 28 | The json/jsonl file should be a list of dictionary. Each element in the list should have 29 | a key "prompt". Defaults to None. 30 | dataset_builder (Optional[Callable[[], List[Dict]]], optional): Alternative to dataset_path. 31 | A callable that returns a list of dictionary. Defaults to None. 32 | pad_to_max_length (bool): Whether to pad prompts to the maximum length. 33 | Used only for benchmarking. If True, all mini-batches created by the DP balanced partition 34 | algorithm will have the same number of tokens, making MFC time predictable. Defaults to False. 35 | """ 36 | self._util = util 37 | self.max_length = max_length 38 | 39 | data = data_api.load_shuffle_split_dataset(util, dataset_path, dataset_builder) 40 | 41 | prompts_str = [x["prompt"] for x in data] 42 | self.ids = [x["id"] for x in data] 43 | util.tokenizer.padding_side = "left" 44 | prompt_encodings = util.tokenizer( 45 | prompts_str, 46 | truncation=True, 47 | max_length=max_length, 48 | padding="max_length" if pad_to_max_length else False, 49 | return_length=True, 50 | return_attention_mask=False, 51 | ) 52 | 53 | self.prompt_lengths = prompt_encodings["length"] 54 | self.prompts = prompt_encodings["input_ids"] 55 | assert all(len(x) == l for x, l in zip(self.prompts, self.prompt_lengths)) 56 | 57 | logger.info(f"Number of prompts in the dataset: {len(self.prompts)}") 58 | 59 | @property 60 | def util(self): 61 | return self._util 62 | 63 | def __len__(self): 64 | return len(self.prompts) 65 | 66 | def __getitem__(self, idx): 67 | return data_api.SequenceSample.from_default( 68 | ids=[self.ids[idx]], 69 | seqlens=[self.prompt_lengths[idx]], 70 | data=dict(packed_prompts=torch.tensor(self.prompts[idx], dtype=torch.long)), 71 | metadata=dict(random_id=[uuid.uuid4()]), 72 | ) 73 | 74 | 75 | data_api.register_dataset("prompt", PromptDataset) 76 | -------------------------------------------------------------------------------- /realhf/impl/model/__init__.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import os 3 | import re 4 | 5 | import torch 6 | 7 | # Import all HuggingFace model implementations. 8 | import realhf.api.from_hf 9 | import realhf.base.logging as logging 10 | from realhf.api.core.model_api import HF_MODEL_FAMILY_REGISTRY 11 | from realhf.base.importing import import_module 12 | from realhf.impl.model.conversion.hf_registry import HFModelRegistry 13 | from realhf.impl.model.nn.real_llm_api import ReaLModel 14 | 15 | logger = logging.getLogger("model init") 16 | 17 | # Import all model implementations. 18 | _p = re.compile(r"^(?!.*__init__).*\.py$") 19 | _filepath = os.path.dirname(__file__) 20 | import_module(os.path.join(_filepath, "backend"), _p) 21 | import_module(os.path.join(_filepath, "interface"), _p) 22 | import_module(os.path.join(_filepath, "nn"), _p) 23 | 24 | # Set PyTorch JIT options, following Megatron-LM. 
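# (Keeping the profiling executor enabled while disabling the legacy CPU/GPU and
# TensorExpr fusers mirrors Megatron-LM's defaults; the nvFuser toggle below is
# left commented out because it triggers a deprecation warning on recent PyTorch.)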
25 | if torch.cuda.is_available(): 26 | torch._C._jit_set_profiling_executor(True) 27 | torch._C._jit_set_profiling_mode(True) 28 | torch._C._jit_override_can_fuse_on_cpu(False) 29 | torch._C._jit_override_can_fuse_on_gpu(False) 30 | torch._C._jit_set_texpr_fuser_enabled(False) 31 | # torch._C._jit_set_nvfuser_enabled(True) # disable the deprecated warning 32 | torch._C._debug_set_autodiff_subgraph_inlining(False) 33 | 34 | # Add HuggingFace hooks to ReaLModel. 35 | _HF_REGISTRIES = {} 36 | 37 | 38 | def _load_from_hf( 39 | model: ReaLModel, registry_name, load_dir: str, init_critic_from_actor: bool 40 | ): 41 | r = _HF_REGISTRIES[registry_name] 42 | setattr( 43 | model, 44 | "save_to_hf", 45 | functools.partial(_save_to_hf, model, registry_name), 46 | ) 47 | return r.load(model, load_dir, init_critic_from_actor) 48 | 49 | 50 | def _save_to_hf(model: ReaLModel, registry_name, tokenizer, save_dir: str): 51 | r = _HF_REGISTRIES[registry_name] 52 | r.save(model, tokenizer, save_dir) 53 | 54 | 55 | def _config_from_hf(registry_name, hf_config=None, model_path=None, is_critic=False): 56 | r = _HF_REGISTRIES[registry_name] 57 | return r.config_from_hf(hf_config, model_path, is_critic) 58 | 59 | 60 | def _config_to_hf(registry_name, config): 61 | r = _HF_REGISTRIES[registry_name] 62 | return r.config_to_hf(config) 63 | 64 | 65 | def _make_real_config(registry_name): 66 | r = _HF_REGISTRIES[registry_name] 67 | if r.real_config_maker is not None: 68 | return r.real_config_maker() 69 | raise NotImplementedError( 70 | f"`real_config_maker` not implemented for {registry_name}. " 71 | f"Please implement and register `real_config_maker` " 72 | f"in realhf.api.from_hf.{registry_name} to make customized ReaLModelConfig." 73 | ) 74 | 75 | 76 | for name, helpers in HF_MODEL_FAMILY_REGISTRY.items(): 77 | _HF_REGISTRIES[name] = r = HFModelRegistry(**helpers) 78 | 79 | _load_from_hf_ = functools.partialmethod(_load_from_hf, name) 80 | setattr(ReaLModel, f"from_{name}", _load_from_hf_) 81 | 82 | _save_to_hf_ = functools.partialmethod(_save_to_hf, name) 83 | setattr(ReaLModel, f"to_{name}", _save_to_hf_) 84 | 85 | _config_from_hf_ = functools.partial(_config_from_hf, name) 86 | setattr(ReaLModel, f"config_from_{name}", staticmethod(_config_from_hf_)) 87 | 88 | _config_to_hf_ = functools.partial(_config_to_hf, name) 89 | setattr(ReaLModel, f"config_to_{name}", staticmethod(_config_to_hf_)) 90 | 91 | # make a ReaLModelConfig from only parameters related to model size, used for testing 92 | _make_real_config_ = functools.partial(_make_real_config, name) 93 | setattr(ReaLModel, f"make_{name}_config", staticmethod(_make_real_config_)) 94 | -------------------------------------------------------------------------------- /realhf/impl/model/modules/__init__.py: -------------------------------------------------------------------------------- 1 | from .activations import * 2 | from .attn import * 3 | from .embedding import * 4 | from .mlp import * 5 | from .moe import * 6 | from .rms import * 7 | from .rotary import * 8 | -------------------------------------------------------------------------------- /realhf/impl/model/modules/embedding.py: -------------------------------------------------------------------------------- 1 | from typing import * 2 | 3 | import torch 4 | import torch.nn as nn 5 | from torch.nn import init 6 | 7 | from realhf.impl.model.parallelism.model_parallel.modules import ParallelEmbedding 8 | 9 | 10 | class OffsetPositionalEmbedding(nn.Embedding): 11 | 12 | def __init__( 13 | self, 14 | 
num_embeddings: int, 15 | embedding_dim: int, 16 | offset: int, 17 | dtype: Optional[torch.dtype] = None, 18 | device: Optional[Union[str, torch.device]] = None, 19 | ): 20 | # OPT is set up so that if padding_idx is specified then offset the embedding ids by 2 21 | # and adjust num_embeddings appropriately. Other models don't have this hack 22 | self.__offset = offset 23 | super().__init__( 24 | num_embeddings + self.__offset, 25 | embedding_dim, 26 | dtype=dtype, 27 | device=device, 28 | ) 29 | 30 | def forward(self, position_ids: torch.LongTensor): 31 | return super().forward(position_ids + self.__offset) 32 | 33 | 34 | class OffsetParallelPositionalEmbedding(ParallelEmbedding): 35 | def __init__( 36 | self, 37 | num_embeddings: int, 38 | embedding_dim: int, 39 | offset: int, 40 | init_method=init.xavier_normal_, 41 | # params_dtype: torch.dtype=torch.float32, 42 | perform_initialization: bool = True, 43 | dtype: Optional[torch.dtype] = None, 44 | device: Optional[Union[str, torch.device]] = None, 45 | ): 46 | self.__offset = offset 47 | super(OffsetParallelPositionalEmbedding, self).__init__( 48 | num_embeddings=num_embeddings + offset, 49 | embedding_dim=embedding_dim, 50 | init_method=init_method, 51 | perform_initialization=perform_initialization, 52 | dtype=dtype, 53 | device=device, 54 | ) 55 | 56 | def forward(self, input_: torch.LongTensor) -> torch.Tensor: 57 | return super().forward(input_ + self.__offset) 58 | -------------------------------------------------------------------------------- /realhf/impl/model/modules/moe/__init__.py: -------------------------------------------------------------------------------- 1 | from .experts import * 2 | from .layer import * 3 | from .router import * 4 | from .token_dispatcher import * 5 | -------------------------------------------------------------------------------- /realhf/impl/model/modules/moe/layer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
2 | # adopted from megatron 3 | from typing import Optional, Union 4 | 5 | import torch 6 | import torch.nn as nn 7 | 8 | import realhf.base.constants as constants 9 | import realhf.base.logging as logging 10 | from realhf.api.core.model_api import ReaLModelConfig 11 | from realhf.impl.model.modules.mlp import GemmaRMSNorm, LlamaRMSNorm 12 | from realhf.impl.model.modules.moe.experts import GroupedMLP, SequentialMLP 13 | from realhf.impl.model.modules.moe.router import TopKRouter 14 | from realhf.impl.model.modules.moe.token_dispatcher import MoETokenDispatcher 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | 19 | class LayerNormMoELayer(torch.nn.Module): 20 | 21 | def __init__( 22 | self, 23 | config: ReaLModelConfig, 24 | layer_idx: int, 25 | dtype: Optional[torch.dtype] = None, 26 | device: Optional[Union[str, torch.device]] = None, 27 | ): 28 | super(LayerNormMoELayer, self).__init__() 29 | 30 | self.config = config 31 | self.dtype = dtype 32 | self.device = device 33 | self.num_experts = self.config.moe.num_experts 34 | 35 | if config.layer_norm_type is None: 36 | layer_norm_fn = nn.LayerNorm 37 | elif config.layer_norm_type == "rms": 38 | layer_norm_fn = LlamaRMSNorm 39 | elif config.layer_norm_type == "gemma": 40 | layer_norm_fn = GemmaRMSNorm 41 | self.ln = layer_norm_fn( 42 | config.hidden_dim, eps=config.layer_norm_epsilon, dtype=dtype, device=device 43 | ) 44 | 45 | self.router = TopKRouter(config=self.config, layer_idx=layer_idx) 46 | self.token_dispatcher = MoETokenDispatcher(config=self.config) 47 | if config.moe.use_grouped_gemm and dtype == torch.bfloat16: 48 | self.experts = GroupedMLP(self.config, dtype=dtype, device=device) 49 | else: 50 | if config.moe.use_grouped_gemm: 51 | logger.warning( 52 | "GroupedGemm only supports bfloat16. Fallback to SequentialMLP." 
53 | ) 54 | self.experts = SequentialMLP(self.config, dtype=dtype, device=device) 55 | 56 | def forward(self, hidden_states: torch.Tensor): 57 | hidden_states = self.ln(hidden_states) 58 | probs, indices = self.router(hidden_states) 59 | (dispatched_input, tokens_per_expert) = self.token_dispatcher.token_permutation( 60 | hidden_states, probs, indices 61 | ) 62 | expert_output = self.experts(dispatched_input, tokens_per_expert) 63 | output = self.token_dispatcher.token_unpermutation( 64 | expert_output, 65 | ) 66 | return output 67 | -------------------------------------------------------------------------------- /realhf/impl/model/parallelism/pipeline_parallel/p2p.py: -------------------------------------------------------------------------------- 1 | # Copied from https://github.com/microsoft/DeepSpeed 2 | import torch 3 | import torch.distributed as dist 4 | from packaging.version import Version 5 | 6 | import realhf.base.constants as constants 7 | 8 | ID_TO_DTYPE = [ 9 | torch.float32, 10 | torch.float64, 11 | torch.complex64, 12 | torch.complex128, 13 | torch.float16, 14 | torch.bfloat16, 15 | torch.uint8, 16 | torch.int8, 17 | torch.int16, 18 | torch.int32, 19 | torch.int64, 20 | torch.bool, 21 | ] 22 | DTYPE_TO_ID = {dtype: id_ for id_, dtype in enumerate(ID_TO_DTYPE)} 23 | 24 | 25 | def _tensor_bytes(tensor): 26 | return tensor.numel() * tensor.element_size() 27 | 28 | 29 | def can_send_recv() -> bool: 30 | # torch_version = Version(torch_info["version"]) 31 | torch_version = Version(torch.__version__) 32 | sendrecv_min = Version("1.8") 33 | return torch_version >= sendrecv_min 34 | 35 | 36 | assert can_send_recv() 37 | 38 | 39 | def _is_valid_send_recv(src_stage, dest_stage): 40 | first_stage = 0 41 | last_stage = constants.grid().pipe_parallel_size - 1 42 | assert ( 43 | abs(src_stage - dest_stage) == 1 44 | or (src_stage == first_stage and dest_stage == last_stage) 45 | or (src_stage == last_stage and dest_stage == first_stage) 46 | ), f"Functionality currently limited to send and receive between adjacent ranks only (src={src_stage}, dst={dest_stage})" 47 | 48 | 49 | def send(tensor, dest_stage, async_op=False): 50 | # NOTE: The input is the stage id rather than the global rank 51 | src_stage = constants.grid().get_stage_id() 52 | _is_valid_send_recv(src_stage, dest_stage) 53 | 54 | dest_rank = constants.grid().stage_to_global(stage_id=dest_stage) 55 | send_method = dist.isend if async_op else dist.send 56 | return send_method(tensor, constants.to_global_pg_rank(dest_rank)) 57 | 58 | 59 | def recv(tensor, src_stage, async_op=False): 60 | # NOTE: The input is the stage id rather than the global rank 61 | dest_stage = constants.grid().get_stage_id() 62 | _is_valid_send_recv(src_stage, dest_stage) 63 | 64 | src_rank = constants.grid().stage_to_global(stage_id=src_stage) 65 | recv_method = dist.irecv if async_op else dist.recv 66 | return recv_method(tensor, constants.to_global_pg_rank(src_rank)) 67 | -------------------------------------------------------------------------------- /realhf/impl/model/parallelism/pipeline_parallel/tensor_storage.py: -------------------------------------------------------------------------------- 1 | # Copied from https://github.com/microsoft/DeepSpeed 2 | from collections import defaultdict 3 | from typing import Any, List, Optional, Tuple 4 | 5 | import torch 6 | 7 | import realhf.base.logging as logging 8 | import realhf.impl.model.parallelism.pipeline_parallel.p2p as p2p 9 | 10 | logger = logging.getLogger("tensor_utils") 11 | 12 | 13 | def 
get_shape(tensor): 14 | return tensor.shape if torch.is_tensor(tensor) else None 15 | 16 | 17 | def print_data_shapes(name, rank, mbid, x, ys): 18 | if rank == 0: 19 | logger.debug(f"{name}: rank {rank} mbid {mbid}") 20 | logger.debug( 21 | f"shapes: x.pp_input {get_shape(x.pp_input)}, x.pp_output {get_shape(x.pp_output)}," 22 | f" x.cu_seqlens {get_shape(x.cu_seqlens)}" 23 | ) 24 | for i, y in enumerate(ys): 25 | logger.debug( 26 | f"shapes: ys[{i}].input_ids {get_shape(y.packed_input_ids)}, " 27 | f"ys[{i}].k_cache {get_shape(y.k_cache)}, ys[{i}].v_cache {get_shape(y.v_cache)}, " 28 | f"ys[{i}].cache_seqlens {get_shape(y.cache_seqlens)}" 29 | ) 30 | 31 | 32 | class TensorBuffer: 33 | # could store both tensors and other data 34 | 35 | def __init__(self): 36 | self.tensors = defaultdict(dict) 37 | 38 | def put(self, name: str, mbid: int, x: torch.Tensor): 39 | self.tensors[name][mbid] = x 40 | 41 | def alloc( 42 | self, 43 | name: str, 44 | mbid: int, 45 | shape: Tuple[int], 46 | dtype: torch.dtype, 47 | device: torch.device, 48 | require_grads: bool = False, 49 | ): 50 | self.tensors[name][mbid] = torch.zeros( 51 | shape, dtype=dtype, device=device, requires_grad=require_grads 52 | ) 53 | return self.tensors[name][mbid] 54 | 55 | def get( 56 | self, 57 | name: str, 58 | mbid: int, 59 | remove: bool = False, 60 | raise_error: bool = True, 61 | ): 62 | try: 63 | if remove: 64 | return self.tensors[name].pop(mbid) 65 | else: 66 | return self.tensors[name][mbid] 67 | except KeyError as e: 68 | if raise_error: 69 | raise e 70 | else: 71 | return None 72 | 73 | def remove(self, name: str, mbid: Optional[int] = None, check_exists: bool = False): 74 | try: 75 | if mbid is None: 76 | del self.tensors[name] 77 | else: 78 | self.tensors[name].pop(mbid) 79 | except KeyError: 80 | if not check_exists: 81 | return 82 | raise KeyError(f"TensorBuffer.remove: key {name} mbid {mbid} not found") 83 | 84 | def check_name(self, name: str): 85 | return name in self.tensors 86 | 87 | def check_mbid(self, name: str, mbid: int): 88 | if name not in self.tensors: 89 | return False 90 | return mbid in self.tensors[name] 91 | 92 | def clear(self): 93 | self.tensors = defaultdict(dict) 94 | -------------------------------------------------------------------------------- /realhf/impl/model/utils/dpo_functional.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | 6 | 7 | def dpo_loss( 8 | pi_logps: torch.Tensor, 9 | ref_logps: torch.Tensor, 10 | beta: float, 11 | ): 12 | assert len(pi_logps.shape) == 1 and pi_logps.shape[0] % 2 == 0, ( 13 | pi_logps.shape, 14 | ref_logps.shape, 15 | ) 16 | assert len(ref_logps.shape) == 1 and ref_logps.shape[0] % 2 == 0, ( 17 | pi_logps.shape, 18 | ref_logps.shape, 19 | ) 20 | pi_logps = pi_logps.view(-1, 2) 21 | ref_logps = ref_logps.view(-1, 2) 22 | pi_yw_logps, pi_yl_logps = pi_logps[:, 0], pi_logps[:, 1] 23 | ref_yw_logps, ref_yl_logps = ref_logps[:, 0], ref_logps[:, 1] 24 | pi_logratios = pi_yw_logps - pi_yl_logps 25 | ref_logratios = ref_yw_logps - ref_yl_logps 26 | losses = -F.logsigmoid(beta * (pi_logratios - ref_logratios)).mean() 27 | pos_score = beta * (pi_yw_logps - ref_yw_logps).detach().sum() 28 | neg_score = beta * (pi_yl_logps - ref_yl_logps).detach().sum() 29 | kl = -(pi_logps - ref_logps).detach().sum() 30 | return losses, pos_score, neg_score, kl 31 | -------------------------------------------------------------------------------- 
/realhf/search_engine/__init__.py: -------------------------------------------------------------------------------- 1 | def import_profiler_registers(): 2 | import realhf.search_engine.enumerate 3 | import realhf.search_engine.estimate 4 | import realhf.search_engine.layers 5 | import realhf.search_engine.param_realloc 6 | import realhf.search_engine.search 7 | import realhf.search_engine.utils 8 | -------------------------------------------------------------------------------- /realhf/search_engine/utils.py: -------------------------------------------------------------------------------- 1 | from realhf.api.core.model_api import ReaLModelConfig 2 | 3 | 4 | def find_factors(n): 5 | factors = [] 6 | for i in range(1, n + 1): 7 | if n % i == 0: 8 | factors.append(i) 9 | return factors 10 | 11 | 12 | def make_stats_key(rpc_name, bs, seq_len): 13 | return f"{rpc_name}|{bs}|{seq_len}" 14 | 15 | 16 | def parse_stats_key(key): 17 | rpc_name, bs, seq_len = key.split("|") 18 | return rpc_name, int(bs), int(seq_len) 19 | 20 | 21 | def load_model_config(model_class: str, model_path: str) -> ReaLModelConfig: 22 | from realhf.impl.model.nn.real_llm_api import ReaLModel 23 | 24 | return getattr(ReaLModel, f"config_from_{model_class}")(model_path=model_path) 25 | -------------------------------------------------------------------------------- /realhf/system/__init__.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import os 3 | import traceback 4 | from typing import Type 5 | 6 | import realhf.api.core.system_api 7 | import realhf.base.logging as logging 8 | 9 | logger = logging.getLogger("system") 10 | 11 | # NOTE: Workers are configured in the following order. 12 | # Take special care when adding a new worker type. 13 | WORKER_TYPES = ["model_worker", "master_worker"] 14 | 15 | 16 | def load_worker(worker_type: str) -> Type: 17 | assert worker_type in WORKER_TYPES, f"Invalid worker type {worker_type}" 18 | module = importlib.import_module(worker_type_to_module(worker_type)) 19 | class_name = worker_type_to_class_name(worker_type) 20 | return getattr(module, class_name) 21 | 22 | 23 | def worker_type_to_module(worker_type: str): 24 | return "realhf.system." + worker_type 25 | 26 | 27 | def worker_type_to_class_name(worker_type: str): 28 | return "".join([w.capitalize() for w in worker_type.split("_")]) 29 | 30 | 31 | def run_worker( 32 | worker_type, experiment_name, trial_name, worker_name, worker_server_type 33 | ): 34 | """Run one worker 35 | Args: 36 | worker_type: string, one of the worker types listed above, 37 | experiment_name: string, the experiment this worker belongs to, 38 | trial_name: string, the specific trial this worker belongs to, 39 | worker_name: name given to the worker, typically "/" 40 | worker_server_type: string, either 'zmq' or 'ray'. 
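Example (illustrative only; the names are placeholders and assume a worker control server of the given type is reachable for this trial): run_worker("model_worker", "my_exp", "trial0", "model_worker/0", "zmq")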
41 | """ 42 | worker_class = load_worker(worker_type) 43 | make_server_fn = getattr( 44 | importlib.import_module("realhf.system.worker_control"), "make_server" 45 | ) 46 | server = make_server_fn( 47 | type_=worker_server_type, 48 | experiment_name=experiment_name, 49 | trial_name=trial_name, 50 | worker_name=worker_name, 51 | ) 52 | worker = worker_class(server=server) 53 | try: 54 | worker.run() 55 | except Exception as e: 56 | logger.error("Worker %s failed with exception: %s", worker_name, e) 57 | logger.error(traceback.format_exc()) 58 | raise e 59 | 60 | 61 | def make_controller(type_, experiment_name, trial_name): 62 | module = importlib.import_module("realhf.system.controller") 63 | if type_ == "zmq": 64 | control_module = importlib.import_module("realhf.system.worker_control") 65 | panel = getattr(control_module, "make_control")( 66 | "zmq", experiment_name, trial_name 67 | ) 68 | return getattr(module, "Controller")(experiment_name, trial_name, panel) 69 | elif type_ == "ray": 70 | return getattr(module, "RayController")(experiment_name, trial_name) 71 | else: 72 | raise NotImplementedError(f"Unknown controller type {type_}.") 73 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx-nefertiti 2 | sphinx 3 | build>=1.2.1 4 | wheel>=0.43.0 5 | distro-info>=1.0 6 | python-debian>=0.1.49 7 | huggingface_hub 8 | datasets 9 | accelerate 10 | ninja 11 | matplotlib 12 | ipython 13 | megatron_core==0.6.0 14 | deepspeed==0.14.0 15 | h5py 16 | nltk 17 | sentencepiece 18 | wandb 19 | tensorboardx 20 | blosc 21 | colorama 22 | colorlog 23 | einops 24 | hydra-core 25 | matplotlib 26 | numba 27 | omegaconf 28 | packaging 29 | pandas 30 | pybind11>=2.10.0 31 | numpy<2.0.0 32 | psutil 33 | pynvml 34 | pytest 35 | PyYAML 36 | pyzmq 37 | ray 38 | redis 39 | scipy 40 | seaborn 41 | setuptools>=61.0 42 | tqdm 43 | transformers==4.42.3 44 | networkx==3.3 45 | matplotlib 46 | tabulate 47 | aiofiles -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # NOTE: This file is required for importing. 
2 | -------------------------------------------------------------------------------- /tests/cpp_extensions/test_interval_ops.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import time 4 | import uuid 5 | from typing import * 6 | 7 | import numpy as np 8 | import pytest 9 | import torch 10 | 11 | from realhf.impl.model.nn.flatten_param import ( 12 | _set_intervals_py, 13 | _slice_intervals_py, 14 | set_intervals, 15 | slice_intervals, 16 | ) 17 | 18 | 19 | def make_intervals(maxsize, n_intervals): 20 | assert maxsize // n_intervals > 1 21 | s = maxsize // n_intervals 22 | intervals = [] 23 | interval_size = 0 24 | max_interval_size = 0 25 | for i in range(n_intervals): 26 | intervals.append((i * s, i * s + s // 2)) 27 | interval_size += s // 2 28 | max_interval_size = max(max_interval_size, s // 2) 29 | np.random.shuffle(intervals) 30 | return np.array(intervals, dtype=np.int64), interval_size, max_interval_size 31 | 32 | 33 | def maybe_synchronize_cuda(): 34 | if torch.cuda.is_available(): 35 | torch.cuda.synchronize() 36 | 37 | 38 | @pytest.mark.skipif(not torch.cuda.is_available(), reason="This test requires a GPU.") 39 | @pytest.mark.parametrize( 40 | "n_intervals", list(reversed([1, 100, 500, 1000, 2000, 4000, 10000, 100000])) 41 | ) 42 | @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float32, torch.float16]) 43 | def test_get(n_intervals: int, dtype: torch.dtype): 44 | device = torch.device("cuda") 45 | 46 | input_tensor = torch.randn(int(1e8), device=device, dtype=dtype) 47 | intervals, output_size, max_interval_size = make_intervals( 48 | input_tensor.size(0), n_intervals 49 | ) 50 | intervals_cuda = torch.tensor(intervals, dtype=torch.long, device="cuda") 51 | 52 | # warmup 53 | slice_intervals( 54 | input_tensor, 55 | intervals, 56 | intervals_cuda=intervals_cuda, 57 | output_size=output_size, 58 | max_interval_size=max_interval_size, 59 | ) 60 | _slice_intervals_py(input_tensor, intervals) 61 | 62 | maybe_synchronize_cuda() 63 | tik = time.perf_counter() 64 | for _ in range(10): 65 | output_tensor = slice_intervals( 66 | input_tensor, 67 | intervals, 68 | intervals_cuda=intervals_cuda, 69 | output_size=output_size, 70 | max_interval_size=max_interval_size, 71 | ) 72 | maybe_synchronize_cuda() 73 | t1 = time.perf_counter() - tik 74 | 75 | maybe_synchronize_cuda() 76 | tik = time.perf_counter() 77 | for _ in range(10): 78 | o2 = _slice_intervals_py(input_tensor, intervals) 79 | maybe_synchronize_cuda() 80 | t2 = time.perf_counter() - tik 81 | assert torch.allclose(output_tensor, o2) 82 | print( 83 | f"slice_interval, Success! #intervals: {n_intervals} C++ ext time: {t1:.4f}, PyTorch time: {t2:.4f}" 84 | ) 85 | 86 | 87 | @pytest.mark.skipif(not torch.cuda.is_available(), reason="This test requires a GPU.") 88 | @pytest.mark.parametrize( 89 | "n_intervals", list(reversed([1, 10, 100, 500, 1000, 1000, 10000, 100000])) 90 | ) 91 | @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16, torch.float32]) 92 | @pytest.mark.gpu 93 | def test_set(n_intervals: int, dtype: torch.dtype): 94 | # NOTE: Since the set_intervals degenerate to the python implementation with CPU tensors, 95 | # We don't need to test it with CPU tensors. 
96 | 97 | x = torch.randn(int(1e8), device="cuda", dtype=dtype) 98 | intervals, interval_size, max_interval_size = make_intervals(x.size(0), n_intervals) 99 | intervals_cuda = torch.tensor(intervals, dtype=torch.long, device="cuda") 100 | src = torch.randn(interval_size, device="cuda", dtype=dtype) 101 | 102 | # warmup 103 | input_tensor1 = x.clone() 104 | set_intervals( 105 | src, 106 | input_tensor1, 107 | intervals, 108 | intervals_cuda=intervals_cuda, 109 | max_interval_size=max_interval_size, 110 | ) 111 | input_tensor2 = x.clone() 112 | _set_intervals_py(src, input_tensor2, intervals) 113 | 114 | input_tensor1 = x.clone() 115 | maybe_synchronize_cuda() 116 | tik = time.perf_counter() 117 | for _ in range(10): 118 | set_intervals( 119 | src, 120 | input_tensor1, 121 | intervals, 122 | intervals_cuda=intervals_cuda, 123 | max_interval_size=max_interval_size, 124 | ) 125 | maybe_synchronize_cuda() 126 | t1 = time.perf_counter() - tik 127 | 128 | input_tensor2 = x.clone() 129 | maybe_synchronize_cuda() 130 | tik = time.perf_counter() 131 | for _ in range(10): 132 | _set_intervals_py(src, input_tensor2, intervals) 133 | maybe_synchronize_cuda() 134 | t2 = time.perf_counter() - tik 135 | 136 | assert torch.allclose(input_tensor1, input_tensor2) 137 | print( 138 | f"set_interval, Success! #intervals: {n_intervals}, C++ ext time: {t1:.4f}, PyTorch time: {t2:.4f}" 139 | ) 140 | -------------------------------------------------------------------------------- /tests/model/test_cpu_inference.py: -------------------------------------------------------------------------------- 1 | import dataclasses 2 | from typing import * 3 | 4 | import pytest 5 | import torch 6 | import torch.distributed as dist 7 | import transformers 8 | 9 | from realhf.base import constants, logging, testing 10 | from realhf.impl.model.nn.real_llm_api import add_helper_functions 11 | 12 | logger = logging.getLogger("tests.test_cpu") 13 | 14 | 15 | # NOTE: To run test for a new model class, please implement and register `real_config_maker` 16 | # in realhf.api.from_hf. and add the model class name to the 17 | # `model_class` fixture in this file. 
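# Each name below yields an independent parametrized test run; the `mconfig`
# fixture resolves it to the corresponding `make_<model_class>_config`
# constructor on ReaLModel.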
18 | @pytest.fixture(params=["llama", "gpt2", "qwen2", "gemma", "mistral", "mixtral"]) 19 | def model_class(request): 20 | return request.param 21 | 22 | 23 | def maybe_prepare_cpu_env(max_prompt_len: int): 24 | if not dist.is_initialized(): 25 | # for parametrized runs 26 | dist.init_process_group( 27 | "gloo", rank=0, world_size=1, init_method="tcp://localhost:7777" 28 | ) 29 | import deepspeed 30 | 31 | deepspeed.init_distributed() 32 | testing.init_global_constants( 33 | num_dp=1, 34 | num_mp=1, 35 | num_pp=1, 36 | sequence_parallel=False, 37 | max_prompt_len=max_prompt_len, 38 | ) 39 | assert dist.get_world_size() == 1, dist.get_world_size() 40 | 41 | 42 | @pytest.fixture 43 | def mconfig(model_class): 44 | from realhf.impl.model.nn.real_llm_api import ReaLModel 45 | 46 | mconfig = getattr(ReaLModel, f"make_{model_class}_config")() 47 | return mconfig 48 | 49 | 50 | @pytest.fixture 51 | def save_path(tmpdir_factory: pytest.TempdirFactory): 52 | return tmpdir_factory.mktemp("save_path") 53 | 54 | 55 | @pytest.fixture 56 | def cpu_real_model(model_class, mconfig, save_path): 57 | max_prompt_len = mconfig.n_positions 58 | maybe_prepare_cpu_env(max_prompt_len) 59 | with constants.model_scope(testing.MODEL_NAME): 60 | from realhf.impl.model.nn.real_llm_api import ReaLModel 61 | 62 | model = ReaLModel(mconfig, dtype=torch.float32, device="cpu") 63 | add_helper_functions(model) 64 | model.instantiate() 65 | model.eval() 66 | getattr(model, f"to_{model_class}")(None, save_path) 67 | return model 68 | 69 | 70 | @pytest.fixture 71 | def cpu_hf_model(save_path): 72 | hf_model = transformers.AutoModelForCausalLM.from_pretrained(save_path).to( 73 | torch.float32 74 | ) 75 | hf_model.eval() 76 | return hf_model 77 | 78 | 79 | @torch.no_grad() 80 | def test_inference_cpu_consistency(cpu_real_model, cpu_hf_model, model_class, mconfig): 81 | max_prompt_len = mconfig.n_positions 82 | with constants.model_scope(testing.MODEL_NAME): 83 | bs = 10 84 | torch.manual_seed(1) 85 | input_ids = torch.randint( 86 | 0, mconfig.vocab_size, (bs, max_prompt_len), dtype=torch.long 87 | ) 88 | input_lens = torch.full((bs,), max_prompt_len, dtype=torch.int32) 89 | attention_mask = torch.arange(max_prompt_len)[None, :] < input_lens[:, None] 90 | 91 | logits1 = cpu_hf_model( 92 | input_ids=input_ids, attention_mask=attention_mask 93 | ).logits * attention_mask.unsqueeze(-1) 94 | logits2 = cpu_real_model( 95 | input_ids=input_ids, attention_mask=attention_mask 96 | ).logits * attention_mask.unsqueeze(-1) 97 | 98 | assert torch.allclose(logits1, logits2, atol=1e-4), ( 99 | model_class, 100 | (logits1 - logits2).abs().max(), 101 | ) 102 | --------------------------------------------------------------------------------