├── .clang-format ├── .dockerignore ├── .github └── workflows │ ├── format-check.yml │ ├── publish.yml │ ├── pytest.yml │ └── sphinx.yml ├── .gitignore ├── AUTHORS ├── CHANGES ├── Dockerfile ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── assets └── qrcode.jpg ├── csrc ├── cugae │ └── gae.cu ├── custom_all_reduce │ ├── custom_all_reduce.cu │ ├── custom_all_reduce.cuh │ ├── custom_all_reduce_test.cu │ └── pybind.cpp ├── interval_op │ ├── interval_op.cpp │ └── interval_op.cu └── search │ ├── device_mesh.cpp │ ├── device_mesh.hpp │ ├── rpc.cpp │ ├── rpc.hpp │ ├── search.cpp │ ├── simulate.cpp │ └── simulate.hpp ├── docker-compose.yml ├── docs ├── Makefile ├── make.bat └── source │ ├── _static │ └── custom.css │ ├── arch.rst │ ├── conf.py │ ├── contributing.rst │ ├── customization.rst │ ├── distributed.rst │ ├── expconfig.rst │ ├── images │ ├── dfg │ │ ├── dpo.svg │ │ ├── grpo.svg │ │ ├── ppo.svg │ │ └── reinforce.svg │ ├── experiment_workflow.svg │ ├── ppo_rwd.svg │ ├── real_logo.svg │ ├── real_logo_dark.svg │ ├── rlhf_dfg.svg │ ├── rw_loss.svg │ ├── sft_loss.svg │ ├── timeline.svg │ └── vws.svg │ ├── impl.rst │ ├── index.rst │ ├── install.rst │ ├── intro.rst │ └── quickstart.rst ├── examples ├── cluster_config.json ├── customized_exp │ ├── ppo_ref_ema.py │ ├── ppo_sentiment.py │ └── scripts │ │ ├── run_ppo_ref_ema.sh │ │ └── run_ppo_sentiment.sh ├── load_and_eval_rw.py ├── new_algorithms │ ├── grpo │ │ ├── grpo.sh │ │ ├── grpo_exp.py │ │ └── grpo_interface.py │ └── reinforce │ │ ├── reinforce.sh │ │ ├── reinforce_exp.py │ │ └── reinforce_interface.py ├── profiling │ ├── allocations.jsonl │ ├── datasets.jsonl │ ├── interfaces.jsonl │ ├── models.jsonl │ └── profile.sh ├── scripts │ ├── distributed_ray │ │ ├── dpo.sh │ │ ├── ppo.sh │ │ ├── rw.sh │ │ └── sft.sh │ ├── distributed_slurm │ │ ├── dpo.sh │ │ ├── ppo.sh │ │ ├── rw.sh │ │ └── sft.sh │ └── local │ │ ├── dpo.sh │ │ ├── gen.sh │ │ ├── ppo.sh │ │ ├── ppo_manual.sh │ │ ├── ppo_minibatched.sh │ │ ├── ppo_symm.sh │ │ ├── rw.sh │ │ └── sft.sh └── visualize_dfg.py ├── pyproject.toml ├── pytest.ini ├── realhf ├── __init__.py ├── api │ ├── core │ │ ├── config.py │ │ ├── data_api.py │ │ ├── dfg.py │ │ ├── model_api.py │ │ └── system_api.py │ ├── from_hf │ │ ├── __init__.py │ │ ├── gemma.py │ │ ├── gpt2.py │ │ ├── llama.py │ │ ├── mistral.py │ │ ├── mixtral.py │ │ └── qwen2.py │ └── quickstart │ │ ├── __init__.py │ │ ├── dataset.py │ │ ├── device_mesh.py │ │ ├── entrypoint.py │ │ ├── model.py │ │ └── search.py ├── apps │ ├── __init__.py │ ├── main.py │ ├── profile_layers.py │ ├── quickstart.py │ └── remote.py ├── base │ ├── __init__.py │ ├── asyncio_utils.py │ ├── cluster.py │ ├── constants.py │ ├── datapack.py │ ├── gpu_utils.py │ ├── importing.py │ ├── logging.py │ ├── monitor.py │ ├── name_resolve.py │ ├── names.py │ ├── network.py │ ├── numpy_utils.py │ ├── ray_utils.py │ ├── recover.py │ ├── saveload_utils.py │ ├── security.py │ ├── seeding.py │ ├── slurm_utils.py │ ├── testing.py │ ├── timeutil.py │ └── topology.py ├── experiments │ ├── benchmark │ │ └── profile_exp.py │ └── common │ │ ├── check.py │ │ ├── common.py │ │ ├── dpo_exp.py │ │ ├── gen_exp.py │ │ ├── ppo_exp.py │ │ ├── rw_exp.py │ │ ├── sft_exp.py │ │ └── utils.py ├── impl │ ├── dataset │ │ ├── __init__.py │ │ ├── prompt_answer_dataset.py │ │ ├── prompt_dataset.py │ │ └── rw_paired_dataset.py │ └── model │ │ ├── __init__.py │ │ ├── backend │ │ ├── deepspeed.py │ │ ├── inference.py │ │ ├── megatron.py │ │ └── pipe_runner.py │ │ ├── comm │ │ ├── data_transfer.py │ │ ├── 
global_comm.py │ │ └── param_realloc.py │ │ ├── conversion │ │ └── hf_registry.py │ │ ├── interface │ │ ├── dpo_interface.py │ │ ├── gen_interface.py │ │ ├── ppo_interface.py │ │ ├── rw_interface.py │ │ └── sft_interface.py │ │ ├── modules │ │ ├── __init__.py │ │ ├── activations.py │ │ ├── attn.py │ │ ├── embedding.py │ │ ├── mlp.py │ │ ├── moe │ │ │ ├── __init__.py │ │ │ ├── experts.py │ │ │ ├── layer.py │ │ │ ├── router.py │ │ │ └── token_dispatcher.py │ │ ├── rms.py │ │ └── rotary.py │ │ ├── nn │ │ ├── flatten_param.py │ │ ├── real_llm_api.py │ │ ├── real_llm_base.py │ │ ├── real_llm_generate.py │ │ └── real_llm_parallel.py │ │ ├── parallelism │ │ ├── model_parallel │ │ │ ├── custom_all_reduce.py │ │ │ ├── mappings.py │ │ │ ├── modules.py │ │ │ └── utils.py │ │ └── pipeline_parallel │ │ │ ├── instruction.py │ │ │ ├── p2p.py │ │ │ ├── static_schedule.py │ │ │ └── tensor_storage.py │ │ └── utils │ │ ├── cuda_graph.py │ │ ├── dpo_functional.py │ │ ├── functional.py │ │ ├── logits_warper.py │ │ ├── moe.py │ │ ├── padding.py │ │ ├── ppo_functional.py │ │ └── random.py ├── scheduler │ ├── client.py │ ├── local │ │ └── client.py │ └── slurm │ │ ├── client.py │ │ └── utils.py ├── search_engine │ ├── __init__.py │ ├── enumerate.py │ ├── estimate.py │ ├── layers.py │ ├── param_realloc.py │ ├── search.py │ └── utils.py └── system │ ├── __init__.py │ ├── buffer.py │ ├── controller.py │ ├── master_worker.py │ ├── model_worker.py │ ├── request_reply_stream.py │ ├── worker_base.py │ └── worker_control.py ├── requirements.txt ├── setup.py └── tests ├── __init__.py ├── comm └── test_param_realloc.py ├── cpp_extensions ├── test_cugae.py ├── test_grouped_gemm.py └── test_interval_ops.py ├── data ├── test_dfg.py ├── test_load_data.py └── test_sequence_gather_split.py └── model ├── test_cpu_inference.py ├── test_distributed_load_hf.py └── test_generate.py /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | Language: Cpp 3 | AccessModifierOffset: -1 4 | AlignAfterOpenBracket: Align 5 | AlignConsecutiveAssignments: false 6 | AlignConsecutiveDeclarations: false 7 | AlignEscapedNewlinesLeft: true 8 | AlignOperands: true 9 | AlignTrailingComments: true 10 | AllowAllParametersOfDeclarationOnNextLine: true 11 | AllowShortBlocksOnASingleLine: true 12 | AllowShortCaseLabelsOnASingleLine: true 13 | AllowShortFunctionsOnASingleLine: All 14 | AllowShortIfStatementsOnASingleLine: true 15 | AllowShortLoopsOnASingleLine: true 16 | AlwaysBreakAfterDefinitionReturnType: None 17 | AlwaysBreakAfterReturnType: None 18 | AlwaysBreakBeforeMultilineStrings: false 19 | AlwaysBreakTemplateDeclarations: true 20 | BinPackArguments: true 21 | BinPackParameters: true 22 | BraceWrapping: 23 | AfterClass: true 24 | AfterControlStatement: false 25 | AfterEnum: false 26 | AfterFunction: false 27 | AfterNamespace: false 28 | AfterObjCDeclaration: false 29 | AfterStruct: false 30 | AfterUnion: false 31 | BeforeCatch: false 32 | BeforeElse: false 33 | IndentBraces: false 34 | BreakBeforeBinaryOperators: NonAssignment 35 | BreakBeforeBraces: Attach 36 | BreakBeforeTernaryOperators: true 37 | BreakConstructorInitializersBeforeComma: false 38 | BreakAfterJavaFieldAnnotations: false 39 | BreakStringLiterals: true 40 | ColumnLimit: 100 41 | CommentPragmas: '^ IWYU pragma:' 42 | BreakBeforeInheritanceComma: false 43 | ConstructorInitializerAllOnOneLineOrOnePerLine: true 44 | ConstructorInitializerIndentWidth: 4 45 | ContinuationIndentWidth: 4 46 | Cpp11BracedListStyle: true 47 | 
DisableFormat: false 48 | ExperimentalAutoDetectBinPacking: false 49 | FixNamespaceComments: true 50 | ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ] 51 | IncludeCategories: 52 | - Regex: '^<.*\.h>' 53 | Priority: 1 54 | - Regex: '^<.*' 55 | Priority: 2 56 | - Regex: '.*' 57 | Priority: 3 58 | IncludeIsMainRegex: '([-_](test|unittest))?$' 59 | IndentCaseLabels: true 60 | IndentWidth: 2 61 | IndentWrappedFunctionNames: false 62 | JavaScriptQuotes: Leave 63 | JavaScriptWrapImports: true 64 | KeepEmptyLinesAtTheStartOfBlocks: false 65 | MacroBlockBegin: '' 66 | MacroBlockEnd: '' 67 | MaxEmptyLinesToKeep: 1 68 | NamespaceIndentation: None 69 | ObjCBlockIndentWidth: 2 70 | ObjCSpaceAfterProperty: false 71 | ObjCSpaceBeforeProtocolList: false 72 | PenaltyBreakBeforeFirstCallParameter: 1 73 | PenaltyBreakComment: 300 74 | PenaltyBreakFirstLessLess: 120 75 | PenaltyBreakString: 1000 76 | PenaltyExcessCharacter: 1000000 77 | PenaltyReturnTypeOnItsOwnLine: 200 78 | PointerAlignment: Right 79 | ReflowComments: true 80 | SortIncludes: false 81 | SpaceAfterCStyleCast: false 82 | SpaceAfterTemplateKeyword: false 83 | SpaceBeforeAssignmentOperators: true 84 | SpaceBeforeParens: ControlStatements 85 | SpaceInEmptyParentheses: false 86 | SpacesBeforeTrailingComments: 2 87 | SpacesInAngles: false 88 | SpacesInContainerLiterals: true 89 | SpacesInCStyleCastParentheses: false 90 | SpacesInParentheses: false 91 | SpacesInSquareBrackets: false 92 | Standard: Auto 93 | TabWidth: 8 94 | UseTab: Never 95 | ... 96 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | # Legacy codes 2 | .legacy/ 3 | .data/ 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | trace_result/ 10 | profile_result/ 11 | 12 | slurm_outs 13 | _data 14 | *.nfs* 15 | output 16 | logs 17 | 18 | # C extensions 19 | *.so 20 | 21 | # Distribution / packaging 22 | .Python 23 | build/ 24 | develop-eggs/ 25 | # dist/ 26 | downloads/ 27 | eggs/ 28 | .eggs/ 29 | lib/ 30 | lib64/ 31 | parts/ 32 | sdist/ 33 | var/ 34 | wheels/ 35 | share/python-wheels/ 36 | *.egg-info/ 37 | .installed.cfg 38 | *.egg 39 | MANIFEST 40 | 41 | # PyInstaller 42 | # Usually these files are written by a python script from a template 43 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
44 | *.manifest 45 | *.spec 46 | 47 | # Installer logs 48 | pip-log.txt 49 | pip-delete-this-directory.txt 50 | 51 | # Unit test / coverage reports 52 | htmlcov/ 53 | .tox/ 54 | .nox/ 55 | .coverage 56 | .coverage.* 57 | .cache 58 | nosetests.xml 59 | coverage.xml 60 | *.cover 61 | *.py,cover 62 | .hypothesis/ 63 | .pytest_cache/ 64 | cover/ 65 | 66 | # Translations 67 | *.mo 68 | *.pot 69 | 70 | # Django stuff: 71 | *.log 72 | local_settings.py 73 | db.sqlite3 74 | db.sqlite3-journal 75 | 76 | # Flask stuff: 77 | instance/ 78 | .webassets-cache 79 | 80 | # Scrapy stuff: 81 | .scrapy 82 | 83 | # Sphinx documentation 84 | docs/_build/ 85 | 86 | # PyBuilder 87 | .pybuilder/ 88 | target/ 89 | 90 | # Jupyter Notebook 91 | .ipynb_checkpoints 92 | 93 | # IPython 94 | profile_default/ 95 | ipython_config.py 96 | 97 | # pyenv 98 | # For a library or package, you might want to ignore these files since the code is 99 | # intended to run in multiple environments; otherwise, check them in: 100 | # .python-version 101 | 102 | # pipenv 103 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 104 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 105 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 106 | # install all needed dependencies. 107 | #Pipfile.lock 108 | 109 | # poetry 110 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 111 | # This is especially recommended for binary packages to ensure reproducibility, and is more 112 | # commonly ignored for libraries. 113 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 114 | #poetry.lock 115 | 116 | # pdm 117 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 118 | #pdm.lock 119 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 120 | # in version control. 121 | # https://pdm.fming.dev/#use-with-ide 122 | .pdm.toml 123 | 124 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 125 | __pypackages__/ 126 | 127 | # Celery stuff 128 | celerybeat-schedule 129 | celerybeat.pid 130 | 131 | # SageMath parsed files 132 | *.sage.py 133 | 134 | # Environments 135 | .env 136 | .venv 137 | env/ 138 | venv/ 139 | ENV/ 140 | env.bak/ 141 | venv.bak/ 142 | 143 | # Spyder project settings 144 | .spyderproject 145 | .spyproject 146 | 147 | # Rope project settings 148 | .ropeproject 149 | 150 | # mkdocs documentation 151 | /site 152 | 153 | # mypy 154 | .mypy_cache/ 155 | .dmypy.json 156 | dmypy.json 157 | 158 | # Pyre type checker 159 | .pyre/ 160 | 161 | # pytype static type analyzer 162 | .pytype/ 163 | 164 | # Cython debug symbols 165 | cython_debug/ 166 | 167 | # PyCharm 168 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 169 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 170 | # and can be added to the global gitignore or merged into this file. For a more nuclear 171 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
172 | #.idea/ 173 | 174 | # openai api key 175 | api_key.txt 176 | api_key.json 177 | 178 | ./*.sh 179 | *.png 180 | *.jpg 181 | *.pdf 182 | 183 | .vscode/ -------------------------------------------------------------------------------- /.github/workflows/format-check.yml: -------------------------------------------------------------------------------- 1 | name: Check Formatting 2 | 3 | on: [pull_request] 4 | 5 | jobs: 6 | check_formatting: 7 | runs-on: ubuntu-latest 8 | 9 | steps: 10 | - name: Checkout repository 11 | uses: actions/checkout@v3 12 | 13 | - name: Set up Python 14 | uses: actions/setup-python@v4 15 | with: 16 | python-version: '3.10' 17 | 18 | - name: Install Python dependencies 19 | run: | 20 | python3 -m pip install --upgrade pip 21 | pip install isort black clang-format 22 | 23 | - name: Check Python formatting with isort 24 | run: isort --check-only . 25 | 26 | - name: Check Python formatting with black 27 | run: black --check . 28 | 29 | - name: Check C++ formatting 30 | run: | 31 | find . -type f \( -name '*.c' -o -name '*.h' -o -name '*.cpp' -o -name '*.hpp' -o -name '*.cu' -o -name '*.cuh' \) -exec clang-format --dry-run --Werror {} + 32 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish Python Package 2 | 3 | on: 4 | release: 5 | types: [created] 6 | 7 | jobs: 8 | build: 9 | runs-on: ubuntu-latest 10 | 11 | steps: 12 | - name: Check out repository 13 | uses: actions/checkout@v3 14 | 15 | - name: Set up Python 16 | uses: actions/setup-python@v4 17 | with: 18 | python-version: '3.10' # Specify the Python version you need 19 | 20 | - name: Install dependencies 21 | run: | 22 | python3 -m pip install --upgrade pip 23 | python3 -m pip install -r requirements.txt 24 | 25 | - name: Build package 26 | run: python3 -m build -n --sdist 27 | 28 | - name: Publish to PyPI 29 | env: 30 | TWINE_USERNAME: __token__ 31 | TWINE_PASSWORD: ${{ secrets.PIP_TOKEN }} 32 | run: | 33 | python3 -m pip install twine 34 | twine upload dist/* -------------------------------------------------------------------------------- /.github/workflows/pytest.yml: -------------------------------------------------------------------------------- 1 | name: Run Pytest 2 | 3 | on: [pull_request] 4 | 5 | jobs: 6 | build: 7 | runs-on: ubuntu-latest 8 | 9 | steps: 10 | - name: Check out repository 11 | uses: actions/checkout@v3 12 | 13 | - name: Set up Python 14 | uses: actions/setup-python@v4 15 | with: 16 | python-version: '3.10' 17 | 18 | - name: Install dependencies 19 | run: | 20 | python3 -m pip install --upgrade pip 21 | python3 -m pip install -r requirements.txt 22 | python3 -m pip install pytest 23 | python3 -m pip install torch==2.3.1 24 | python3 -m pip install -e . 
--no-build-isolation 25 | 26 | - name: Run tests 27 | run: | 28 | pytest -m "not gpu" 29 | -------------------------------------------------------------------------------- /.github/workflows/sphinx.yml: -------------------------------------------------------------------------------- 1 | name: "Sphinx: Render docs" 2 | 3 | on: push 4 | 5 | jobs: 6 | build: 7 | runs-on: ubuntu-latest 8 | permissions: 9 | contents: write 10 | steps: 11 | - uses: actions/checkout@v4 12 | - name: Build HTML 13 | uses: garrett4wade/sphinx-action@master 14 | - name: Upload artifacts 15 | uses: actions/upload-artifact@v4 16 | with: 17 | name: html-docs 18 | path: docs/build/html/ 19 | - name: Deploy 20 | uses: peaceiris/actions-gh-pages@v3 21 | if: github.ref == 'refs/heads/main' 22 | with: 23 | github_token: ${{ secrets.GITHUB_TOKEN }} 24 | publish_dir: docs/build/html 25 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Legacy codes 2 | .legacy/ 3 | .data/ 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | trace_result/ 10 | profile_result/ 11 | 12 | slurm_outs 13 | _data 14 | *.nfs* 15 | output 16 | logs 17 | 18 | # C extensions 19 | *.so 20 | 21 | # Distribution / packaging 22 | .Python 23 | build/ 24 | develop-eggs/ 25 | dist/ 26 | downloads/ 27 | eggs/ 28 | .eggs/ 29 | lib/ 30 | lib64/ 31 | parts/ 32 | sdist/ 33 | var/ 34 | wheels/ 35 | share/python-wheels/ 36 | *.egg-info/ 37 | .installed.cfg 38 | *.egg 39 | MANIFEST 40 | 41 | # PyInstaller 42 | # Usually these files are written by a python script from a template 43 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 44 | *.manifest 45 | *.spec 46 | 47 | # Installer logs 48 | pip-log.txt 49 | pip-delete-this-directory.txt 50 | 51 | # Unit test / coverage reports 52 | htmlcov/ 53 | .tox/ 54 | .nox/ 55 | .coverage 56 | .coverage.* 57 | .cache 58 | nosetests.xml 59 | coverage.xml 60 | *.cover 61 | *.py,cover 62 | .hypothesis/ 63 | .pytest_cache/ 64 | cover/ 65 | 66 | # Translations 67 | *.mo 68 | *.pot 69 | 70 | # Django stuff: 71 | *.log 72 | local_settings.py 73 | db.sqlite3 74 | db.sqlite3-journal 75 | 76 | # Flask stuff: 77 | instance/ 78 | .webassets-cache 79 | 80 | # Scrapy stuff: 81 | .scrapy 82 | 83 | # Sphinx documentation 84 | docs/_build/ 85 | 86 | # PyBuilder 87 | .pybuilder/ 88 | target/ 89 | 90 | # Jupyter Notebook 91 | .ipynb_checkpoints 92 | 93 | # IPython 94 | profile_default/ 95 | ipython_config.py 96 | 97 | # pyenv 98 | # For a library or package, you might want to ignore these files since the code is 99 | # intended to run in multiple environments; otherwise, check them in: 100 | # .python-version 101 | 102 | # pipenv 103 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 104 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 105 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 106 | # install all needed dependencies. 107 | #Pipfile.lock 108 | 109 | # poetry 110 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 111 | # This is especially recommended for binary packages to ensure reproducibility, and is more 112 | # commonly ignored for libraries. 
113 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 114 | #poetry.lock 115 | 116 | # pdm 117 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 118 | #pdm.lock 119 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 120 | # in version control. 121 | # https://pdm.fming.dev/#use-with-ide 122 | .pdm.toml 123 | 124 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 125 | __pypackages__/ 126 | 127 | # Celery stuff 128 | celerybeat-schedule 129 | celerybeat.pid 130 | 131 | # SageMath parsed files 132 | *.sage.py 133 | 134 | # Environments 135 | .env 136 | .venv 137 | env/ 138 | venv/ 139 | ENV/ 140 | env.bak/ 141 | venv.bak/ 142 | 143 | # Spyder project settings 144 | .spyderproject 145 | .spyproject 146 | 147 | # Rope project settings 148 | .ropeproject 149 | 150 | # mkdocs documentation 151 | /site 152 | 153 | # mypy 154 | .mypy_cache/ 155 | .dmypy.json 156 | dmypy.json 157 | 158 | # Pyre type checker 159 | .pyre/ 160 | 161 | # pytype static type analyzer 162 | .pytype/ 163 | 164 | # Cython debug symbols 165 | cython_debug/ 166 | 167 | # PyCharm 168 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 169 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 170 | # and can be added to the global gitignore or merged into this file. For a more nuclear 171 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 172 | #.idea/ 173 | 174 | # openai api key 175 | api_key.txt 176 | api_key.json 177 | 178 | ./*.sh 179 | *.png 180 | *.pdf 181 | 182 | .vscode/ -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | Zhiyu Mei, meizy20@mails.tsinghua.edu.cn 2 | Wei Fu, fuwth17@gmail.com -------------------------------------------------------------------------------- /CHANGES: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openpsi-project/ReaLHF/be75fce9931acb9298270fdda08fdca46b6ee8ba/CHANGES -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | ARG REAL_CPU_BASE_IMAGE 2 | ARG REAL_GPU_BASE_IMAGE 3 | 4 | # >>>>>> CPU image 5 | FROM ${REAL_CPU_BASE_IMAGE} as cpu 6 | 7 | ENV DEBIAN_FRONTEND=noninteractive 8 | RUN apt update 9 | RUN apt install -y ca-certificates 10 | RUN sed -i "s@http://.*archive.ubuntu.com@https://mirrors.tuna.tsinghua.edu.cn@g" /etc/apt/sources.list 11 | RUN sed -i "s@http://.*security.ubuntu.com@https://mirrors.tuna.tsinghua.edu.cn@g" /etc/apt/sources.list 12 | RUN apt update 13 | RUN apt install -y net-tools python3-pip pkg-config libopenblas-base libopenmpi-dev git 14 | 15 | RUN pip3 install -U pip 16 | RUN pip3 config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple 17 | # Install PyTorch in advance to prevent rebuilding this large Docker layer. 18 | RUN pip3 install torch==2.3.1 19 | 20 | COPY ./requirements.txt /requirements.txt 21 | RUN pip3 install -r /requirements.txt && rm /requirements.txt 22 | 23 | COPY . 
/realhf 24 | RUN REAL_CUDA=0 pip3 install -e /realhf --no-build-isolation 25 | WORKDIR /realhf 26 | 27 | # >>>>>> Documentation images 28 | # FROM cpu AS docs-builder 29 | # RUN pip install -U sphinx sphinx-nefertiti -i https://pypi.tuna.tsinghua.edu.cn/simple 30 | # RUN sphinx-build -M html /realhf/docs/source/ /realhf/docs/build/ 31 | FROM nginx:alpine AS docs 32 | COPY ./docs/build/html /usr/share/nginx/html 33 | EXPOSE 80 34 | CMD ["nginx", "-g", "daemon off;"] 35 | 36 | # >>>>>> GPU image 37 | FROM ${REAL_GPU_BASE_IMAGE} AS gpu 38 | 39 | ENV DEBIAN_FRONTEND=noninteractive 40 | RUN apt update 41 | RUN apt install -y ca-certificates 42 | RUN sed -i "s@http://.*archive.ubuntu.com@https://mirrors.tuna.tsinghua.edu.cn@g" /etc/apt/sources.list 43 | RUN sed -i "s@http://.*security.ubuntu.com@https://mirrors.tuna.tsinghua.edu.cn@g" /etc/apt/sources.list 44 | RUN apt update 45 | RUN apt install -y net-tools \ 46 | libibverbs-dev librdmacm-dev ibverbs-utils \ 47 | rdmacm-utils python3-pyverbs opensm ibutils perftest 48 | 49 | RUN pip3 install -U pip 50 | RUN pip3 config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple 51 | 52 | # set environment variables for building transformer engine 53 | ENV NVTE_WITH_USERBUFFERS=1 NVTE_FRAMEWORK=pytorch MAX_JOBS=8 MPI_HOME=/usr/local/mpi 54 | ENV PATH="${PATH}:/opt/hpcx/ompi/bin:/opt/hpcx/ucx/bin" 55 | ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/hpcx/ompi/lib:/opt/hpcx/ucx/lib/" 56 | 57 | COPY ./requirements.txt /requirements.txt 58 | RUN pip3 install -r /requirements.txt && rm /requirements.txt 59 | 60 | # We don't use TransformerEngine's flash-attn integration, so it's okay to disrespect dependencies 61 | RUN pip3 install git+https://github.com/NVIDIA/TransformerEngine.git@v1.8 --no-deps --no-build-isolation 62 | RUN pip3 install flash-attn==2.4.2 --no-build-isolation 63 | # Install grouped_gemm for MoE acceleration 64 | RUN pip3 install git+https://github.com/tgale96/grouped_gemm.git@v0.1.4 --no-build-isolation --no-deps 65 | 66 | COPY . 
/realhf 67 | RUN REAL_CUDA=1 pip3 install -e /realhf --no-build-isolation 68 | WORKDIR /realhf -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include csrc *.cu *.cuh *.hpp *.cpp -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: docs 2 | 3 | docs: 4 | docker compose down 5 | cd docs && make html 6 | docker compose up --build -------------------------------------------------------------------------------- /assets/qrcode.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openpsi-project/ReaLHF/be75fce9931acb9298270fdda08fdca46b6ee8ba/assets/qrcode.jpg -------------------------------------------------------------------------------- /csrc/custom_all_reduce/pybind.cpp: -------------------------------------------------------------------------------- 1 | /* Copied from the vLLM project: https://github.com/vllm-project/vllm */ 2 | #include 3 | 4 | using fptr_t = uint64_t; 5 | fptr_t init_custom_ar(torch::Tensor &meta, torch::Tensor &rank_data, 6 | const std::vector &handles, const std::vector &offsets, 7 | int rank, bool full_nvlink); 8 | bool should_custom_ar(torch::Tensor &inp, int max_size, int world_size, bool full_nvlink); 9 | void all_reduce_reg(fptr_t _fa, torch::Tensor &inp, torch::Tensor &out); 10 | void all_reduce_unreg(fptr_t _fa, torch::Tensor &inp, torch::Tensor ®_buffer, 11 | torch::Tensor &out); 12 | void dispose(fptr_t _fa); 13 | int meta_size(); 14 | void register_buffer(fptr_t _fa, torch::Tensor &t, const std::vector &handles, 15 | const std::vector &offsets); 16 | std::pair, std::vector> get_graph_buffer_ipc_meta(fptr_t _fa); 17 | void register_graph_buffers(fptr_t _fa, const std::vector &handles, 18 | const std::vector> &offsets); 19 | 20 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 21 | // vLLM custom all-reduce kernels 22 | pybind11::module custom_ar = m.def_submodule("custom_ar", "custom allreduce"); 23 | custom_ar.def("init_custom_ar", &init_custom_ar, "init_custom_ar"); 24 | custom_ar.def("should_custom_ar", &should_custom_ar, "should_custom_ar"); 25 | custom_ar.def("all_reduce_reg", &all_reduce_reg, "all_reduce_reg"); 26 | custom_ar.def("all_reduce_unreg", &all_reduce_unreg, "all_reduce_unreg"); 27 | custom_ar.def("dispose", &dispose, "dispose"); 28 | custom_ar.def("meta_size", &meta_size, "meta_size"); 29 | custom_ar.def("register_buffer", ®ister_buffer, "register_buffer"); 30 | custom_ar.def("get_graph_buffer_ipc_meta", &get_graph_buffer_ipc_meta, 31 | "get_graph_buffer_ipc_meta"); 32 | custom_ar.def("register_graph_buffers", ®ister_graph_buffers, "register_graph_buffers"); 33 | } 34 | -------------------------------------------------------------------------------- /csrc/interval_op/interval_op.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | std::vector> merge_intervals( 5 | std::vector> intervals) { 6 | if (intervals.empty()) { return {}; } 7 | 8 | std::vector> merged; 9 | merged.push_back(intervals[0]); 10 | 11 | for (size_t i = 1; i < intervals.size(); ++i) { 12 | auto &lastInterval = merged.back(); 13 | const auto ¤tInterval = intervals[i]; 14 | 15 | if (lastInterval.second == currentInterval.first) { 16 | // Merge the intervals 17 | lastInterval.second = 
currentInterval.second; 18 | } else { 19 | // Add the current interval as it is 20 | merged.push_back(currentInterval); 21 | } 22 | } 23 | 24 | return merged; 25 | } 26 | 27 | PYBIND11_MODULE(interval_op, m) { 28 | m.def("merge_intervals", &merge_intervals, "Merge non-overlapping intervals."); 29 | } 30 | -------------------------------------------------------------------------------- /csrc/interval_op/interval_op.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #define CHECK_DEVICE(x) TORCH_CHECK(x.is_cuda(), #x " must be on CUDA") 7 | #define CHECK_SHAPE(x, ...) \ 8 | TORCH_CHECK(x.sizes() == torch::IntArrayRef({__VA_ARGS__}), \ 9 | #x " must have shape (" #__VA_ARGS__ ")") 10 | #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") 11 | 12 | template 13 | __global__ void copyDataKernel(T *dst, const T *src, long *dst_offsets, long *src_offsets, 14 | long *sizes, long N) { 15 | long interval_id = blockIdx.x * blockDim.x + threadIdx.x; 16 | if (interval_id >= N) { return; } 17 | long chunk_id = blockIdx.y * blockDim.y + threadIdx.y; 18 | long interval_size = sizes[interval_id]; 19 | long chunk_offset = chunk_id * chunk_size; 20 | if (chunk_offset >= interval_size) { return; } 21 | long dst_offset = dst_offsets[interval_id]; 22 | long src_offset = src_offsets[interval_id]; 23 | long _size = interval_size - chunk_offset; 24 | long size = (chunk_size < _size) ? chunk_size : _size; 25 | memcpy(dst + dst_offset + chunk_offset, src + src_offset + chunk_offset, size * sizeof(T)); 26 | } 27 | 28 | template 29 | void set_intervals(const at::Tensor src, at::Tensor dst, const at::Tensor intervals, 30 | int max_interval_size) { 31 | CHECK_DEVICE(src); 32 | CHECK_DEVICE(dst); 33 | CHECK_DEVICE(intervals); 34 | 35 | CHECK_CONTIGUOUS(src); 36 | CHECK_CONTIGUOUS(dst); 37 | CHECK_CONTIGUOUS(intervals); 38 | 39 | TORCH_CHECK(src.dtype() == dst.dtype(), 40 | "Source and destination tensors must have the same dtype"); 41 | 42 | TORCH_CHECK(intervals.dtype() == torch::kLong, "intervals must be of type long"); 43 | 44 | long N = intervals.size(0); 45 | CHECK_SHAPE(intervals, N, 2); 46 | 47 | at::Tensor interval_sizes = intervals.select(1, 1) - intervals.select(1, 0); 48 | at::Tensor dst_offsets = intervals.select(1, 0).contiguous(); 49 | at::Tensor src_offsets = interval_sizes.cumsum(0, at::kLong) - interval_sizes; 50 | 51 | // Launch CUDA kernel 52 | const int threads_per_block_x = 32; 53 | const int threads_per_block_y = 32; 54 | 55 | const int num_blocks_x = (N + threads_per_block_x - 1) / threads_per_block_x; 56 | 57 | const int n_chunks = (max_interval_size + chunk_size - 1) / chunk_size; 58 | const int num_blocks_y = (n_chunks + threads_per_block_y - 1) / threads_per_block_y; 59 | 60 | const dim3 numBlocks(num_blocks_x, num_blocks_y); 61 | const dim3 threadsPerBlock(threads_per_block_x, threads_per_block_y); 62 | 63 | copyDataKernel<<>>( 64 | dst.data_ptr(), src.data_ptr(), dst_offsets.data_ptr(), 65 | src_offsets.data_ptr(), interval_sizes.data_ptr(), N); 66 | } 67 | 68 | template 69 | at::Tensor slice_intervals(const at::Tensor src, const at::Tensor intervals, long total_size, 70 | int max_interval_size) { 71 | CHECK_DEVICE(src); 72 | CHECK_DEVICE(intervals); 73 | 74 | CHECK_CONTIGUOUS(src); 75 | CHECK_CONTIGUOUS(intervals); 76 | 77 | TORCH_CHECK(intervals.dtype() == torch::kLong, "intervals must be of type long"); 78 | 79 | long N = intervals.size(0); 80 | 
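// `intervals` is an (N, 2) tensor of long [start, end) offsets into the flattened `src`;
// the N slices are gathered and packed back-to-back into a contiguous `dst` of length
// `total_size`, using the exclusive prefix sum of the slice sizes as destination offsets.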
CHECK_SHAPE(intervals, N, 2); 81 | 82 | at::Tensor dst = at::empty({total_size}, src.options()); 83 | 84 | at::Tensor interval_sizes = intervals.select(1, 1) - intervals.select(1, 0); 85 | at::Tensor src_offsets = intervals.select(1, 0).contiguous(); 86 | at::Tensor dst_offsets = interval_sizes.cumsum(0, at::kLong) - interval_sizes; 87 | 88 | // Launch CUDA kernel 89 | const int threads_per_block_x = 32; 90 | const int threads_per_block_y = 32; 91 | 92 | const int num_blocks_x = (N + threads_per_block_x - 1) / threads_per_block_x; 93 | 94 | const int n_chunks = (max_interval_size + chunk_size - 1) / chunk_size; 95 | const int num_blocks_y = (n_chunks + threads_per_block_y - 1) / threads_per_block_y; 96 | 97 | const dim3 numBlocks(num_blocks_x, num_blocks_y); 98 | const dim3 threadsPerBlock(threads_per_block_x, threads_per_block_y); 99 | 100 | copyDataKernel<<>>( 101 | dst.data_ptr(), src.data_ptr(), dst_offsets.data_ptr(), 102 | src_offsets.data_ptr(), interval_sizes.data_ptr(), N); 103 | return dst; 104 | } 105 | 106 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 107 | m.def("set_intervals_fp32", &set_intervals, "Set intervals of a 1D tensor"); 108 | m.def("set_intervals_fp16", &set_intervals, "Set intervals of a 1D tensor"); 109 | m.def("set_intervals_bf16", &set_intervals, "Set intervals of a 1D tensor"); 110 | m.def("slice_intervals_fp32", &slice_intervals, "slice intervals of a 1D tensor"); 111 | m.def("slice_intervals_fp16", &slice_intervals, "slice intervals of a 1D tensor"); 112 | m.def("slice_intervals_bf16", &slice_intervals, 113 | "slice intervals of a 1D tensor"); 114 | } 115 | -------------------------------------------------------------------------------- /csrc/search/device_mesh.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | // DeviceMesh::DeviceMesh() 6 | // : device_mesh_name(""), n_nodes(0), n_gpus(0), node_names({}), gpu_ids({}) { 7 | // }; 8 | 9 | DeviceMesh::DeviceMesh(int n_nodes, int n_gpus_per_node, std::vector> mapping, 10 | std::string global_mesh_name, std::string name) 11 | : n_nodes(n_nodes), 12 | n_gpus_per_node(n_gpus_per_node), 13 | mapping(mapping), 14 | global_mesh_name(global_mesh_name), 15 | name(name) { 16 | assert(n_nodes == static_cast(mapping.size())); 17 | for (int i = 0; i < n_nodes; i++) { 18 | assert(n_gpus_per_node == static_cast(mapping[i].size())); 19 | } 20 | }; 21 | 22 | bool is_all_overlap(std::vector device_meshes, DeviceMesh device_mesh) { 23 | for (DeviceMesh *other : device_meshes) { 24 | if (!device_mesh.overlap(*other)) return false; 25 | } 26 | return true; 27 | }; 28 | 29 | bool is_all_overlap(std::unordered_set device_meshes, DeviceMesh device_mesh) { 30 | for (DeviceMesh *other : device_meshes) { 31 | if (!device_mesh.overlap(*other)) return false; 32 | } 33 | return true; 34 | }; 35 | 36 | bool DeviceMesh::contain(const DeviceMesh &other) { 37 | // check whether one device mapping is contained by another by 38 | // checking 1. whether global_mesh_name is identical 39 | // 2. 
whether mapping of one device mesh is contained by the other one 40 | if (global_mesh_name != other.global_mesh_name) return false; 41 | for (int i = 0; i < n_nodes; i++) { 42 | for (int j = 0; j < n_gpus_per_node; j++) { 43 | if (mapping[i][j] == 0 && other.mapping[i][j] == 1) return false; 44 | } 45 | } 46 | return true; 47 | }; 48 | 49 | bool DeviceMesh::contained_by(const DeviceMesh &other) { 50 | if (global_mesh_name != other.global_mesh_name) return false; 51 | for (int i = 0; i < n_nodes; i++) { 52 | for (int j = 0; j < n_gpus_per_node; j++) { 53 | if (mapping[i][j] == 1 && other.mapping[i][j] == 0) return false; 54 | } 55 | } 56 | return true; 57 | }; 58 | 59 | bool DeviceMesh::overlap(const DeviceMesh &other) { 60 | if (global_mesh_name != other.global_mesh_name) return false; 61 | for (int i = 0; i < n_nodes; i++) { 62 | for (int j = 0; j < n_gpus_per_node; j++) { 63 | if (mapping[i][j] == 1 && other.mapping[i][j] == 1) return true; 64 | } 65 | } 66 | return false; 67 | }; 68 | 69 | ModelParallelStrategy::ModelParallelStrategy(int num_pp, int num_dp, int num_mp) 70 | : num_pp(num_pp), num_dp(num_dp), num_mp(num_mp) {}; 71 | 72 | bool ModelParallelStrategy::operator==(const ModelParallelStrategy &other) const { 73 | return num_pp == other.num_pp && num_dp == other.num_dp && num_mp == other.num_mp; 74 | }; 75 | 76 | bool DeviceMesh::operator==(const DeviceMesh &other) const { 77 | return name == other.name && global_mesh_name == other.global_mesh_name; 78 | }; 79 | 80 | std::string ModelParallelStrategy::to_string() { 81 | return "num_pp:" + std::to_string(num_pp) + ";" + "num_dp:" + std::to_string(num_dp) + ";" 82 | + "num_mp:" + std::to_string(num_mp); 83 | }; 84 | 85 | std::string ModelParallelStrategy::to_key() { 86 | return std::to_string(num_pp) + "," + std::to_string(num_mp) + "," + std::to_string(num_dp); 87 | } -------------------------------------------------------------------------------- /csrc/search/device_mesh.hpp: -------------------------------------------------------------------------------- 1 | #ifndef DEVICE_MESH_HPP 2 | #define DEVICE_MESH_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | // #include 9 | 10 | class RPCInstance; 11 | 12 | class DeviceMesh { 13 | public: 14 | int n_nodes; 15 | int n_gpus_per_node; 16 | std::vector> mapping; 17 | std::string global_mesh_name; 18 | std::string name; 19 | RPCInstance *pre_task = nullptr; 20 | 21 | // DeviceMesh(); 22 | DeviceMesh(int n_nodes, int n_gpus_per_node, std::vector> mapping, 23 | std::string global_mesh_name, std::string name); 24 | 25 | bool overlap(const DeviceMesh &other); 26 | bool contain(const DeviceMesh &other); 27 | bool contained_by(const DeviceMesh &other); 28 | 29 | bool operator==(const DeviceMesh &other) const; 30 | }; 31 | 32 | bool is_all_overlap(std::vector device_meshes, DeviceMesh device_mesh); 33 | bool is_all_overlap(std::unordered_set device_meshes, DeviceMesh device_mesh); 34 | 35 | class ModelParallelStrategy { 36 | public: 37 | int num_pp, num_dp, num_mp; 38 | 39 | ModelParallelStrategy(int num_pp, int num_dp, int num_mp); 40 | 41 | bool operator==(const ModelParallelStrategy &other) const; 42 | 43 | std::string to_string(); 44 | std::string to_key(); 45 | }; 46 | 47 | class ModelDeviceMapping {}; 48 | 49 | #endif // DEVICE_MESH_HPP -------------------------------------------------------------------------------- /csrc/search/rpc.hpp: -------------------------------------------------------------------------------- 1 | #ifndef RPC_HPP 2 | #define RPC_HPP 3 | 4 | #include 
5 | #include 6 | #include 7 | #include 8 | 9 | class CommStats { 10 | public: 11 | uint64_t local_send, local_recv, remote_send, remote_recv, offload_store, offload_load; 12 | 13 | CommStats(uint64_t local_send, uint64_t local_recv, uint64_t remote_send, uint64_t remote_recv, 14 | uint64_t offload_store, uint64_t offload_load); 15 | }; 16 | 17 | class RPC { 18 | public: 19 | std::string model_name; 20 | std::string rpc_name; 21 | // interface_type: 0=generate, 1=train_step, 2=inference 22 | std::string interface_type; 23 | 24 | RPC(std::string model_name, std::string rpc_name, std::string interface_type); 25 | }; 26 | 27 | class RPCExecution { 28 | public: 29 | RPC *rpc_ptr; 30 | DeviceMesh &device_mesh; 31 | ModelParallelStrategy &model_parallel_strategy; 32 | uint64_t time_cost, mem, static_mem; 33 | 34 | RPCExecution(RPC *rpc_ptr, DeviceMesh &device_mesh, 35 | ModelParallelStrategy &model_parallel_strategy, uint64_t time_cost, uint64_t mem, 36 | uint64_t static_mem); 37 | 38 | std::string to_string(); 39 | }; 40 | 41 | class OverlapGroup { 42 | public: 43 | std::unordered_set rpc_executions; 44 | std::unordered_set device_meshes; 45 | uint64_t mem_static; 46 | uint64_t mem_active; 47 | 48 | bool maybe_add(RPCExecution *rpc_exe); 49 | }; 50 | 51 | class DeviceMeshGroup { 52 | public: 53 | // std::string device_mesh_name; 54 | std::vector overlap_groups; 55 | 56 | void add_to_groups(RPCExecution *rpc_exe); 57 | }; 58 | 59 | class GroupedRPCExecutions { 60 | public: 61 | // std::unordered_map dn_to_group; 62 | DeviceMeshGroup group; 63 | 64 | void add(RPCExecution *rpc_exe); 65 | void resolve(RPCExecution *rpc_exe); 66 | void offload(std::string model_name); 67 | uint64_t total_mem_cost(); 68 | }; 69 | 70 | class RPCInstance { 71 | public: 72 | RPC *rpc_ptr; 73 | int id; 74 | std::string name; 75 | std::vector children; 76 | std::vector parents; 77 | std::vector tmp_children; 78 | std::vector tmp_parents; 79 | std::vector tmp_ris; // pointers to tmp rpc instances 80 | std::vector tmp_exes; // pointers to tmp rpc executions 81 | 82 | RPCExecution *rpc_exe_ptr = nullptr; 83 | RPCExecution *param_sync_rpc_exe_ptr = nullptr; 84 | bool param_sync = false; 85 | uint64_t param_sync_size = 0; 86 | bool offload = false; 87 | uint64_t offload_size = 0; 88 | 89 | RPCInstance(RPC *rpc_ptr, int id, std::string name); 90 | 91 | uint64_t ready_time = 0, start_time = 0, end_time = 0; 92 | 93 | void remove_parent(RPCInstance *parent); 94 | void remove_child(RPCInstance *child); 95 | void add_parent(RPCInstance *parent); 96 | void add_child(RPCInstance *child); 97 | 98 | void add_tmp_parent(RPCInstance *parent); 99 | void add_tmp_child(RPCInstance *child); 100 | void remove_tmp_parent(RPCInstance *parent); 101 | void remove_tmp_child(RPCInstance *child); 102 | 103 | void resolve_parameter_sync(std::vector tmp_graph, 104 | std::unordered_map &cost_table); 105 | // void resolve_offload(std::vector tmp_graph, 106 | // CommStats& comm_stats); 107 | }; 108 | 109 | uint64_t parameter_sync_cost(uint64_t param_size_bytes, RPCExecution *src, RPCExecution *dst, 110 | std::unordered_map &cost_table); 111 | 112 | uint64_t remote_param_sync_size(uint64_t size, RPCExecution *src, RPCExecution *dst); 113 | 114 | // class ModelConfig { 115 | // std::string model_name; 116 | // uint64_t param_size_bytes; 117 | 118 | // ModelConfig(std::string model_name, uint64_t param_size_bytes); 119 | // }; 120 | 121 | #endif -------------------------------------------------------------------------------- /csrc/search/simulate.hpp: 
-------------------------------------------------------------------------------- 1 | #ifndef SIMULATE_HPP 2 | #define SIMULATE_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | class SimulateResult { 10 | public: 11 | uint64_t end_time; 12 | bool oom; 13 | uint64_t mem_cost; 14 | std::vector index; 15 | std::vector rpc_exe_list; 16 | double used_time = 0; 17 | 18 | SimulateResult(); 19 | 20 | SimulateResult(uint64_t end_time, bool oom, uint64_t mem_cost, std::vector &index); 21 | 22 | SimulateResult &operator=(const SimulateResult &other); 23 | }; 24 | 25 | SimulateResult simulate( 26 | std::vector &graph, std::unordered_map &cost_table, 27 | std::unordered_map &model_sizes, 28 | std::unordered_map &rpc_table, 29 | std::unordered_map> &rpc_exe_table, 30 | std::unordered_map> &ri_table, 31 | std::unordered_map> &model_name_ri_table, 32 | std::vector &sorted_rpc_names, std::vector &index); 33 | 34 | // Comparator for priority queue 35 | struct CompareEndTime { 36 | bool operator()(SimulateResult const &r1, SimulateResult const &r2) { 37 | // We want largest end_time at the top of the queue, so we reverse the comparison 38 | return r1.end_time < r2.end_time; 39 | } 40 | }; 41 | 42 | class MinEndTimeQueue { 43 | public: 44 | MinEndTimeQueue(int capacity) : k(capacity) {} 45 | 46 | void insert(SimulateResult r) { 47 | if (queue.size() < k) { 48 | // std::cout << "push " << "end_time: " << r.end_time << " qsize " << queue.size() << 49 | // std::endl; 50 | queue.push(r); 51 | } else if (r.end_time < queue.top().end_time) { 52 | // std::cout << "push " << "end_time: " << r.end_time << " qsize " << queue.size() << 53 | // std::endl; 54 | queue.pop(); 55 | queue.push(r); 56 | } 57 | } 58 | 59 | std::priority_queue, CompareEndTime> &getQueue() { 60 | return queue; 61 | } 62 | 63 | private: 64 | std::priority_queue, CompareEndTime> queue; 65 | int k; 66 | }; 67 | 68 | void mergeMinEndTimeQueues(MinEndTimeQueue &target, MinEndTimeQueue &q1); 69 | 70 | class CompareReadyTime { 71 | public: 72 | bool operator()(RPCInstance *r1, RPCInstance *r2) { return r1->ready_time > r2->ready_time; } 73 | }; 74 | 75 | #endif // SIMULATE_HPP -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | web: 4 | build: 5 | context: . 6 | dockerfile: Dockerfile 7 | target: docs 8 | args: 9 | REAL_CPU_BASE_IMAGE: ubuntu:22.04 10 | REAL_GPU_BASE_IMAGE: nvcr.io/nvidia/pytorch:23.10-py3 11 | ports: 12 | - "7780:80" -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/source/_static/custom.css: -------------------------------------------------------------------------------- 1 | table { 2 | width: 100%; 3 | border-collapse: collapse; 4 | } 5 | 6 | table th, table td { 7 | text-align: center; 8 | vertical-align: middle; 9 | } -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # For the full list of built-in configuration values, see the documentation: 4 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 5 | 6 | # -- Project information ----------------------------------------------------- 7 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. 
11 | import pathlib 12 | import sys 13 | 14 | sys.path.insert(0, (pathlib.Path(__file__).parents[2] / "realhf").resolve().as_posix()) 15 | 16 | project = "ReaL" 17 | copyright = "2024, Wei Fu & Zhiyu Mei" 18 | author = "Wei Fu & Zhiyu Mei" 19 | release = "0.3.0" 20 | 21 | # -- General configuration --------------------------------------------------- 22 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration 23 | 24 | extensions = [] 25 | 26 | templates_path = ["_templates"] 27 | exclude_patterns = [] 28 | 29 | # -- Options for HTML output ------------------------------------------------- 30 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output 31 | 32 | html_theme = "sphinx_nefertiti" 33 | html_static_path = ["_static"] 34 | 35 | extensions = [ 36 | "sphinx.ext.duration", 37 | "sphinx.ext.doctest", 38 | "sphinx.ext.autodoc", 39 | "sphinx.ext.autosummary", 40 | "sphinx.ext.githubpages", 41 | ] 42 | 43 | 44 | def setup(app): 45 | app.add_css_file("custom.css") 46 | -------------------------------------------------------------------------------- /docs/source/contributing.rst: -------------------------------------------------------------------------------- 1 | ############## 2 | Contributing 3 | ############## 4 | 5 | .. 6 | This repository is developed and maintained by `Wei Fu `_ 7 | 8 | .. 9 | and `Zhiyu Mei `_, both of whom are 10 | 11 | .. 12 | PhD students at `IIIS, Tsinghua University `_ 13 | 14 | .. 15 | advised by Professor `Yi Wu `_. 16 | 17 | .. 18 | We acknowledge that due to limited time and resources, 19 | 20 | .. 21 | the quality of the documentation and code in this repository is not very high. 22 | 23 | .. 24 | As a result, it can be quite challenging for potential developers to 25 | 26 | .. 27 | read the code and contribute new features. 28 | 29 | If you wish to contribute to this repository or have any questions about 30 | the code, please do not hesitate to raise issues or contact us directly. 31 | We will do our best to assist you. Currently, there is no template for 32 | issues or pull requests. 33 | 34 | We hope the open-source community can help improve this repository and 35 | enable RLHF technology to truly empower the applications of LLM. 36 | 37 | *************** 38 | Documentation 39 | *************** 40 | 41 | The source code is documented using Sphinx in the ``docs`` folder. On a 42 | node with docker-compose installed, run 43 | 44 | .. code:: bash 45 | 46 | make docs 47 | 48 | Then the documentation will be available at ``http://localhost:7780``. 49 | 50 | Every time the documentation files are changed, you should run the above 51 | command to update the documentation. 52 | 53 | The GitHub Pages will be updated automatically after the PR is merged. 54 | 55 | ************ 56 | Formatting 57 | ************ 58 | 59 | .. code:: bash 60 | 61 | # For .py files 62 | docformatter -i ${FILE} && isort ${FILE} && black -q ${FILE} 63 | # For C/C++ files 64 | clang-format -i ${FILE} 65 | # For documentation 66 | rstfmt docs 67 | 68 | ********* 69 | Testing 70 | ********* 71 | 72 | .. code:: bash 73 | 74 | # Run CPU tests 75 | pytest -m "not gpu" 76 | # Run CPU tests and GPU tests that require a single GPU 77 | pytest -m "not distributed" 78 | # On a node with multiple GPUs, run all tests 79 | pytest 80 | 81 | ************************ 82 | Building Docker Images 83 | ************************ 84 | 85 | .. 
code:: bash 86 | 87 | # Build the GPU image 88 | docker build -t real-gpu:24.03-0.3.0 -f Dockerfile --target gpu --build-arg REAL_GPU_BASE_IMAGE=nvcr.io/nvidia/pytorch:24.03-py3 --build-arg REAL_CPU_BASE_IMAGE=ubuntu:22.04 . 89 | # Build the CPU image 90 | docker build -t real-cpu:22.04-0.3.0 -f Dockerfile --target cpu --build-arg REAL_GPU_BASE_IMAGE=nvcr.io/nvidia/pytorch:24.03-py3 --build-arg REAL_CPU_BASE_IMAGE=ubuntu:22.04 . 91 | -------------------------------------------------------------------------------- /docs/source/expconfig.rst: -------------------------------------------------------------------------------- 1 | ################ 2 | Configurations 3 | ################ 4 | 5 | .. note:: 6 | 7 | This page serves as a reference manual for the configuration objects, 8 | i.e., you can check which attributes can be modified and their 9 | default values. You don't need to read through this page before 10 | running experiments! 11 | 12 | Please check the :doc:`quickstart` and :doc:`customization` sections 13 | for concrete examples of running experiments. 14 | 15 | We illustrate configurations for quickstart experiments in this page. 16 | Each type of experiment (e.g., SFT, PPO) corresponds to a specific 17 | configuration object (e.g., :class:`realhf.SFTConfig` for SFT). 18 | 19 | Since ReaL uses `Hydra `_ for configuration 20 | management, users can override these options provided by the class 21 | recursively with command line arguments. 22 | 23 | .. currentmodule:: realhf 24 | 25 | *************************** 26 | Experiment Configurations 27 | *************************** 28 | 29 | .. autoclass:: ExperimentSaveEvalControl 30 | 31 | .. autoclass:: CommonExperimentConfig 32 | 33 | .. autoclass:: SFTConfig 34 | 35 | .. autoclass:: RWConfig 36 | 37 | .. autoclass:: DPOConfig 38 | 39 | .. autoclass:: GenerationHyperparameters 40 | 41 | .. autoclass:: PPOHyperparameters 42 | 43 | .. autoclass:: PPOConfig 44 | 45 | .. autoclass:: GenerationConfig 46 | 47 | ********************** 48 | Model Configurations 49 | ********************** 50 | 51 | .. autoclass:: ModelFamily 52 | 53 | .. autoclass:: ModelTrainEvalConfig 54 | 55 | .. autoclass:: OptimizerConfig 56 | 57 | .. autoclass:: ParallelismConfig 58 | 59 | .. autoclass:: MFCConfig 60 | 61 | .. autoclass:: ReaLModelConfig 62 | 63 | ************************ 64 | Dataset Configurations 65 | ************************ 66 | 67 | .. autoclass:: PromptAnswerDatasetConfig 68 | 69 | .. autoclass:: PairedComparisonDatasetConfig 70 | 71 | .. autoclass:: PromptOnlyDatasetConfig 72 | 73 | ******************************************** 74 | Data Structure for Interfaces and Datasets 75 | ******************************************** 76 | 77 | .. autoclass:: realhf.SequenceSample 78 | :members: 79 | 80 | **************** 81 | Dataflow Graph 82 | **************** 83 | 84 | .. autoclass:: realhf.MFCDef 85 | 86 | ***************************** 87 | System-Level Configurations 88 | ***************************** 89 | 90 | .. note:: 91 | 92 | These configurations are not supposed to be modified by users. They 93 | are used to help understand the code architecture of ReaL. 94 | 95 | .. autoclass:: realhf.ModelShardID 96 | 97 | .. autoclass:: realhf.ModelName 98 | 99 | .. autoclass:: realhf.ModelVersion 100 | 101 | .. autoclass:: realhf.Model 102 | 103 | .. autoclass:: realhf.ModelBackend 104 | :members: 105 | :undoc-members: _initialize 106 | 107 | .. autoclass:: realhf.PipelinableEngine 108 | :members: 109 | :undoc-members: 110 | 111 | .. 
autoclass:: realhf.ModelInterface 112 | :members: 113 | :undoc-members: 114 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. 2 | ReaL documentation master file, created by 3 | sphinx-quickstart on Mon Jun 10 10:57:12 2024. 4 | You can adapt this file completely to your liking, but it should at least 5 | contain the root `toctree` directive. 6 | 7 | ################################## 8 | Welcome to ReaL's documentation! 9 | ################################## 10 | 11 | ***************** 12 | 🚀 Get Started 🚀 13 | ***************** 14 | 15 | For users new to ReaL, we recommend starting with the :doc:`quickstart` 16 | section to learn how to run simple experiments on a local node. If you 17 | have multiple nodes available, please read the :doc:`distributed` 18 | section to learn how to run experiments on a cluster. These tutorials 19 | cover the basic usage of the implemented algorithms in ReaL, including 20 | SFT, Reward Modeling, DPO, and PPO, and do not require understanding the 21 | code structure. 22 | 23 | For advanced users, we recommend proceeding to the :doc:`customization` 24 | section to learn how to customize the algorithms and models in ReaL. 25 | This requires an understanding of how an algorithm and its experiment 26 | configuration are defined in ReaL (i.e., as a dataflow graph), but 27 | understanding the system-wide implementation (e.g., model workers) is 28 | not mandatory. 29 | 30 | For potential developers, please refer to the :doc:`impl` and the 31 | :doc:`arch` sections for a deeper understanding of the system 32 | architecture. 33 | 34 | Besides these illustrations, we present the reference manual of various 35 | configuration objects in the :doc:`expconfig` section, and a brief 36 | overview of the system architecture in the :doc:`intro` section. 37 | 38 | ************** 39 | ⭐ Contents ⭐ 40 | ************** 41 | 42 | .. toctree:: 43 | :maxdepth: 3 44 | 45 | intro 46 | install 47 | expconfig 48 | quickstart 49 | distributed 50 | customization 51 | impl 52 | arch 53 | 54 | contributing 55 | -------------------------------------------------------------------------------- /docs/source/install.rst: -------------------------------------------------------------------------------- 1 | ############## 2 | Installation 3 | ############## 4 | 5 | *************** 6 | Docker Images 7 | *************** 8 | 9 | The easiest way to run ReaL is by using the provided Docker images. We 10 | offer a CPU-only image for launching experiments and a runtime GPU image 11 | for deployment in a cluster. The Dockerfile is also available in the 12 | repository. 13 | 14 | To pull the images, run: 15 | 16 | .. code:: console 17 | 18 | $ docker pull docker.io/garrett4wade/real-cpu:22.04-0.3.0 19 | $ docker pull docker.io/garrett4wade/real-gpu:24.03-py3-0.3.0 20 | 21 | The CPU image is built from "ubuntu:22.04" and the GPU image is built 22 | from "nvcr.io/nvidia/pytorch:24.03-py3". You can check the latest docker 23 | image version `here 24 | `_. 25 | 26 | After pulling the Docker images, run your Docker container locally on a 27 | GPU node with the following command: 28 | 29 | .. 
code:: console 30 | 31 | $ docker run -it --rm --gpus all --mount type=bind,src=/path/outside/container,dst=/realhf garrett4wade/real-gpu:24.03-py3-0.3.0 bash 32 | 33 | There is an editable installation at ``/realhf`` inside the container, 34 | so your changes to the code outside the container should automatically 35 | take effect. 36 | 37 | ***************************** 38 | Install From PyPI or Source 39 | ***************************** 40 | 41 | If you prefer not to use the provided Docker image, you can also start 42 | with an image provided by NVIDIA (e.g., 43 | ``nvcr.io/nvidia/pytorch:24.03-py3``) and install ReaL from PyPI or from 44 | the source. 45 | 46 | .. note:: 47 | 48 | We don't upload a pre-built wheel to PyPI, so the installation will 49 | require compiling the C++ and CUDA extensions. Control whether to 50 | install the extensions with environment variables ``REAL_CUDA`` and 51 | ``REAL_NO_EXT``. 52 | 53 | The CUDA extension will be installed only if ``REAL_CUDA`` is set to 54 | 1. No extension will be installed if ``REAL_NO_EXT`` is set to 1. 55 | 56 | If you don't want to compile the extensions, please use the provided 57 | Docker images. 58 | 59 | First, clone the repository and install all dependencies: 60 | 61 | .. code:: console 62 | 63 | $ pip install -U pip 64 | $ git clone https://github.com/openpsi-project/ReaLHF 65 | $ cd ReaLHF 66 | $ pip install -r requirements.txt 67 | 68 | On a GPU machine, also install the required runtime packages: 69 | 70 | .. code:: console 71 | 72 | $ export MAX_JOBS=8 # Set the number of parallel jobs for compilation. 73 | $ pip install git+https://github.com/NVIDIA/TransformerEngine.git@v1.8 --no-deps --no-build-isolation 74 | $ pip install flash_attn==2.4.2 --no-build-isolation 75 | $ pip3 install git+https://github.com/tgale96/grouped_gemm.git@v0.1.4 --no-build-isolation --no-deps # For MoE 76 | 77 | .. note:: 78 | 79 | ``MAX_JOBS`` sets the number of parallel jobs for compilation. A 80 | larger value will consume more memory (and potentially cause 81 | out-of-memory errors) and CPU resources. Adjust the value according to your 82 | machine's specifications. 83 | 84 | Install ReaLHF from source (recommended, for the latest build): 85 | 86 | .. code:: console 87 | 88 | $ git clone https://github.com/openpsi-project/ReaLHF 89 | $ cd ReaLHF 90 | $ REAL_CUDA=1 pip install -e . --no-build-isolation 91 | 92 | Or install from PyPI (for stable build): 93 | 94 | .. code:: console 95 | 96 | $ REAL_CUDA=1 pip install realhf --no-build-isolation 97 | 98 | The PyPI package allows you to launch existing experiments with the 99 | quickstart command. If you want to modify the code, you must clone the 100 | source code and install it from source. 101 | 102 | Next, check :doc:`quickstart` for instructions on running experiments. 103 | -------------------------------------------------------------------------------- /docs/source/intro.rst: -------------------------------------------------------------------------------- 1 | ############## 2 | Introduction 3 | ############## 4 | 5 | ********************************* 6 | Limitations of Existing Systems 7 | ********************************* 8 | 9 | We observe two major limitations based on our profiling of the previous 10 | RLHF systems, as shown in the :ref:`timeline`. 11 | 12 | .. _timeline: 13 | 14 | .. figure:: images/timeline.svg 15 | :alt: timeline 16 | 17 | Timeline Figure 18 | 19 | Execution timelines of ReaL and existing systems based on profiling.
20 | 21 | First, when models are distributed to every GPU node that applies the 22 | same parallelization strategy, such as in `DeepSpeed-Chat 23 | `_, 24 | it is often over-parallelized. Over-parallelization leads to substantial 25 | synchronization and communication overhead (the light purple bars). 26 | 27 | An alternative way is to assign different models to different GPU nodes, 28 | where models can execute concurrently, such as `OpenRLHF 29 | `_. However, our second 30 | observation is that such asymmetric parallelization often causes 31 | under-utilization of the GPUs (e.g., the gray areas) because of the 32 | dependencies between tasks. 33 | 34 | The key idea of ReaL is to enable dynamic **reallocation of model 35 | parameters** between GPUs to improve the efficiency of the entire RLHF 36 | training process. 37 | 38 | By first choosing a parallelization strategy tailored for each 39 | computation workload (e.g., pipelining for Generation and tensor 40 | parallelism for Training) and then executing these calls concurrently 41 | with a smaller parallelization degree (e.g., Actor and Critic in 42 | Training), we can eliminate redundant communication while maximizing GPU 43 | utilization, effectively addressing the limitations of prior solutions. 44 | 45 | ************************ 46 | Performance Comparison 47 | ************************ 48 | 49 | We show throughput comparison with the state-of-the-art open-source 50 | systems in the following figure. 51 | 52 | (In the following figure, as the number of GPUs increases, the model 53 | size scales up from LLaMA 7B, LLaMA 13B, and CodeLLaMA 34B, to the 54 | largest LLaMA 70B.) 55 | 56 | .. image:: images/vws.svg 57 | 58 | .. _est_time_table: 59 | 60 | +--------------+---------------+---------------+---------------+ 61 | | System | DeepSpeedChat | OpenRLHF | ReaL | 62 | +==============+===============+===============+===============+ 63 | | Time (hours) | 141.5 | 152.8 | **17.0** | 64 | +--------------+---------------+---------------+---------------+ 65 | 66 | We also show the estimated time for completing the entire full-scale 67 | 4*70B RLHF training process, composed of 4 iterations with 400 steps for 68 | each iteration as for LLaMA-2. 69 | 70 | .. 71 | "Scale Actor" maintains the sizes 72 | 73 | .. 74 | of Critic and Reward at 7B while increasing the sizes of Actor and Reference with the number of GPUs. 75 | 76 | .. 77 | "Scale Critic" follows the opposite approach, and 78 | 79 | .. 80 | "Scale Both" increases sizes of all models proportionately. 
81 | -------------------------------------------------------------------------------- /examples/cluster_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "cluster_type": "slurm", 3 | "cluster_name": "my_cluster", 4 | "fileroot": "/path/to/my/file/system", 5 | "default_mount": "/path/to/my/file/system:/path/to/my/file/system,/dev/infiniband:/dev/infiniband,/sys/class/infiniband_verbs:/sys/class/infiniband_verbs", 6 | "node_type_from_node_name": { 7 | "NODE\\d{2}$": "a100" 8 | }, 9 | "gpu_type_from_node_name": { 10 | "NODE\\d{2}$": "tesla" 11 | }, 12 | "cpu_image": "garrett4wade/real-cpu", 13 | "gpu_image": "garrett4wade/real-gpu", 14 | "node_name_prefix": "NODE" 15 | } -------------------------------------------------------------------------------- /examples/customized_exp/ppo_ref_ema.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import dataclasses 3 | import math 4 | import pprint 5 | from typing import * 6 | 7 | from realhf.api.core.dfg import ParamReallocHook 8 | from realhf.api.core.system_api import ExperimentConfig 9 | from realhf.api.quickstart.entrypoint import register_quickstart_exp 10 | from realhf.apps.quickstart import main 11 | from realhf.experiments.common.ppo_exp import PPOConfig 12 | from realhf.experiments.common.utils import resolve_replica_ids, resolve_rpc_hooks 13 | 14 | 15 | @dataclasses.dataclass 16 | class PPORefEMAConfig(PPOConfig): 17 | ref_ema_eta: float = 0.001 18 | 19 | def initial_setup(self) -> ExperimentConfig: 20 | rpc_allocs = self._get_rpc_allocations() 21 | 22 | resolve_replica_ids(rpc_allocs) 23 | resolve_rpc_hooks( 24 | rpc_allocs, self.models 25 | ) # inplace modify MFCDefs in rpc allocations 26 | 27 | pprint.pprint(rpc_allocs) 28 | 29 | ######### The main difference from normal PPO ######### 30 | def _find_rpc(name): 31 | return next(alloc.rpc for alloc in rpc_allocs if alloc.rpc.name == name) 32 | 33 | # Remove the offload hook of ref_inf, because 34 | # we need to receive parameters from peer GPUs and update it immediately. 35 | ref_inf = _find_rpc("ref_inf") 36 | ref_inf._post_hooks = [] 37 | 38 | # Add an unidirectional parameter reallocation hook. 
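        # Conceptually (not shown in this file), the hook asks ReaL to push the
        # freshly updated actor parameters to the reference model after every
        # `actor_train` step. With a non-null `eta`, the receiving side blends the
        # incoming weights rather than overwriting them, roughly
        #     ref_param <- (1 - eta) * ref_param + eta * actor_param,
        # so the reference model tracks an exponential moving average of the actor.
        # The exact update lives in ReaL's parameter-reallocation code; this is only
        # an informal sketch of its effect.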
39 | actor_train = _find_rpc("actor_train") 40 | actor_train.add_post_hook( 41 | ParamReallocHook( 42 | target=ref_inf.model_name, 43 | eta=self.ref_ema_eta, 44 | ) 45 | ) 46 | ######### The main difference from normal PPO ######### 47 | 48 | model_worker = self._get_model_worker_configs(rpc_allocs) 49 | 50 | return ExperimentConfig( 51 | exp_ctrl=self.exp_ctrl, 52 | model_rpcs=[rpc_alloc.rpc for rpc_alloc in rpc_allocs], 53 | model_worker=model_worker, 54 | ) 55 | 56 | 57 | register_quickstart_exp("ppo-ref-ema", PPORefEMAConfig) 58 | 59 | if __name__ == "__main__": 60 | main() 61 | -------------------------------------------------------------------------------- /examples/customized_exp/scripts/run_ppo_ref_ema.sh: -------------------------------------------------------------------------------- 1 | MODEL_FAMILY=gpt2 2 | 3 | SFT_MODEL_PATH=/lustre/aigc/llm/checkpoints/fw/quickstart-sft/$MODEL_FAMILY/default/epoch7epochstep5globalstep50/ 4 | RW_MODEL_PATH=/lustre/aigc/llm/checkpoints/fw/quickstart-rw/$MODEL_FAMILY/default/epoch1epochstep15globalstep15/ 5 | 6 | MODE=local 7 | EXP_NAME=quickstart-ppo 8 | TRIAL_NAME=$MODEL_FAMILY-$MODE-manual 9 | 10 | unset CLUSTER_SPEC_PATH 11 | python3 examples/customized_exp/ppo_ref_ema.py ppo-ref-ema \ 12 | mode=$MODE \ 13 | experiment_name=$EXP_NAME \ 14 | trial_name=$TRIAL_NAME \ 15 | exp_ctrl.total_train_epochs=1 \ 16 | exp_ctrl.save_freq_steps=null \ 17 | actor.type._class=$MODEL_FAMILY \ 18 | actor.path=$SFT_MODEL_PATH \ 19 | actor.optimizer.lr_scheduler_type=constant \ 20 | actor.optimizer.lr=1e-4 \ 21 | actor.optimizer.warmup_steps_proportion=0.0 \ 22 | critic.type._class=$MODEL_FAMILY \ 23 | critic.type.is_critic=True \ 24 | critic.path=$RW_MODEL_PATH \ 25 | ref.type._class=$MODEL_FAMILY \ 26 | ref.path=$SFT_MODEL_PATH \ 27 | rew.type._class=$MODEL_FAMILY \ 28 | rew.type.is_critic=True \ 29 | rew.path=$RW_MODEL_PATH \ 30 | dataset.path=.data/ppo_prompt.jsonl \ 31 | dataset.max_prompt_len=128 \ 32 | dataset.train_bs_n_seqs=128 \ 33 | ppo.gen.max_new_tokens=512 \ 34 | ppo.gen.min_new_tokens=512 \ 35 | ppo.gen.top_p=0.9 ppo.gen.top_k=1000 \ 36 | ppo.ppo_n_minibatches=4 \ 37 | ppo.kl_ctl=0.1 \ 38 | ppo.value_eps_clip=0.2 \ 39 | ppo.reward_output_scaling=10.0 \ 40 | ppo.adv_norm=True ppo.value_norm=True \ 41 | allocation_mode=manual \ 42 | n_nodes=1 \ 43 | nodelist=\'NODE01\' \ 44 | actor_train.device_mesh=\'NODE01:0,1,2,3\' \ 45 | actor_train.parallel.data_parallel_size=2 \ 46 | actor_train.parallel.model_parallel_size=1 \ 47 | actor_train.parallel.pipeline_parallel_size=2 \ 48 | actor_gen.device_mesh=\'NODE01:0,1,2,3,4,5,6,7\' \ 49 | actor_gen.parallel.data_parallel_size=4 \ 50 | actor_gen.parallel.model_parallel_size=1 \ 51 | actor_gen.parallel.pipeline_parallel_size=2 \ 52 | critic_train.device_mesh=\'NODE01:4,5,6,7\' \ 53 | critic_train.parallel.data_parallel_size=2 \ 54 | critic_train.parallel.model_parallel_size=1 \ 55 | critic_train.parallel.pipeline_parallel_size=2 \ 56 | critic_inf.device_mesh=\'NODE01:0,1\' \ 57 | critic_inf.parallel.data_parallel_size=2 \ 58 | critic_inf.parallel.model_parallel_size=1 \ 59 | critic_inf.parallel.pipeline_parallel_size=1 \ 60 | rew_inf.device_mesh=\'NODE01:2,3\' \ 61 | rew_inf.parallel.data_parallel_size=1 \ 62 | rew_inf.parallel.model_parallel_size=1 \ 63 | rew_inf.parallel.pipeline_parallel_size=2 \ 64 | ref_inf.device_mesh=\'NODE01:4,5,6,7\' \ 65 | ref_inf.parallel.data_parallel_size=1 \ 66 | ref_inf.parallel.model_parallel_size=1 \ 67 | ref_inf.parallel.pipeline_parallel_size=4 68 | 
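# Note: `ref_ema_eta` (default 0.001) is the extra field defined by PPORefEMAConfig.
# Like the other fields above, it should be overridable from the command line by
# adding another `ref_ema_eta=...` override to this command; this script simply
# relies on the default value.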
-------------------------------------------------------------------------------- /examples/customized_exp/scripts/run_ppo_sentiment.sh: -------------------------------------------------------------------------------- 1 | MODEL_FAMILY=gpt2 2 | 3 | SFT_MODEL_PATH=/lustre/aigc/llm/checkpoints/fw/quickstart-sft/$MODEL_FAMILY/default/epoch7epochstep5globalstep50/ 4 | RW_MODEL_PATH=/lustre/aigc/llm/checkpoints/fw/quickstart-rw/$MODEL_FAMILY/default/epoch1epochstep15globalstep15/ 5 | 6 | MODE=local 7 | EXP_NAME=quickstart-ppo 8 | TRIAL_NAME=$MODEL_FAMILY-$MODE-manual 9 | 10 | unset CLUSTER_SPEC_PATH 11 | python3 examples/customized_exp/ppo_sentiment.py my-ppo \ 12 | mode=$MODE \ 13 | experiment_name=$EXP_NAME \ 14 | trial_name=$TRIAL_NAME \ 15 | exp_ctrl.total_train_epochs=1 \ 16 | exp_ctrl.save_freq_steps=null \ 17 | actor.type._class=$MODEL_FAMILY \ 18 | actor.path=$SFT_MODEL_PATH \ 19 | actor.optimizer.lr_scheduler_type=constant \ 20 | actor.optimizer.lr=1e-4 \ 21 | actor.optimizer.warmup_steps_proportion=0.0 \ 22 | critic.type._class=$MODEL_FAMILY \ 23 | critic.type.is_critic=True \ 24 | critic.path=$RW_MODEL_PATH \ 25 | ref.type._class=$MODEL_FAMILY \ 26 | ref.path=$SFT_MODEL_PATH \ 27 | rew.type._class=$MODEL_FAMILY \ 28 | rew.type.is_critic=True \ 29 | rew.path=$RW_MODEL_PATH \ 30 | dataset.path=.data/ppo_prompt.jsonl \ 31 | dataset.max_prompt_len=128 \ 32 | dataset.train_bs_n_seqs=128 \ 33 | ppo.gen.max_new_tokens=512 \ 34 | ppo.gen.min_new_tokens=512 \ 35 | ppo.gen.top_p=0.9 ppo.gen.top_k=1000 \ 36 | ppo.ppo_n_minibatches=4 \ 37 | ppo.kl_ctl=0.1 \ 38 | ppo.value_eps_clip=0.2 \ 39 | ppo.reward_output_scaling=10.0 \ 40 | ppo.adv_norm=True ppo.value_norm=True \ 41 | allocation_mode=manual \ 42 | n_nodes=1 \ 43 | nodelist=\'NODE01\' \ 44 | actor_train.device_mesh=\'NODE01:0,1,2,3\' \ 45 | actor_train.parallel.data_parallel_size=2 \ 46 | actor_train.parallel.model_parallel_size=1 \ 47 | actor_train.parallel.pipeline_parallel_size=2 \ 48 | actor_gen.device_mesh=\'NODE01:0,1,2,3,4,5,6,7\' \ 49 | actor_gen.parallel.data_parallel_size=4 \ 50 | actor_gen.parallel.model_parallel_size=1 \ 51 | actor_gen.parallel.pipeline_parallel_size=2 \ 52 | critic_train.device_mesh=\'NODE01:4,5,6,7\' \ 53 | critic_train.parallel.data_parallel_size=2 \ 54 | critic_train.parallel.model_parallel_size=1 \ 55 | critic_train.parallel.pipeline_parallel_size=2 \ 56 | critic_inf.device_mesh=\'NODE01:0,1\' \ 57 | critic_inf.parallel.data_parallel_size=2 \ 58 | critic_inf.parallel.model_parallel_size=1 \ 59 | critic_inf.parallel.pipeline_parallel_size=1 \ 60 | rew_inf.device_mesh=\'NODE01:2,3\' \ 61 | rew_inf.parallel.data_parallel_size=2 \ 62 | rew_inf.parallel.model_parallel_size=1 \ 63 | rew_inf.parallel.pipeline_parallel_size=1 \ 64 | ref_inf.device_mesh=\'NODE01:4,5,6,7\' \ 65 | ref_inf.parallel.data_parallel_size=1 \ 66 | ref_inf.parallel.model_parallel_size=1 \ 67 | ref_inf.parallel.pipeline_parallel_size=4 68 | -------------------------------------------------------------------------------- /examples/load_and_eval_rw.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.distributed as dist 3 | import transformers 4 | 5 | from realhf.api.core.config import ModelName 6 | from realhf.api.core.model_api import ReaLModelConfig 7 | from realhf.base import constants 8 | from realhf.base.testing import init_global_constants 9 | 10 | 11 | def load_and_use_single_process(path: str, model_family_name: str): 12 | # Initialize distributed environment. 
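    # ReaL's model implementation relies on a torch.distributed process group and
    # on ReaL's global parallelism constants, so even this single-process example
    # first creates a one-rank NCCL group and sets all parallel degrees
    # (data/model/pipeline) to 1 below.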
13 | dist.init_process_group( 14 | "nccl", rank=0, world_size=1, init_method="tcp://localhost:7777" 15 | ) 16 | model_name = ModelName("default", 0) 17 | init_global_constants( 18 | num_dp=1, 19 | num_mp=1, 20 | num_pp=1, 21 | sequence_parallel=False, 22 | model_name=model_name, 23 | ) 24 | 25 | # NOTE: import here to avoid CUDA re-initialization 26 | from realhf.impl.model.nn.real_llm_api import ReaLModel, add_helper_functions 27 | 28 | # Call a method like `config_from_llama` to get the config. 29 | mconfig: ReaLModelConfig = getattr(ReaLModel, f"config_from_{model_family_name}")( 30 | transformers.AutoConfig.from_pretrained(path) 31 | ) 32 | # IMPORTANT: Set the critic flag to True. 33 | # Since the output head and the token embedding no longer have the same shape, 34 | # we set tied_embedding to False. 35 | mconfig.is_critic = True 36 | mconfig.tied_embedding = False 37 | 38 | with constants.model_scope(model_name): 39 | # Construct the model. 40 | model = ReaLModel(mconfig, dtype=torch.float16, device="cuda") 41 | model.instantiate() 42 | 43 | # Load the reward checkpoint 44 | # Since the checkpoint is already a critic model, we set 45 | # init_critic_from_actor to False. 46 | model = getattr(model, f"from_{model_family_name}")( 47 | path, init_critic_from_actor=False 48 | ) 49 | # Add helper functions to make the model behave like HuggingFace models. 50 | add_helper_functions(model) 51 | 52 | # Use the model. 53 | bs = 10 54 | seqlen = 256 55 | input_ids = torch.randint( 56 | 0, mconfig.vocab_size, (bs, seqlen), dtype=torch.long, device="cuda" 57 | ) 58 | attention_mask = torch.ones_like(input_ids, dtype=torch.bool) 59 | 60 | # The final dimension of the output scores is 1. 61 | scores = model(input_ids, attention_mask).logits 62 | assert scores.shape == (bs, seqlen, 1), scores.shape 63 | 64 | 65 | if __name__ == "__main__": 66 | path = "/lustre/aigc/llm/checkpoints/fw/quickstart-rw/llama-ray-manual/default/epoch1epochstep10globalstep10/" 67 | model_family_name = "llama" 68 | load_and_use_single_process(path, model_family_name) 69 | -------------------------------------------------------------------------------- /examples/new_algorithms/grpo/grpo.sh: -------------------------------------------------------------------------------- 1 | MODEL_FAMILY=llama 2 | 3 | SFT_MODEL_PATH=/lustre/aigc/llm/checkpoints/fw/quickstart-sft/$MODEL_FAMILY-local-manual/default/epoch7epochstep5globalstep50/ 4 | RW_MODEL_PATH=/lustre/aigc/llm/checkpoints/fw/quickstart-rw/$MODEL_FAMILY-ray-manual/default/epoch1epochstep10globalstep10/ 5 | 6 | MODE=local 7 | 8 | EXP_NAME=quickstart-grpo 9 | TRIAL_NAME=$MODEL_FAMILY-$MODE-manual 10 | 11 | python3 examples/new_algorithms/grpo/grpo_exp.py grpo \ 12 | mode=$MODE \ 13 | experiment_name=$EXP_NAME \ 14 | trial_name=$TRIAL_NAME \ 15 | exp_ctrl.total_train_epochs=8 \ 16 | exp_ctrl.save_freq_steps=null \ 17 | actor.type._class=$MODEL_FAMILY \ 18 | actor.path=$SFT_MODEL_PATH \ 19 | actor.optimizer.lr=1e-4 \ 20 | actor.optimizer.lr_scheduler_type=constant \ 21 | rew.type._class=$MODEL_FAMILY \ 22 | rew.type.is_critic=True \ 23 | rew.path=$RW_MODEL_PATH \ 24 | ref.type._class=$MODEL_FAMILY \ 25 | ref.path=$SFT_MODEL_PATH \ 26 | dataset.path=.data/ppo_prompt.jsonl \ 27 | dataset.max_prompt_len=128 \ 28 | dataset.train_bs_n_seqs=32 \ 29 | allocation_mode=heuristic \ 30 | n_nodes=1 \ 31 | ppo.gen.max_new_tokens=512 \ 32 | ppo.gen.min_new_tokens=512 \ 33 | ppo.gen.use_cuda_graph=True \ 34 | ppo.gen.top_p=0.9 ppo.gen.top_k=1000 \ 35 | ppo.ppo_n_minibatches=4 \ 36 |
ppo.reward_output_scaling=1.0 ppo.adv_norm=False 37 | -------------------------------------------------------------------------------- /examples/new_algorithms/reinforce/reinforce.sh: -------------------------------------------------------------------------------- 1 | # MODEL_FAMILY specifies how the pretrained checkpoint is loaded, e.g., as a LLaMA model or a GPT model. 2 | # You can specify different model families for the SFT and the RW model, but you need to 3 | # re-tokenize the sequences if necessary. 4 | MODEL_FAMILY=gpt2 5 | 6 | # SFT_MODEL_PATH and RW_MODEL_PATH are the saved SFT and RW checkpoints. 7 | # ReaL saves checkpoints with the same format as HuggingFace, 8 | # so you don't need to convert or split checkpoints explicitly. 9 | # You can also directly use the pre-trained HuggingFace checkpoint, but this 10 | # will not ensure the optimal algorithm performance. 11 | SFT_MODEL_PATH=/lustre/aigc/llm/checkpoints/fw/quickstart-sft/$MODEL_FAMILY/default/epoch7epochstep5globalstep50/ 12 | RW_MODEL_PATH=/lustre/aigc/llm/checkpoints/fw/quickstart-rw/$MODEL_FAMILY/default/epoch1epochstep15globalstep15/ 13 | 14 | # Option 1: The experiment runs locally with subprocesses. 15 | MODE=local 16 | # Option 2: The experiment runs in a Ray cluster 17 | # MODE=ray 18 | # Option 3: The experiment runs in a SLURM + pyxis cluster 19 | # Using the slurm mode requires a cluster spec file 20 | # and setting CLUSTER_SPEC_PATH to the path of it. 21 | # MODE=slurm 22 | 23 | # `experiment_name` and `trial_name` can be arbitrary. 24 | # Logs and saved checkpoints will be indexed by them. 25 | EXP_NAME=quickstart-reinforce 26 | TRIAL_NAME=$MODEL_FAMILY-$MODE-manual 27 | 28 | # When using the "manual" allocation mode, the user should specify the device allocation 29 | # and parallel strategies for each model function call. 30 | # The number of GPUs is `n_nodes` * `n_gpus_per_node` (not set explicitly here, defaults to 8). 31 | # We provide a template in the following command and the user can modify it according to 32 | # the specific model and the available GPUs. 33 | 34 | # The following command shows an example of manual allocation on a single node, 35 | # but it can be modified according to the specific model and the available GPUs.
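# As a sanity check when editing the template: for each model function call below
# (actor_train, sample_gen, sample_rew_inf, greedy_gen, greedy_rew_inf), the product
# data_parallel_size * model_parallel_size * pipeline_parallel_size should match the
# number of GPUs in its device_mesh, e.g. sample_gen uses NODE01:0,1,2,3 (4 GPUs)
# with 2 * 1 * 2 = 4.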
36 | unset CLUSTER_SPEC_PATH 37 | python3 examples/new_algorithms/reinforce/reinforce_exp.py reinforce \ 38 | mode=$MODE \ 39 | experiment_name=$EXP_NAME \ 40 | trial_name=$TRIAL_NAME \ 41 | exp_ctrl.total_train_epochs=8 \ 42 | exp_ctrl.save_freq_steps=null \ 43 | actor.type._class=$MODEL_FAMILY \ 44 | actor.path=$SFT_MODEL_PATH \ 45 | actor.optimizer.lr=1e-4 \ 46 | actor.optimizer.lr_scheduler_type=constant \ 47 | rew.type._class=$MODEL_FAMILY \ 48 | rew.type.is_critic=True \ 49 | rew.path=$RW_MODEL_PATH \ 50 | dataset.path=.data/ppo_prompt.jsonl \ 51 | dataset.max_prompt_len=128 \ 52 | dataset.train_bs_n_seqs=512 \ 53 | gen.max_new_tokens=512 \ 54 | gen.min_new_tokens=512 \ 55 | gen.use_cuda_graph=True \ 56 | gen.top_p=0.9 gen.top_k=5000 \ 57 | allocation_mode=manual \ 58 | n_nodes=1 \ 59 | nodelist=\'NODE01\' \ 60 | actor_train.device_mesh=\'NODE01:0,1,2,3,4,5,6,7\' \ 61 | actor_train.parallel.data_parallel_size=4 \ 62 | actor_train.parallel.model_parallel_size=1 \ 63 | actor_train.parallel.pipeline_parallel_size=2 \ 64 | sample_gen.device_mesh=\'NODE01:0,1,2,3\' \ 65 | sample_gen.parallel.data_parallel_size=2 \ 66 | sample_gen.parallel.model_parallel_size=1 \ 67 | sample_gen.parallel.pipeline_parallel_size=2 \ 68 | sample_rew_inf.device_mesh=\'NODE01:0,1,2,3\' \ 69 | sample_rew_inf.parallel.data_parallel_size=4 \ 70 | sample_rew_inf.parallel.model_parallel_size=1 \ 71 | sample_rew_inf.parallel.pipeline_parallel_size=1 \ 72 | greedy_gen.device_mesh=\'NODE01:4,5,6,7\' \ 73 | greedy_gen.parallel.data_parallel_size=2 \ 74 | greedy_gen.parallel.model_parallel_size=1 \ 75 | greedy_gen.parallel.pipeline_parallel_size=2 \ 76 | greedy_rew_inf.device_mesh=\'NODE01:4,5,6,7\' \ 77 | greedy_rew_inf.parallel.data_parallel_size=4 \ 78 | greedy_rew_inf.parallel.model_parallel_size=1 \ 79 | greedy_rew_inf.parallel.pipeline_parallel_size=1 80 | -------------------------------------------------------------------------------- /examples/profiling/allocations.jsonl: -------------------------------------------------------------------------------- 1 | {"data_parallel_size": 2, "model_parallel_size": 4, "pipeline_parallel_size": 1, "use_sequence_parallel": true} -------------------------------------------------------------------------------- /examples/profiling/datasets.jsonl: -------------------------------------------------------------------------------- 1 | {"type_": "prompt", "args": {"max_length": 1024, "pad_to_max_length": true, "dataset_path": "/lustre/fw/datasets/imdb/rl/ppo_prompt.jsonl"}} -------------------------------------------------------------------------------- /examples/profiling/interfaces.jsonl: -------------------------------------------------------------------------------- 1 | {"type_": "ppo_actor", "args": {"generation_config": {"max_new_tokens": 1024,"min_new_tokens": 1024,"use_cuda_graph": true,"force_no_logits_mask": true,"force_cudagraph_recapture": true,"top_p": 1.0,"top_k": 1000000},"enable_save": false,"n_minibatches": 8}} 2 | -------------------------------------------------------------------------------- /examples/profiling/models.jsonl: -------------------------------------------------------------------------------- 1 | {"type": {"_class": "llama"}, "path": "/lustre/public/pretrained_model_weights/Llama-2-7b-hf"} -------------------------------------------------------------------------------- /examples/profiling/profile.sh: -------------------------------------------------------------------------------- 1 | # The model to profile and its path. 
2 | MODEL_FAMILY=llama 3 | SFT_MODEL_PATH=/lustre/public/pretrained_model_weights/Llama-2-7b-hf 4 | 5 | EXP_NAME=profile-example 6 | TRIAL_NAME=test 7 | 8 | export CLUSTER_SPEC_PATH="/lustre/aigc/llm/cluster/qh.json" 9 | 10 | # Setting REAL_DUMP_TRACE=1 enables execution trace provided by PyTorch. 11 | 12 | # Setting REAL_DUMP_MEMORY=1 enables memory profiling provided by PyTorch. 13 | 14 | # The dataset content doesn't matter, as long as it is a prompt-only dataset. 15 | # Each entry in the dataset should contain two keys "id" and "prompt". 16 | # By default we pad the prompt to the maximum length in the batch for accurate system-wise benchmark. 17 | # The loaded data will be processed by the "_mock_${handle_name}" method in the interface 18 | # to create mock data suited for the exact interface handle. 19 | 20 | # "handle_name" can be "inference", "generate", or "train_step", 21 | # and the "interface_impl" specifies which registered interface implementation to run. 22 | # "interface_kwargs_json" is a JSON configuration of the interface. 23 | 24 | # "allocations_jsonl" is a JSONL file that specifies the parallel strategies to profile. 25 | # If not specified, all parallel strategies under the given world size will be profiled. 26 | 27 | # "n_mbs" specifies the number of micro-batches to profile. 28 | 29 | # The total number of runs will be the product of the number of micro-batches and the number of parallel strategies, 30 | # all within the same experiment_name and trial_name. Instead of re-launching the whole experiment, workers will 31 | # be paused and reconfigured to run the next experiment setup. 32 | 33 | REAL_DUMP_TRACE=1 REAL_DUMP_MEMORY=1 \ 34 | python3 -m realhf.apps.quickstart profile \ 35 | mode=local \ 36 | experiment_name=$EXP_NAME \ 37 | trial_name=$TRIAL_NAME \ 38 | exp_ctrl.benchmark_steps=3 \ 39 | exp_ctrl.save_freq_steps=null \ 40 | exp_ctrl.eval_freq_steps=null \ 41 | n_nodes=1 \ 42 | 'handle_names=[train_step]' \ 43 | interfaces_jsonl=./examples/profiling/interfaces.jsonl \ 44 | models_jsonl=./examples/profiling/models.jsonl \ 45 | datasets_jsonl=./examples/profiling/datasets.jsonl \ 46 | allocations_jsonl=./examples/profiling/allocations.jsonl \ 47 | 'n_mbs=[1, 2, 4]' \ 48 | 'batch_sizes=[512]' 49 | -------------------------------------------------------------------------------- /examples/scripts/distributed_ray/dpo.sh: -------------------------------------------------------------------------------- 1 | # MODEL_FAMILY specifies how the pretrained checkpoint is loaded, e.g., as a LLaMA model or a GPT model. 2 | MODEL_FAMILY=gpt2 3 | 4 | # PRETRAINED_PATH is the HuggingFace checkpoint or the saved SFT checkpoint. 5 | # The latter is the common practice. 6 | # ReaL saves checkpoints with the same format as HuggingFace, 7 | # so you don't need to convert or split checkpoints explicitly. 8 | PRETRAINED_PATH=/lustre/aigc/llm/checkpoints/fw/quickstart-sft/$MODEL_FAMILY/default/epoch7epochstep5globalstep50/ 9 | 10 | # Option 1: The experiment runs locally with subprocesses. 11 | # MODE=local 12 | # Option 2: The experiment runs in a Ray cluster 13 | MODE=ray 14 | # Option 3: The experiment runs in a SLURM + pyxis cluster 15 | # Using the slurm mode requires a cluster spec file 16 | # and setting CLUSTER_SPEC_PATH to the path of it. 17 | # MODE=slurm 18 | 19 | # `experiment_name` and `trial_name` can be arbitrary. 20 | # Logs and saved checkpoints will be indexed by them. 
21 | EXP_NAME=quickstart-dpo 22 | TRIAL_NAME=$MODEL_FAMILY-$MODE-manual 23 | 24 | # We use the "manual" allocation mode here to manually specify the parallelism strategy of training 25 | # and inference. The parallel strategy for training prefers tensor-model parallelism while the 26 | # inference prefers pipeline parallelism, which are more efficient for their corresponding workloads. 27 | 28 | # The `dpo` subcommand specifies that this is a DPO experiment. 29 | # The `save_freq_steps` is set to `null` to disable saving checkpoints. 30 | # Enable it if you want to save checkpoints. 31 | python3 -m realhf.apps.quickstart dpo \ 32 | mode=$MODE \ 33 | experiment_name=$EXP_NAME \ 34 | trial_name=$TRIAL_NAME \ 35 | exp_ctrl.total_train_epochs=2 \ 36 | exp_ctrl.save_freq_steps=null \ 37 | n_nodes=2 \ 38 | allocation_mode=manual \ 39 | actor.type._class=$MODEL_FAMILY \ 40 | actor.path=$PRETRAINED_PATH \ 41 | actor_train.parallel.pipeline_parallel_size=4 \ 42 | actor_train.parallel.model_parallel_size=1 \ 43 | actor_train.parallel.data_parallel_size=4 \ 44 | actor_train.parallel.use_sequence_parallel=True \ 45 | ref.type._class=$MODEL_FAMILY \ 46 | ref.path=$PRETRAINED_PATH \ 47 | ref_inf.parallel.pipeline_parallel_size=4 \ 48 | ref_inf.parallel.model_parallel_size=1 \ 49 | ref_inf.parallel.data_parallel_size=4 \ 50 | ref_inf.parallel.use_sequence_parallel=True \ 51 | dataset.train_path=.data/rm_paired-train.jsonl \ 52 | dataset.max_pairs_per_prompt=2 \ 53 | dataset.max_seqlen=1024 \ 54 | dataset.train_bs_n_seqs=512 \ 55 | dataset.valid_bs_n_seqs=512 -------------------------------------------------------------------------------- /examples/scripts/distributed_ray/ppo.sh: -------------------------------------------------------------------------------- 1 | # MODEL_FAMILY specifies how the pretrained checkpoint is loaded, e.g., as a LLaMA model or a GPT model. 2 | # You can specify different model families for the SFT and the RW model, but you need to 3 | # re-tokenize the sequences if necessary. 4 | MODEL_FAMILY=llama 5 | 6 | # SFT_MODEL_PATH and RW_MODEL_PATH are the saved SFT and RW checkpoints. 7 | # ReaL saves checkpoints with the same format as HuggingFace, 8 | # so you don't need to convert or split checkpoints explicitly. 9 | # You can also directly use the pre-trained HuggingFace checkpoint, but this 10 | # will not ensure the optimal algorithm performance. 11 | SFT_MODEL_PATH=/lustre/aigc/llm/checkpoints/fw/quickstart-sft/$MODEL_FAMILY-local-manual/default/epoch7epochstep5globalstep50/ 12 | RW_MODEL_PATH=/lustre/aigc/llm/checkpoints/fw/quickstart-rw/$MODEL_FAMILY-ray-manual/default/epoch1epochstep10globalstep10/ 13 | 14 | # Option 1: The experiment runs locally with subprocesses. 15 | # MODE=local 16 | # Option 2: The experiment runs in a Ray cluster 17 | MODE=ray 18 | # Option 3: The experiment runs in a SLURM + pyxis cluster 19 | # Using the slurm mode requires a cluster spec file 20 | # and setting CLUSTER_SPEC_PATH to the path of it. 21 | # MODE=slurm 22 | 23 | # `experiment_name` and `trial_name` can be arbitrary. 24 | # Logs and saved checkpoints will be indexed by them. 25 | EXP_NAME=quickstart-ppo 26 | TRIAL_NAME=$MODEL_FAMILY-$MODE-heuristic 27 | 28 | # We use the "heuristic" allocation mode here to automatically determine the parallelism strategy 29 | # for each model function call, i.e., actor generation, critic inference, actor train, etc. 30 | # The number of GPUs is `n_nodes` * `n_gpus_per_node` (not set explictly here, defaults to 8). 
31 | # ReaL will make full use of these available GPUs to design allocations. 32 | # This does not ensure the optimal throughput, but it is a good starting point. 33 | 34 | # The `heuristic` allocation mode is not ensured to run with every model configurations. 35 | # For example, if the vocabulary size is an odd number, the model parallelism may not work. 36 | # In these cases, you can use the `ppo_manual.sh` to specify the parallelism strategy manually. 37 | 38 | # The `ppo` subcommand specifies that this is a PPO experiment. 39 | # The `save_freq_steps` is set to `null` to disable saving checkpoints. 40 | # Enable it if you want to save checkpoints. 41 | # The `ppo` option is used to control the generation and PPO algorithm hyperparameters. 42 | # Note that the performance of PPO is sensitive to the the pre-trained model and hyperparameters. 43 | # It's the user's responsibility to tune them appropriately. 44 | python3 -m realhf.apps.quickstart ppo \ 45 | mode=$MODE \ 46 | experiment_name=$EXP_NAME \ 47 | trial_name=$TRIAL_NAME \ 48 | exp_ctrl.total_train_epochs=1 \ 49 | exp_ctrl.save_freq_steps=null \ 50 | n_nodes=4 \ 51 | allocation_mode=heuristic \ 52 | actor.type._class=$MODEL_FAMILY \ 53 | actor.path=$SFT_MODEL_PATH \ 54 | critic.type._class=$MODEL_FAMILY \ 55 | critic.type.is_critic=True \ 56 | critic.path=$RW_MODEL_PATH \ 57 | ref.type._class=$MODEL_FAMILY \ 58 | ref.path=$SFT_MODEL_PATH \ 59 | rew.type._class=$MODEL_FAMILY \ 60 | rew.type.is_critic=True \ 61 | rew.path=$RW_MODEL_PATH \ 62 | dataset.path=.data/ppo_prompt.jsonl \ 63 | dataset.max_prompt_len=128 \ 64 | dataset.train_bs_n_seqs=128 \ 65 | ppo.gen.max_new_tokens=512 \ 66 | ppo.gen.min_new_tokens=512 \ 67 | ppo.gen.use_cuda_graph=True \ 68 | ppo.gen.top_p=0.9 ppo.gen.top_k=1000 \ 69 | ppo.ppo_n_minibatches=4 \ 70 | ppo.kl_ctl=0.1 \ 71 | ppo.value_eps_clip=0.2 \ 72 | ppo.reward_output_scaling=1.0 \ 73 | ppo.adv_norm=True ppo.value_norm=True -------------------------------------------------------------------------------- /examples/scripts/distributed_ray/rw.sh: -------------------------------------------------------------------------------- 1 | # MODEL_FAMILY specifies how the pretrained checkpoint is loaded, e.g., as a LLaMA model or a GPT model. 2 | MODEL_FAMILY=llama 3 | 4 | # PRETRAINED_PATH is the HuggingFace checkpoint or the saved SFT checkpoint. 5 | # The latter is the common practice. 6 | # ReaL saves checkpoints with the same format as HuggingFace, 7 | # so you don't need to convert or split checkpoints explicitly. 8 | # HF pretrained checkpoint 9 | PRETRAINED_PATH=/lustre/public/pretrained_model_weights/Llama-2-7b-hf 10 | # or SFT checkpoint 11 | PRETRAINED_PATH=/lustre/aigc/llm/checkpoints/fw/quickstart-sft/llama-local-manual/default/epoch7epochstep5globalstep50/ 12 | 13 | # Option 1: The experiment runs locally with subprocesses. 14 | # MODE=local 15 | # Option 2: The experiment runs in a Ray cluster 16 | MODE=ray 17 | # Option 3: The experiment runs in a SLURM + pyxis cluster 18 | # Using the slurm mode requires a cluster spec file 19 | # and setting CLUSTER_SPEC_PATH to the path of it. 20 | # MODE=slurm 21 | 22 | # `experiment_name` and `trial_name` can be arbitrary. 23 | # Logs and saved checkpoints will be indexed by them. 24 | EXP_NAME=quickstart-rw 25 | TRIAL_NAME=$MODEL_FAMILY-$MODE-manual 26 | 27 | # We use the "manual" allocation mode here to manually specify the parallelism strategy, 28 | # which is pipeline=2, tensor-model=2, and data=2, using in total of 8 GPUs. 
29 | 30 | # The `rw` subcommand specifies that this is a reward modeling experiment. 31 | # The reward modeling experiment converges very fast, so we set a smaller 32 | # `total_train_epochs` and `save_freq_steps` for demonstration. 33 | # Note that we set `model.type.is_critic=True` to initialize a reward model from the LLM 34 | # by re-initializing the LM head. 35 | python3 -m realhf.apps.quickstart rw \ 36 | mode=$MODE \ 37 | experiment_name=$EXP_NAME \ 38 | trial_name=$TRIAL_NAME \ 39 | exp_ctrl.total_train_epochs=2 \ 40 | exp_ctrl.save_freq_steps=10 \ 41 | exp_ctrl.eval_freq_epochs=1 \ 42 | model.optimizer.type=adam \ 43 | model.optimizer.lr_scheduler_type=cosine \ 44 | model.optimizer.lr=1e-5 \ 45 | model.optimizer.warmup_steps_proportion=0.02 \ 46 | model.type._class=$MODEL_FAMILY \ 47 | model.type.is_critic=True \ 48 | model.path=$PRETRAINED_PATH \ 49 | dataset.train_path=.data/rm_paired-train.jsonl \ 50 | dataset.valid_path=.data/rm_paired-valid.jsonl \ 51 | dataset.max_seqlen=1024 \ 52 | dataset.train_bs_n_seqs=512 \ 53 | dataset.valid_bs_n_seqs=512 \ 54 | allocation_mode=manual \ 55 | n_nodes=2 \ 56 | allocation.parallel.pipeline_parallel_size=2 \ 57 | allocation.parallel.model_parallel_size=2 \ 58 | allocation.parallel.data_parallel_size=4 \ 59 | allocation.parallel.use_sequence_parallel=True -------------------------------------------------------------------------------- /examples/scripts/distributed_ray/sft.sh: -------------------------------------------------------------------------------- 1 | # MODEL_FAMILY specifies how the pretrained checkpoint is loaded, e.g., as a LLaMA model or a GPT model. 2 | MODEL_FAMILY=llama 3 | 4 | # PRETRAINED_PATH is the HuggingFace checkpoint. 5 | PRETRAINED_PATH=/lustre/public/pretrained_model_weights/Llama-2-7b-hf 6 | 7 | # Option 1: The experiment runs locally with subprocesses. 8 | # MODE=local 9 | # Option 2: The experiment runs in a Ray cluster 10 | MODE=ray 11 | # Option 3: The experiment runs in a SLURM + pyxis cluster 12 | # Using the slurm mode requires a cluster spec file 13 | # and setting CLUSTER_SPEC_PATH to the path of it. 14 | # MODE=slurm 15 | 16 | # `experiment_name` and `trial_name` can be arbitrary. 17 | # Logs and saved checkpoints will be indexed by them. 18 | EXP_NAME=quickstart-sft 19 | TRIAL_NAME=$MODEL_FAMILY-$MODE-manual 20 | 21 | # We use the "manual" allocation mode here to manually specify the parallelism strategy, 22 | # which is pipeline=2, tensor-model=2, and data=2, using in total of 8 GPUs. 23 | 24 | # The `sft` subcommand specifies that this is a supervised fine-tuning experiment. 
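# Note: with n_nodes=4 and the allocation below (pipeline=2, tensor-model=4, data=4),
# this run spans 2 * 4 * 4 = 32 GPUs in total, i.e. 4 nodes with 8 GPUs each.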
25 | python3 -m realhf.apps.quickstart sft \ 26 | mode=$MODE \ 27 | experiment_name=$EXP_NAME \ 28 | trial_name=$TRIAL_NAME \ 29 | exp_ctrl.total_train_epochs=8 \ 30 | exp_ctrl.save_freq_steps=50 \ 31 | exp_ctrl.eval_freq_epochs=1 \ 32 | model.optimizer.type=adam \ 33 | model.optimizer.lr_scheduler_type=cosine \ 34 | model.optimizer.lr=1e-5 \ 35 | model.optimizer.warmup_steps_proportion=0.02 \ 36 | model.type._class=$MODEL_FAMILY \ 37 | model.path=$PRETRAINED_PATH \ 38 | dataset.train_path=.data/sft_pos-train.jsonl \ 39 | dataset.valid_path=.data/sft_pos-train.jsonl \ 40 | dataset.max_seqlen=1024 \ 41 | dataset.train_bs_n_seqs=512 \ 42 | dataset.valid_bs_n_seqs=512 \ 43 | allocation_mode=manual \ 44 | n_nodes=4 \ 45 | allocation.parallel.pipeline_parallel_size=2 \ 46 | allocation.parallel.model_parallel_size=4 \ 47 | allocation.parallel.data_parallel_size=4 \ 48 | allocation.parallel.use_sequence_parallel=True -------------------------------------------------------------------------------- /examples/scripts/distributed_slurm/dpo.sh: -------------------------------------------------------------------------------- 1 | # MODEL_FAMILY specifies how the pretrained checkpoint is loaded, e.g., as a LLaMA model or a GPT model. 2 | MODEL_FAMILY=gpt2 3 | 4 | # PRETRAINED_PATH is the HuggingFace checkpoint or the saved SFT checkpoint. 5 | # The latter is the common practice. 6 | # ReaL saves checkpoints with the same format as HuggingFace, 7 | # so you don't need to convert or split checkpoints explicitly. 8 | PRETRAINED_PATH=/lustre/aigc/llm/checkpoints/fw/quickstart-sft/$MODEL_FAMILY/default/epoch7epochstep5globalstep50/ 9 | 10 | # Option 1: The experiment runs locally with subprocesses. 11 | # MODE=local 12 | # Option 2: The experiment runs in a Ray cluster 13 | # MODE=ray 14 | # Option 3: The experiment runs in a SLURM + pyxis cluster 15 | # Using the slurm mode requires a cluster spec file 16 | # and setting CLUSTER_SPEC_PATH to the path of it. 17 | MODE=slurm 18 | 19 | # `experiment_name` and `trial_name` can be arbitrary. 20 | # Logs and saved checkpoints will be indexed by them. 21 | EXP_NAME=quickstart-dpo 22 | TRIAL_NAME=$MODEL_FAMILY-$MODE-manual 23 | 24 | # We use the "manual" allocation mode here to manually specify the parallelism strategy of training 25 | # and inference. The parallel strategy for training prefers tensor-model parallelism while the 26 | # inference prefers pipeline parallelism, which are more efficient for their corresponding workloads. 27 | 28 | # The `dpo` subcommand specifies that this is a DPO experiment. 29 | # The `save_freq_steps` is set to `null` to disable saving checkpoints. 30 | # Enable it if you want to save checkpoints. 
31 | export CLUSTER_SPEC_PATH="/lustre/aigc/llm/cluster/qh.json" 32 | python3 -m realhf.apps.quickstart dpo \ 33 | mode=$MODE \ 34 | experiment_name=$EXP_NAME \ 35 | trial_name=$TRIAL_NAME \ 36 | exp_ctrl.total_train_epochs=2 \ 37 | exp_ctrl.save_freq_steps=null \ 38 | n_nodes=2 \ 39 | allocation_mode=manual \ 40 | actor.type._class=$MODEL_FAMILY \ 41 | actor.path=$PRETRAINED_PATH \ 42 | actor_train.parallel.pipeline_parallel_size=4 \ 43 | actor_train.parallel.model_parallel_size=1 \ 44 | actor_train.parallel.data_parallel_size=4 \ 45 | actor_train.parallel.use_sequence_parallel=True \ 46 | ref.type._class=$MODEL_FAMILY \ 47 | ref.path=$PRETRAINED_PATH \ 48 | ref_inf.parallel.pipeline_parallel_size=4 \ 49 | ref_inf.parallel.model_parallel_size=1 \ 50 | ref_inf.parallel.data_parallel_size=4 \ 51 | ref_inf.parallel.use_sequence_parallel=True \ 52 | dataset.train_path=.data/rm_paired-train.jsonl \ 53 | dataset.max_pairs_per_prompt=2 \ 54 | dataset.max_seqlen=1024 \ 55 | dataset.train_bs_n_seqs=512 \ 56 | dataset.valid_bs_n_seqs=512 -------------------------------------------------------------------------------- /examples/scripts/distributed_slurm/ppo.sh: -------------------------------------------------------------------------------- 1 | # MODEL_FAMILY specifies how the pretrained checkpoint is loaded, e.g., as a LLaMA model or a GPT model. 2 | # You can specify different model families for the SFT and the RW model, but you need to 3 | # re-tokenize the sequences if necessary. 4 | MODEL_FAMILY=llama 5 | 6 | # SFT_MODEL_PATH and RW_MODEL_PATH are the saved SFT and RW checkpoints. 7 | # ReaL saves checkpoints with the same format as HuggingFace, 8 | # so you don't need to convert or split checkpoints explicitly. 9 | # You can also directly use the pre-trained HuggingFace checkpoint, but this 10 | # will not ensure the optimal algorithm performance. 11 | SFT_MODEL_PATH=/lustre/aigc/llm/checkpoints/fw/quickstart-sft/$MODEL_FAMILY-local-manual/default/epoch7epochstep5globalstep50/ 12 | RW_MODEL_PATH=/lustre/aigc/llm/checkpoints/fw/quickstart-rw/$MODEL_FAMILY-ray-manual/default/epoch1epochstep10globalstep10/ 13 | 14 | # Option 1: The experiment runs locally with subprocesses. 15 | # MODE=local 16 | # Option 2: The experiment runs in a Ray cluster 17 | # MODE=ray 18 | # Option 3: The experiment runs in a SLURM + pyxis cluster 19 | # Using the slurm mode requires a cluster spec file 20 | # and setting CLUSTER_SPEC_PATH to the path of it. 21 | MODE=slurm 22 | 23 | # `experiment_name` and `trial_name` can be arbitrary. 24 | # Logs and saved checkpoints will be indexed by them. 25 | EXP_NAME=quickstart-ppo 26 | TRIAL_NAME=$MODEL_FAMILY-$MODE-heuristic 27 | 28 | # We use the "heuristic" allocation mode here to automatically determine the parallelism strategy 29 | # for each model function call, i.e., actor generation, critic inference, actor train, etc. 30 | # The number of GPUs is `n_nodes` * `n_gpus_per_node` (not set explictly here, defaults to 8). 31 | # ReaL will make full use of these available GPUs to design allocations. 32 | # This does not ensure the optimal throughput, but it is a good starting point. 33 | 34 | # The `heuristic` allocation mode is not ensured to run with every model configurations. 35 | # For example, if the vocabulary size is an odd number, the model parallelism may not work. 36 | # In these cases, you can use the `ppo_manual.sh` to specify the parallelism strategy manually. 37 | 38 | # The `ppo` subcommand specifies that this is a PPO experiment. 
39 | # The `save_freq_steps` is set to `null` to disable saving checkpoints. 40 | # Enable it if you want to save checkpoints. 41 | # The `ppo` option is used to control the generation and PPO algorithm hyperparameters. 42 | # Note that the performance of PPO is sensitive to the the pre-trained model and hyperparameters. 43 | # It's the user's responsibility to tune them appropriately. 44 | export CLUSTER_SPEC_PATH="/lustre/aigc/llm/cluster/qh.json" 45 | python3 -m realhf.apps.quickstart ppo \ 46 | mode=$MODE \ 47 | experiment_name=$EXP_NAME \ 48 | trial_name=$TRIAL_NAME \ 49 | exp_ctrl.total_train_epochs=1 \ 50 | exp_ctrl.save_freq_steps=null \ 51 | n_nodes=4 \ 52 | allocation_mode=heuristic \ 53 | actor.type._class=$MODEL_FAMILY \ 54 | actor.path=$SFT_MODEL_PATH \ 55 | critic.type._class=$MODEL_FAMILY \ 56 | critic.type.is_critic=True \ 57 | critic.path=$RW_MODEL_PATH \ 58 | ref.type._class=$MODEL_FAMILY \ 59 | ref.path=$SFT_MODEL_PATH \ 60 | rew.type._class=$MODEL_FAMILY \ 61 | rew.type.is_critic=True \ 62 | rew.path=$RW_MODEL_PATH \ 63 | dataset.path=.data/ppo_prompt.jsonl \ 64 | dataset.max_prompt_len=128 \ 65 | dataset.train_bs_n_seqs=128 \ 66 | ppo.gen.max_new_tokens=512 \ 67 | ppo.gen.min_new_tokens=512 \ 68 | ppo.gen.use_cuda_graph=True \ 69 | ppo.gen.top_p=0.9 ppo.gen.top_k=1000 \ 70 | ppo.ppo_n_minibatches=4 \ 71 | ppo.kl_ctl=0.1 \ 72 | ppo.value_eps_clip=0.2 \ 73 | ppo.reward_output_scaling=1.0 \ 74 | ppo.adv_norm=True ppo.value_norm=True -------------------------------------------------------------------------------- /examples/scripts/distributed_slurm/rw.sh: -------------------------------------------------------------------------------- 1 | # MODEL_FAMILY specifies how the pretrained checkpoint is loaded, e.g., as a LLaMA model or a GPT model. 2 | MODEL_FAMILY=llama 3 | 4 | # PRETRAINED_PATH is the HuggingFace checkpoint or the saved SFT checkpoint. 5 | # The latter is the common practice. 6 | # ReaL saves checkpoints with the same format as HuggingFace, 7 | # so you don't need to convert or split checkpoints explicitly. 8 | # HF pretrained checkpoint 9 | PRETRAINED_PATH=/lustre/public/pretrained_model_weights/Llama-2-7b-hf 10 | # or SFT checkpoint 11 | PRETRAINED_PATH=/lustre/aigc/llm/checkpoints/fw/quickstart-sft/llama-local-manual/default/epoch7epochstep5globalstep50/ 12 | 13 | # Option 1: The experiment runs locally with subprocesses. 14 | # MODE=local 15 | # Option 2: The experiment runs in a Ray cluster 16 | # MODE=ray 17 | # Option 3: The experiment runs in a SLURM + pyxis cluster 18 | # Using the slurm mode requires a cluster spec file 19 | # and setting CLUSTER_SPEC_PATH to the path of it. 20 | MODE=slurm 21 | 22 | # `experiment_name` and `trial_name` can be arbitrary. 23 | # Logs and saved checkpoints will be indexed by them. 24 | EXP_NAME=quickstart-rw 25 | TRIAL_NAME=$MODEL_FAMILY-$MODE-manual 26 | 27 | # We use the "manual" allocation mode here to manually specify the parallelism strategy, 28 | # which is pipeline=2, tensor-model=2, and data=2, using in total of 8 GPUs. 29 | 30 | # The `rw` subcommand specifies that this is a reward modeling experiment. 31 | # The reward modeling experiment converges very fast, so we set a smaller 32 | # `total_train_epochs` and `save_freq_steps` for demonstration. 33 | # Note that we set `model.type.is_critic=True` to initialize a reward model from the LLM 34 | # by re-initializing the LM head. 
35 | export CLUSTER_SPEC_PATH="/lustre/aigc/llm/cluster/qh.json" 36 | python3 -m realhf.apps.quickstart rw \ 37 | mode=$MODE \ 38 | experiment_name=$EXP_NAME \ 39 | trial_name=$TRIAL_NAME \ 40 | exp_ctrl.total_train_epochs=2 \ 41 | exp_ctrl.save_freq_steps=10 \ 42 | exp_ctrl.eval_freq_epochs=1 \ 43 | model.optimizer.type=adam \ 44 | model.optimizer.lr_scheduler_type=cosine \ 45 | model.optimizer.lr=1e-5 \ 46 | model.optimizer.warmup_steps_proportion=0.02 \ 47 | model.type._class=$MODEL_FAMILY \ 48 | model.type.is_critic=True \ 49 | model.path=$PRETRAINED_PATH \ 50 | dataset.train_path=.data/rm_paired-train.jsonl \ 51 | dataset.valid_path=.data/rm_paired-valid.jsonl \ 52 | dataset.max_seqlen=1024 \ 53 | dataset.train_bs_n_seqs=512 \ 54 | dataset.valid_bs_n_seqs=512 \ 55 | allocation_mode=manual \ 56 | n_nodes=2 \ 57 | allocation.parallel.pipeline_parallel_size=2 \ 58 | allocation.parallel.model_parallel_size=2 \ 59 | allocation.parallel.data_parallel_size=4 \ 60 | allocation.parallel.use_sequence_parallel=True -------------------------------------------------------------------------------- /examples/scripts/distributed_slurm/sft.sh: -------------------------------------------------------------------------------- 1 | # MODEL_FAMILY specifies how the pretrained checkpoint is loaded, e.g., as a LLaMA model or a GPT model. 2 | MODEL_FAMILY=llama 3 | 4 | # PRETRAINED_PATH is the HuggingFace checkpoint. 5 | PRETRAINED_PATH=/lustre/public/pretrained_model_weights/Llama-2-7b-hf 6 | 7 | # Option 1: The experiment runs locally with subprocesses. 8 | # MODE=local 9 | # Option 2: The experiment runs in a Ray cluster 10 | # MODE=ray 11 | # Option 3: The experiment runs in a SLURM + pyxis cluster 12 | # Using the slurm mode requires a cluster spec file 13 | # and setting CLUSTER_SPEC_PATH to the path of it. 14 | MODE=slurm 15 | 16 | # `experiment_name` and `trial_name` can be arbitrary. 17 | # Logs and saved checkpoints will be indexed by them. 18 | EXP_NAME=quickstart-sft 19 | TRIAL_NAME=$MODEL_FAMILY-$MODE-manual 20 | 21 | # We use the "manual" allocation mode here to manually specify the parallelism strategy, 22 | # which is pipeline=2, tensor-model=2, and data=2, using in total of 8 GPUs. 23 | 24 | # The `sft` subcommand specifies that this is a supervised fine-tuning experiment. 
25 | export CLUSTER_SPEC_PATH="/lustre/aigc/llm/cluster/qh.json" 26 | python3 -m realhf.apps.quickstart sft \ 27 | mode=$MODE \ 28 | experiment_name=$EXP_NAME \ 29 | trial_name=$TRIAL_NAME \ 30 | exp_ctrl.total_train_epochs=8 \ 31 | exp_ctrl.save_freq_steps=50 \ 32 | exp_ctrl.eval_freq_epochs=1 \ 33 | model.optimizer.type=adam \ 34 | model.optimizer.lr_scheduler_type=cosine \ 35 | model.optimizer.lr=1e-5 \ 36 | model.optimizer.warmup_steps_proportion=0.02 \ 37 | model.type._class=$MODEL_FAMILY \ 38 | model.path=$PRETRAINED_PATH \ 39 | dataset.train_path=.data/sft_pos-train.jsonl \ 40 | dataset.valid_path=.data/sft_pos-train.jsonl \ 41 | dataset.max_seqlen=1024 \ 42 | dataset.train_bs_n_seqs=512 \ 43 | dataset.valid_bs_n_seqs=512 \ 44 | allocation_mode=manual \ 45 | n_nodes=4 \ 46 | allocation.parallel.pipeline_parallel_size=2 \ 47 | allocation.parallel.model_parallel_size=4 \ 48 | allocation.parallel.data_parallel_size=4 \ 49 | allocation.parallel.use_sequence_parallel=True -------------------------------------------------------------------------------- /examples/scripts/local/dpo.sh: -------------------------------------------------------------------------------- 1 | # MODEL_FAMILY specifies how the pretrained checkpoint is loaded, e.g., as a LLaMA model or a GPT model. 2 | MODEL_FAMILY=gpt2 3 | 4 | # PRETRAINED_PATH is the HuggingFace checkpoint or the saved SFT checkpoint. 5 | # The latter is the common practice. 6 | # ReaL saves checkpoints with the same format as HuggingFace, 7 | # so you don't need to convert or split checkpoints explicitly. 8 | PRETRAINED_PATH=/lustre/aigc/llm/checkpoints/fw/quickstart-sft/$MODEL_FAMILY/default/epoch7epochstep5globalstep50/ 9 | 10 | # Option 1: The experiment runs locally with subprocesses. 11 | MODE=local 12 | # Option 2: The experiment runs in a Ray cluster 13 | # MODE=ray 14 | # Option 3: The experiment runs in a SLURM + pyxis cluster 15 | # Using the slurm mode requires a cluster spec file 16 | # and setting CLUSTER_SPEC_PATH to the path of it. 17 | # MODE=slurm 18 | 19 | # `experiment_name` and `trial_name` can be arbitrary. 20 | # Logs and saved checkpoints will be indexed by them. 21 | EXP_NAME=quickstart-dpo 22 | TRIAL_NAME=$MODEL_FAMILY-$MODE-manual 23 | 24 | # We use the "manual" allocation mode here to manually specify the parallelism strategy of training 25 | # and inference. The parallel strategy for training prefers tensor-model parallelism while the 26 | # inference prefers pipeline parallelism, which are more efficient for their corresponding workloads. 27 | 28 | # The `dpo` subcommand specifies that this is a DPO experiment. 29 | # The `save_freq_steps` is set to `null` to disable saving checkpoints. 30 | # Enable it if you want to save checkpoints. 
31 | python3 -m realhf.apps.quickstart dpo \ 32 | mode=$MODE \ 33 | experiment_name=$EXP_NAME \ 34 | trial_name=$TRIAL_NAME \ 35 | exp_ctrl.total_train_epochs=2 \ 36 | exp_ctrl.save_freq_steps=null \ 37 | n_nodes=1 \ 38 | allocation_mode=manual \ 39 | actor.type._class=$MODEL_FAMILY \ 40 | actor.path=$PRETRAINED_PATH \ 41 | actor_train.parallel.pipeline_parallel_size=2 \ 42 | actor_train.parallel.model_parallel_size=1 \ 43 | actor_train.parallel.data_parallel_size=4 \ 44 | actor_train.parallel.use_sequence_parallel=True \ 45 | ref.type._class=$MODEL_FAMILY \ 46 | ref.path=$PRETRAINED_PATH \ 47 | ref_inf.parallel.pipeline_parallel_size=4 \ 48 | ref_inf.parallel.model_parallel_size=1 \ 49 | ref_inf.parallel.data_parallel_size=2 \ 50 | ref_inf.parallel.use_sequence_parallel=True \ 51 | dataset.train_path=.data/rm_paired-train.jsonl \ 52 | dataset.max_pairs_per_prompt=2 \ 53 | dataset.max_seqlen=1024 \ 54 | dataset.train_bs_n_seqs=512 \ 55 | dataset.valid_bs_n_seqs=512 -------------------------------------------------------------------------------- /examples/scripts/local/gen.sh: -------------------------------------------------------------------------------- 1 | MODEL_FAMILY=llama 2 | SFT_MODEL_PATH=/lustre/aigc/llm/checkpoints/fw/quickstart-sft/$MODEL_FAMILY-local-manual/default/epoch7epochstep5globalstep50/ 3 | 4 | MODE=local 5 | 6 | EXP_NAME=quickstart-gen 7 | TRIAL_NAME=$MODEL_FAMILY-$MODE 8 | 9 | python3 -m realhf.apps.quickstart gen \ 10 | mode=$MODE \ 11 | experiment_name=$EXP_NAME \ 12 | trial_name=$TRIAL_NAME \ 13 | exp_ctrl.total_train_epochs=1 \ 14 | exp_ctrl.save_freq_steps=null \ 15 | n_nodes=1 \ 16 | allocation_mode=manual \ 17 | model.type._class=$MODEL_FAMILY \ 18 | model.path=$SFT_MODEL_PATH \ 19 | dataset.path=.data/ppo_prompt.jsonl \ 20 | dataset.max_prompt_len=1024 \ 21 | dataset.train_bs_n_seqs=100 \ 22 | allocation.parallel.pipeline_parallel_size=1 \ 23 | allocation.parallel.model_parallel_size=2 \ 24 | allocation.parallel.data_parallel_size=4 \ 25 | gen.max_new_tokens=1024 \ 26 | gen.min_new_tokens=1024 \ 27 | gen.use_cuda_graph=True \ 28 | gen.top_p=0.9 gen.top_k=1000 -------------------------------------------------------------------------------- /examples/scripts/local/ppo.sh: -------------------------------------------------------------------------------- 1 | # MODEL_FAMILY specifies how the pretrained checkpoint is loaded, e.g., as a LLaMA model or a GPT model. 2 | # You can specify different model families for the SFT and the RW model, but you need to 3 | # re-tokenize the sequences if necessary. 4 | MODEL_FAMILY=llama 5 | 6 | # SFT_MODEL_PATH and RW_MODEL_PATH are the saved SFT and RW checkpoints. 7 | # ReaL saves checkpoints with the same format as HuggingFace, 8 | # so you don't need to convert or split checkpoints explicitly. 9 | # You can also directly use the pre-trained HuggingFace checkpoint, but this 10 | # will not ensure the optimal algorithm performance. 11 | SFT_MODEL_PATH=/lustre/aigc/llm/checkpoints/fw/quickstart-sft/$MODEL_FAMILY-local-manual/default/epoch7epochstep5globalstep50/ 12 | RW_MODEL_PATH=/lustre/aigc/llm/checkpoints/fw/quickstart-rw/$MODEL_FAMILY-ray-manual/default/epoch1epochstep10globalstep10/ 13 | 14 | # Option 1: The experiment runs locally with subprocesses. 15 | MODE=local 16 | # Option 2: The experiment runs in a Ray cluster 17 | # MODE=ray 18 | # Option 3: The experiment runs in a SLURM + pyxis cluster 19 | # Using the slurm mode requires a cluster spec file 20 | # and setting CLUSTER_SPEC_PATH to the path of it. 
21 | # MODE=slurm 22 | 23 | # `experiment_name` and `trial_name` can be arbitrary. 24 | # Logs and saved checkpoints will be indexed by them. 25 | EXP_NAME=quickstart-ppo 26 | TRIAL_NAME=$MODEL_FAMILY-$MODE-heuristic 27 | 28 | # We use the "heuristic" allocation mode here to automatically determine the parallelism strategy 29 | # for each model function call, i.e., actor generation, critic inference, actor train, etc. 30 | # The number of GPUs is `n_nodes` * `n_gpus_per_node` (not set explictly here, defaults to 8). 31 | # ReaL will make full use of these available GPUs to design allocations. 32 | # This does not ensure the optimal throughput, but it is a good starting point. 33 | 34 | # The `heuristic` allocation mode is not ensured to run with every model configurations. 35 | # For example, if the vocabulary size is an odd number, the model parallelism may not work. 36 | # In these cases, you can use the `ppo_manual.sh` to specify the parallelism strategy manually. 37 | 38 | # The `ppo` subcommand specifies that this is a PPO experiment. 39 | # The `save_freq_steps` is set to `null` to disable saving checkpoints. 40 | # Enable it if you want to save checkpoints. 41 | # The `ppo` option is used to control the generation and PPO algorithm hyperparameters. 42 | # Note that the performance of PPO is sensitive to the the pre-trained model and hyperparameters. 43 | # It's the user's responsibility to tune them appropriately. 44 | python3 -m realhf.apps.quickstart ppo \ 45 | mode=$MODE \ 46 | experiment_name=$EXP_NAME \ 47 | trial_name=$TRIAL_NAME \ 48 | exp_ctrl.total_train_epochs=1 \ 49 | exp_ctrl.save_freq_steps=null \ 50 | n_nodes=1 \ 51 | allocation_mode=heuristic \ 52 | actor.type._class=$MODEL_FAMILY \ 53 | actor.path=$SFT_MODEL_PATH \ 54 | critic.type._class=$MODEL_FAMILY \ 55 | critic.type.is_critic=True \ 56 | critic.path=$RW_MODEL_PATH \ 57 | ref.type._class=$MODEL_FAMILY \ 58 | ref.path=$SFT_MODEL_PATH \ 59 | rew.type._class=$MODEL_FAMILY \ 60 | rew.type.is_critic=True \ 61 | rew.path=$RW_MODEL_PATH \ 62 | dataset.path=.data/ppo_prompt.jsonl \ 63 | dataset.max_prompt_len=128 \ 64 | dataset.train_bs_n_seqs=128 \ 65 | ppo.gen.max_new_tokens=512 \ 66 | ppo.gen.min_new_tokens=512 \ 67 | ppo.gen.use_cuda_graph=True \ 68 | ppo.gen.top_p=0.9 ppo.gen.top_k=1000 \ 69 | ppo.ppo_n_minibatches=4 \ 70 | ppo.kl_ctl=0.1 \ 71 | ppo.value_eps_clip=0.2 \ 72 | ppo.reward_output_scaling=1.0 \ 73 | ppo.adv_norm=True ppo.value_norm=True -------------------------------------------------------------------------------- /examples/scripts/local/ppo_minibatched.sh: -------------------------------------------------------------------------------- 1 | # MODEL_FAMILY specifies how the pretrained checkpoint is loaded, e.g., as a LLaMA model or a GPT model. 2 | # You can specify different model families for the SFT and the RW model, but you need to 3 | # re-tokenize the sequences if necessary. 4 | MODEL_FAMILY=llama 5 | 6 | # SFT_MODEL_PATH and RW_MODEL_PATH are the saved SFT and RW checkpoints. 7 | # ReaL saves checkpoints with the same format as HuggingFace, 8 | # so you don't need to convert or split checkpoints explicitly. 9 | # You can also directly use the pre-trained HuggingFace checkpoint, but this 10 | # will not ensure the optimal algorithm performance. 
11 | SFT_MODEL_PATH=/lustre/aigc/llm/checkpoints/fw/quickstart-sft/$MODEL_FAMILY-local-manual/default/epoch7epochstep5globalstep50/ 12 | RW_MODEL_PATH=/lustre/aigc/llm/checkpoints/fw/quickstart-rw/$MODEL_FAMILY-ray-manual/default/epoch1epochstep10globalstep10/ 13 | 14 | # Option 1: The experiment runs locally with subprocesses. 15 | MODE=local 16 | # Option 2: The experiment runs in a Ray cluster 17 | # MODE=ray 18 | # Option 3: The experiment runs in a SLURM + pyxis cluster 19 | # Using the slurm mode requires a cluster spec file 20 | # and setting CLUSTER_SPEC_PATH to the path of it. 21 | # MODE=slurm 22 | 23 | # `experiment_name` and `trial_name` can be arbitrary. 24 | # Logs and saved checkpoints will be indexed by them. 25 | EXP_NAME=quickstart-ppo 26 | TRIAL_NAME=$MODEL_FAMILY-$MODE-heuristic 27 | 28 | # We use the "heuristic" allocation mode here to automatically determine the parallelism strategy 29 | # for each model function call, i.e., actor generation, critic inference, actor train, etc. 30 | # The number of GPUs is `n_nodes` * `n_gpus_per_node` (not set explicitly here, defaults to 8). 31 | # ReaL will make full use of these available GPUs to design allocations. 32 | # This does not ensure the optimal throughput, but it is a good starting point. 33 | 34 | # The `heuristic` allocation mode is not guaranteed to run with every model configuration. 35 | # For example, if the vocabulary size is an odd number, the model parallelism may not work. 36 | # In these cases, you can use `ppo_manual.sh` to specify the parallelism strategy manually. 37 | 38 | # The `ppo` subcommand specifies that this is a PPO experiment. 39 | # The `save_freq_steps` is set to `null` to disable saving checkpoints. 40 | # Enable it if you want to save checkpoints. 41 | # The `ppo` option is used to control the generation and PPO algorithm hyperparameters. 42 | # Note that the performance of PPO is sensitive to the pre-trained model and hyperparameters. 43 | # It's the user's responsibility to tune them appropriately.
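# Compared with `ppo.sh`, the command below also sets the number of micro-batches
# (`*_n_mbs`) for each model function call, so a large batch is processed in several
# smaller pieces. As a rough, hypothetical sketch of the arithmetic: with
# `dataset.train_bs_n_seqs=1024` and `actor_train.n_mbs=4`, each actor training step
# handles its batch in 4 chunks of roughly 1024 / 4 = 256 sequences (ignoring any
# data-parallel splitting). Increasing the `n_mbs` values lowers peak GPU memory per
# function call at some cost in throughput; tune them for your hardware.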
44 | python3 -m realhf.apps.quickstart ppo \ 45 | mode=$MODE \ 46 | experiment_name=$EXP_NAME \ 47 | trial_name=$TRIAL_NAME \ 48 | exp_ctrl.total_train_epochs=1 \ 49 | exp_ctrl.save_freq_steps=null \ 50 | n_nodes=1 \ 51 | allocation_mode=heuristic \ 52 | actor.type._class=$MODEL_FAMILY \ 53 | actor.path=$SFT_MODEL_PATH \ 54 | critic.type._class=$MODEL_FAMILY \ 55 | critic.type.is_critic=True \ 56 | critic.path=$RW_MODEL_PATH \ 57 | ref.type._class=$MODEL_FAMILY \ 58 | ref.path=$SFT_MODEL_PATH \ 59 | rew.type._class=$MODEL_FAMILY \ 60 | rew.type.is_critic=True \ 61 | rew.path=$RW_MODEL_PATH \ 62 | dataset.path=.data/ppo_prompt.jsonl \ 63 | dataset.max_prompt_len=128 \ 64 | dataset.train_bs_n_seqs=1024 \ 65 | ppo.gen.max_new_tokens=512 \ 66 | ppo.gen.min_new_tokens=512 \ 67 | ppo.gen.use_cuda_graph=True \ 68 | ppo.gen.force_no_logits_mask=True \ 69 | ppo.gen.top_p=0.9 ppo.gen.top_k=1000 \ 70 | ppo.ppo_n_minibatches=4 \ 71 | ppo.kl_ctl=0.1 \ 72 | ppo.value_eps_clip=0.2 \ 73 | ppo.reward_output_scaling=1.0 \ 74 | ppo.adv_norm=True ppo.value_norm=True \ 75 | actor_gen.n_mbs=2 \ 76 | actor_train.n_mbs=4 \ 77 | critic_inf.n_mbs=4 \ 78 | critic_train.n_mbs=4 \ 79 | rew_inf.n_mbs=2 \ 80 | ref_inf.n_mbs=8 -------------------------------------------------------------------------------- /examples/scripts/local/ppo_symm.sh: -------------------------------------------------------------------------------- 1 | # MODEL_FAMILY specifies how the pretrained checkpoint is loaded, e.g., as a LLaMA model or a GPT model. 2 | # You can specify different model families for the SFT and the RW model, but you need to 3 | # re-tokenize the sequences if necessary. 4 | MODEL_FAMILY=llama 5 | 6 | # SFT_MODEL_PATH and RW_MODEL_PATH are the saved SFT and RW checkpoints. 7 | # ReaL saves checkpoints with the same format as HuggingFace, 8 | # so you don't need to convert or split checkpoints explicitly. 9 | # You can also directly use the pre-trained HuggingFace checkpoint, but this 10 | # will not ensure the optimal algorithm performance. 11 | SFT_MODEL_PATH=/lustre/aigc/llm/checkpoints/fw/quickstart-sft/$MODEL_FAMILY-local-manual/default/epoch7epochstep5globalstep50/ 12 | RW_MODEL_PATH=/lustre/aigc/llm/checkpoints/fw/quickstart-rw/$MODEL_FAMILY-ray-manual/default/epoch1epochstep10globalstep10/ 13 | 14 | # Option 1: The experiment runs locally with subprocesses. 15 | MODE=local 16 | # Option 2: The experiment runs in a Ray cluster 17 | # MODE=ray 18 | # Option 3: The experiment runs in a SLURM + pyxis cluster 19 | # Using the slurm mode requires a cluster spec file 20 | # and setting CLUSTER_SPEC_PATH to the path of it. 21 | # MODE=slurm 22 | 23 | # `experiment_name` and `trial_name` can be arbitrary. 24 | # Logs and saved checkpoints will be indexed by them. 25 | EXP_NAME=quickstart-ppo 26 | TRIAL_NAME=$MODEL_FAMILY-$MODE-manual 27 | 28 | # When using the "manual" allocation mode, the user should specify the device allocation 29 | # and parallel strategies for each model function call. 30 | # The number of GPUs is `n_nodes` * `n_gpus_per_node` (not set explicitly here, defaults to 8). 31 | # We provide a template in the following command and the user can modify it according to 32 | # the specific model and the available GPUs. 33 | 34 | # The `ppo` subcommand specifies that this is a PPO experiment. 35 | # The `save_freq_steps` is set to `null` to disable saving checkpoints. 36 | # Enable it if you want to save checkpoints. 37 | # The `ppo` option is used to control the generation and PPO algorithm hyperparameters.
38 | # Note that the performance of PPO is sensitive to the pre-trained model and hyperparameters. 39 | # It's the user's responsibility to tune them appropriately. 40 | # The allocation of model function calls is specified by a pattern `hostname:gpu_id1,gpu_id2,...` 41 | # for slicing GPUs of a single node, and `hostname1,hostname2` for multiple nodes. 42 | # Only 1, 2, 4, 8 GPUs on a single node or multiple complete nodes (e.g., 16, 24) are supported. 43 | # If the CLUSTER_SPEC_PATH is not set, `hostname`s are NODE01, NODE02, etc., otherwise it's the 44 | # hostname specified in this file. The `gpu_id`s are the GPU indices on the host, 45 | # from 0 to `n_gpus_per_node` (defaults to 8, can be changed) - 1. 46 | # Once allocations are all set, parallel strategies can be specified as long as the world size 47 | # equals the number of GPUs in the allocation. 48 | 49 | # The following command shows an example of manual allocation on two nodes, 50 | # but it can be modified according to the specific model and the available GPUs. 51 | unset CLUSTER_SPEC_PATH 52 | python3 -m realhf.apps.quickstart ppo \ 53 | mode=$MODE \ 54 | experiment_name=$EXP_NAME \ 55 | trial_name=$TRIAL_NAME \ 56 | exp_ctrl.total_train_epochs=1 \ 57 | exp_ctrl.save_freq_steps=null \ 58 | actor.type._class=$MODEL_FAMILY \ 59 | actor.path=$SFT_MODEL_PATH \ 60 | actor.optimizer.lr_scheduler_type=constant \ 61 | actor.optimizer.lr=1e-4 \ 62 | actor.optimizer.warmup_steps_proportion=0.0 \ 63 | critic.type._class=$MODEL_FAMILY \ 64 | critic.type.is_critic=True \ 65 | critic.path=$RW_MODEL_PATH \ 66 | ref.type._class=$MODEL_FAMILY \ 67 | ref.path=$SFT_MODEL_PATH \ 68 | rew.type._class=$MODEL_FAMILY \ 69 | rew.type.is_critic=True \ 70 | rew.path=$RW_MODEL_PATH \ 71 | dataset.path=.data/ppo_prompt.jsonl \ 72 | dataset.max_prompt_len=128 \ 73 | dataset.train_bs_n_seqs=128 \ 74 | ppo.gen.max_new_tokens=512 \ 75 | ppo.gen.min_new_tokens=512 \ 76 | ppo.gen.top_p=0.9 ppo.gen.top_k=1000 \ 77 | ppo.ppo_n_minibatches=4 \ 78 | ppo.kl_ctl=0.1 \ 79 | ppo.value_eps_clip=0.2 \ 80 | ppo.reward_output_scaling=10.0 \ 81 | ppo.adv_norm=True ppo.value_norm=True \ 82 | allocation_mode=m2d2p2 \ 83 | actor_gen.n_mbs=2 \ 84 | actor_train.n_mbs=4 \ 85 | ref_inf.n_mbs=2 86 | -------------------------------------------------------------------------------- /examples/scripts/local/rw.sh: -------------------------------------------------------------------------------- 1 | # MODEL_FAMILY specifies how the pretrained checkpoint is loaded, e.g., as a LLaMA model or a GPT model. 2 | MODEL_FAMILY=llama 3 | 4 | # PRETRAINED_PATH is the HuggingFace checkpoint or the saved SFT checkpoint. 5 | # The latter is the common practice. 6 | # ReaL saves checkpoints with the same format as HuggingFace, 7 | # so you don't need to convert or split checkpoints explicitly. 8 | # HF pretrained checkpoint 9 | PRETRAINED_PATH=/lustre/public/pretrained_model_weights/Llama-2-7b-hf 10 | # or SFT checkpoint 11 | PRETRAINED_PATH=/lustre/aigc/llm/checkpoints/fw/quickstart-sft/llama-local-manual/default/epoch7epochstep5globalstep50/ 12 | 13 | # Option 1: The experiment runs locally with subprocesses. 14 | MODE=local 15 | # Option 2: The experiment runs in a Ray cluster 16 | # MODE=ray 17 | # Option 3: The experiment runs in a SLURM + pyxis cluster 18 | # Using the slurm mode requires a cluster spec file 19 | # and setting CLUSTER_SPEC_PATH to the path of it. 20 | # MODE=slurm 21 | 22 | # `experiment_name` and `trial_name` can be arbitrary.
23 | # Logs and saved checkpoints will be indexed by them. 24 | EXP_NAME=quickstart-rw 25 | TRIAL_NAME=$MODEL_FAMILY-$MODE-manual 26 | 27 | # We use the "manual" allocation mode here to manually specify the parallelism strategy, 28 | # which is pipeline=2, tensor-model=2, and data=2, using 8 GPUs in total. 29 | 30 | # The `rw` subcommand specifies that this is a reward modeling experiment. 31 | # The reward modeling experiment converges very fast, so we set a smaller 32 | # `total_train_epochs` and `save_freq_steps` for demonstration. 33 | # Note that we set `model.type.is_critic=True` to initialize a reward model from the LLM 34 | # by re-initializing the LM head. 35 | python3 -m realhf.apps.quickstart rw \ 36 | mode=$MODE \ 37 | experiment_name=$EXP_NAME \ 38 | trial_name=$TRIAL_NAME \ 39 | exp_ctrl.total_train_epochs=2 \ 40 | exp_ctrl.save_freq_steps=10 \ 41 | exp_ctrl.eval_freq_epochs=1 \ 42 | model.optimizer.type=adam \ 43 | model.optimizer.lr_scheduler_type=cosine \ 44 | model.optimizer.lr=1e-5 \ 45 | model.optimizer.warmup_steps_proportion=0.02 \ 46 | model.type._class=$MODEL_FAMILY \ 47 | model.type.is_critic=True \ 48 | model.path=$PRETRAINED_PATH \ 49 | dataset.train_path=.data/rm_paired-train.jsonl \ 50 | dataset.valid_path=.data/rm_paired-valid.jsonl \ 51 | dataset.max_seqlen=1024 \ 52 | dataset.train_bs_n_seqs=512 \ 53 | dataset.valid_bs_n_seqs=512 \ 54 | allocation_mode=manual \ 55 | n_nodes=1 \ 56 | allocation.parallel.pipeline_parallel_size=2 \ 57 | allocation.parallel.model_parallel_size=2 \ 58 | allocation.parallel.data_parallel_size=2 \ 59 | allocation.parallel.use_sequence_parallel=True -------------------------------------------------------------------------------- /examples/scripts/local/sft.sh: -------------------------------------------------------------------------------- 1 | # MODEL_FAMILY specifies how the pretrained checkpoint is loaded, e.g., as a LLaMA model or a GPT model. 2 | MODEL_FAMILY=llama 3 | 4 | # PRETRAINED_PATH is the HuggingFace checkpoint. 5 | PRETRAINED_PATH=/lustre/public/pretrained_model_weights/Llama-2-7b-hf 6 | 7 | # Option 1: The experiment runs locally with subprocesses. 8 | MODE=local 9 | # Option 2: The experiment runs in a Ray cluster 10 | # MODE=ray 11 | # Option 3: The experiment runs in a SLURM + pyxis cluster 12 | # Using the slurm mode requires a cluster spec file 13 | # and setting CLUSTER_SPEC_PATH to the path of it. 14 | # MODE=slurm 15 | 16 | # `experiment_name` and `trial_name` can be arbitrary. 17 | # Logs and saved checkpoints will be indexed by them. 18 | EXP_NAME=quickstart-sft 19 | TRIAL_NAME=$MODEL_FAMILY-$MODE-manual 20 | 21 | # We use the "manual" allocation mode here to manually specify the parallelism strategy, 22 | # which is pipeline=2, tensor-model=2, and data=2, using 8 GPUs in total. 23 | 24 | # The `sft` subcommand specifies that this is a supervised fine-tuning experiment.
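# As a quick sanity check on the numbers used below (a sketch assuming the default
# n_gpus_per_node=8): the product of the parallel degrees must match the number of GPUs
# in the allocation, i.e.
#   pipeline_parallel_size * model_parallel_size * data_parallel_size = 2 * 2 * 2 = 8
#   = n_nodes * n_gpus_per_node = 1 * 8.
# If you change one of the degrees, adjust the others (or the GPU count) so that the
# product still equals the world size.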
25 | python3 -m realhf.apps.quickstart sft \ 26 | mode=$MODE \ 27 | experiment_name=$EXP_NAME \ 28 | trial_name=$TRIAL_NAME \ 29 | exp_ctrl.total_train_epochs=8 \ 30 | exp_ctrl.save_freq_steps=50 \ 31 | exp_ctrl.eval_freq_epochs=1 \ 32 | model.optimizer.type=adam \ 33 | model.optimizer.lr_scheduler_type=cosine \ 34 | model.optimizer.lr=1e-5 \ 35 | model.optimizer.warmup_steps_proportion=0.02 \ 36 | model.type._class=$MODEL_FAMILY \ 37 | model.path=$PRETRAINED_PATH \ 38 | dataset.train_path=.data/sft_pos-train.jsonl \ 39 | dataset.valid_path=.data/sft_pos-train.jsonl \ 40 | dataset.max_seqlen=1024 \ 41 | dataset.train_bs_n_seqs=2048 \ 42 | dataset.valid_bs_n_seqs=512 \ 43 | allocation_mode=manual \ 44 | n_nodes=1 \ 45 | allocation.parallel.pipeline_parallel_size=2 \ 46 | allocation.parallel.model_parallel_size=2 \ 47 | allocation.parallel.data_parallel_size=2 \ 48 | allocation.parallel.use_sequence_parallel=True -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0", "packaging", "torch", "pybind11>=2.10.0", "build>=1.2.1"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "realhf" 7 | description = "ReaL: Efficient RLHF Training of Large Language Models with Parameter Reallocation" 8 | readme = "README.md" 9 | requires-python = ">=3.10,<3.12" 10 | dynamic = ["version"] 11 | authors = [ 12 | { name = "Zhiyu Mei", email = "meizy20@mails.tsinghua.edu.cn" }, 13 | { name = "Wei Fu", email = "fuwth17@gmail.com" }, 14 | ] 15 | maintainers = [ 16 | { name = "Zhiyu Mei", email = "meizy20@mails.tsinghua.edu.cn" }, 17 | { name = "Wei Fu", email = "fuwth17@gmail.com" }, 18 | ] 19 | keywords = [ 20 | "distributed-systems", 21 | "reinforcement-learning-from-human-feedback", 22 | "large-language-models", 23 | "llm-training", 24 | ] 25 | classifiers = [ 26 | # 3 - Alpha 27 | # 4 - Beta 28 | # 5 - Production/Stable 29 | "Development Status :: 2 - Pre-Alpha", 30 | "Environment :: GPU :: NVIDIA CUDA :: 12 :: 12.2", 31 | "Intended Audience :: Developers", 32 | "Programming Language :: Python :: 3", 33 | "Programming Language :: Python :: 3.10", 34 | ] 35 | 36 | [project.urls] 37 | Homepage = "https://github.com/openpsi-project/ReaLHF" 38 | Issues = "https://github.com/openpsi-project/ReaLHF/issues" 39 | Documentation = "https://openpsi-project.github.io/ReaLHF/" 40 | Repository = "https://github.com/openpsi-project/ReaLHF" 41 | 42 | [tool.setuptools.dynamic] 43 | version = {attr = "realhf.__version__"} 44 | 45 | [tool.setuptools.packages.find] 46 | where = ["."] # ["."] by default 47 | # include = ["csrc/*", "realhf/*"] # ["*"] by default 48 | # exclude = ["tests", "docker"] # empty by default 49 | # namespaces = false # true by default 50 | 51 | [tool.isort] 52 | profile = "black" 53 | 54 | [tool.pytest.ini_options] 55 | pythonpath = ["."] 56 | 57 | [tool.black] 58 | line-length = 88 59 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | filterwarnings = 3 | ignore::DeprecationWarning 4 | ignore::UserWarning -------------------------------------------------------------------------------- /realhf/__init__.py: -------------------------------------------------------------------------------- 1 | # Re-import these classes for clear documentation, 2 | # otherwise the name will 
have a long prefix like 3 | # realhf.api.quickstart.model.ModelTrainEvalConfig. 4 | from .api.core.config import ModelFamily, ModelName, ModelShardID 5 | from .api.core.data_api import SequenceSample 6 | from .api.core.dfg import MFCDef 7 | from .api.core.model_api import ( 8 | FinetuneSpec, 9 | GenerationHyperparameters, 10 | Model, 11 | ModelBackend, 12 | ModelInterface, 13 | ModelVersion, 14 | PipelinableEngine, 15 | ReaLModelConfig, 16 | ) 17 | from .api.quickstart.dataset import ( 18 | PairedComparisonDatasetConfig, 19 | PromptAnswerDatasetConfig, 20 | PromptOnlyDatasetConfig, 21 | ) 22 | from .api.quickstart.device_mesh import MFCConfig 23 | from .api.quickstart.model import ( 24 | ModelTrainEvalConfig, 25 | OptimizerConfig, 26 | ParallelismConfig, 27 | ) 28 | from .experiments.common.common import CommonExperimentConfig, ExperimentSaveEvalControl 29 | from .experiments.common.dpo_exp import DPOConfig 30 | from .experiments.common.gen_exp import GenerationConfig 31 | from .experiments.common.ppo_exp import PPOConfig, PPOHyperparameters 32 | from .experiments.common.rw_exp import RWConfig 33 | from .experiments.common.sft_exp import SFTConfig 34 | 35 | __version__ = "0.3.0" 36 | -------------------------------------------------------------------------------- /realhf/api/from_hf/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | 4 | from realhf.base.importing import import_module 5 | 6 | import_module(os.path.dirname(__file__), re.compile(r"^(?!.*__init__).*\.py$")) 7 | -------------------------------------------------------------------------------- /realhf/api/from_hf/gemma.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import * 3 | 4 | import torch 5 | import transformers 6 | 7 | from realhf.api.core.model_api import ReaLModelConfig, register_hf_family 8 | from realhf.base.testing import ( 9 | TESTING_MODEL_HEAD_DIM, 10 | TESTING_MODEL_HIDDEN_SIZE, 11 | TESTING_MODEL_INTERMEDIATE_SIZE, 12 | TESTING_MODEL_N_HEADS, 13 | TESTING_MODEL_N_LAYERS, 14 | TESTING_MODEL_N_POSITIONS, 15 | TESTING_MODEL_VOCAB_SIZE, 16 | ) 17 | 18 | from .llama import ( 19 | convert_state_dict_llama, 20 | llama_embedding_layer_names, 21 | llama_output_head_param_name, 22 | llama_transformer_block_param_name, 23 | to_llama_state_dict, 24 | ) 25 | 26 | 27 | def convert_config_gemma( 28 | hf_config: transformers.GemmaConfig, 29 | ) -> ReaLModelConfig: 30 | if hf_config.hidden_activation is None: 31 | act = "gelu_pytorch_tanh" 32 | else: 33 | act = hf_config.hidden_activation 34 | return ReaLModelConfig( 35 | n_layers=hf_config.num_hidden_layers, 36 | n_kv_heads=hf_config.num_key_value_heads, 37 | hidden_dim=hf_config.hidden_size, 38 | n_q_heads=hf_config.num_attention_heads, 39 | head_dim=hf_config.head_dim, 40 | intermediate_dim=hf_config.intermediate_size, 41 | vocab_size=hf_config.vocab_size, 42 | n_positions=hf_config.max_position_embeddings, 43 | embd_pdrop=0.0, 44 | attn_pdrop=( 45 | hf_config.attention_dropout 46 | if hasattr(hf_config, "attention_dropout") 47 | else 0.1 48 | ), 49 | layer_norm_epsilon=hf_config.rms_norm_eps, 50 | activation_function=act, # NOTE: here is different than LLaMA 51 | use_attention_bias=hf_config.attention_bias, 52 | use_attn_proj_bias=hf_config.attention_bias, 53 | scale_attn_by_inverse_layer_idx=False, 54 | layer_norm_type="gemma", 55 | mlp_type="llama", 56 | apply_rotary=True, 57 | rotary_base=hf_config.rope_theta, 58 | 
rotary_interleaved=False, 59 | tied_embedding=hf_config.tie_word_embeddings, 60 | normalize_embed=True, 61 | ) 62 | 63 | 64 | def convert_config_back_gemma( 65 | config: ReaLModelConfig, 66 | ) -> transformers.GemmaConfig: 67 | return transformers.GemmaConfig( 68 | vocab_size=config.vocab_size, 69 | hidden_size=config.hidden_dim, 70 | intermediate_size=config.intermediate_dim, 71 | num_hidden_layers=config.n_layers, 72 | num_key_value_heads=config.n_kv_heads, 73 | num_attention_heads=config.n_q_heads, 74 | head_dim=config.head_dim, 75 | max_position_embeddings=config.n_positions, 76 | rms_norm_eps=config.layer_norm_epsilon, 77 | hidden_act=config.activation_function, 78 | hidden_activation=config.activation_function, 79 | attention_bias=config.use_attention_bias, 80 | attention_dropout=config.attn_pdrop, 81 | rope_theta=config.rotary_base, 82 | tie_word_embeddings=config.tied_embedding, 83 | architectures=["GemmaForCausalLM"], 84 | ) 85 | 86 | 87 | def gemma_config_maker() -> ReaLModelConfig: 88 | hf_config = transformers.GemmaConfig( 89 | attention_bias=False, 90 | hidden_act="gelu", 91 | hidden_size=TESTING_MODEL_HIDDEN_SIZE, 92 | intermediate_size=TESTING_MODEL_INTERMEDIATE_SIZE, 93 | max_position_embeddings=TESTING_MODEL_N_POSITIONS, 94 | num_attention_heads=TESTING_MODEL_N_HEADS, 95 | num_hidden_layers=TESTING_MODEL_N_LAYERS, 96 | num_key_value_heads=4, 97 | head_dim=TESTING_MODEL_HEAD_DIM, 98 | rms_norm_eps=1e-06, 99 | rope_theta=10000.0, 100 | vocab_size=TESTING_MODEL_VOCAB_SIZE, 101 | ) 102 | return convert_config_gemma(hf_config) 103 | 104 | 105 | register_hf_family( 106 | name="gemma", 107 | hf_cls_name="GemmaForCausalLM", 108 | config_from_hf_converter=convert_config_gemma, 109 | config_to_hf_converter=convert_config_back_gemma, 110 | sd_from_hf_converter=convert_state_dict_llama, 111 | sd_to_hf_converter=to_llama_state_dict, 112 | embedding_param_names=llama_embedding_layer_names, 113 | tblock_param_names=llama_transformer_block_param_name, 114 | head_param_names=llama_output_head_param_name, 115 | real_config_maker=gemma_config_maker, 116 | ) 117 | -------------------------------------------------------------------------------- /realhf/api/from_hf/mistral.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import transformers 3 | 4 | from realhf.api.core.model_api import ReaLModelConfig, register_hf_family 5 | from realhf.base.testing import ( 6 | TESTING_MODEL_HEAD_DIM, 7 | TESTING_MODEL_HIDDEN_SIZE, 8 | TESTING_MODEL_INTERMEDIATE_SIZE, 9 | TESTING_MODEL_N_HEADS, 10 | TESTING_MODEL_N_LAYERS, 11 | TESTING_MODEL_N_POSITIONS, 12 | TESTING_MODEL_VOCAB_SIZE, 13 | ) 14 | 15 | from .llama import ( 16 | convert_state_dict_llama, 17 | llama_embedding_layer_names, 18 | llama_output_head_param_name, 19 | llama_transformer_block_param_name, 20 | to_llama_state_dict, 21 | ) 22 | 23 | 24 | def config_from_mistral(hf_config: transformers.MistralConfig) -> ReaLModelConfig: 25 | return ReaLModelConfig( 26 | n_layers=hf_config.num_hidden_layers, 27 | vocab_size=hf_config.vocab_size, 28 | hidden_dim=hf_config.hidden_size, 29 | n_q_heads=hf_config.num_attention_heads, 30 | n_kv_heads=hf_config.num_key_value_heads, 31 | head_dim=hf_config.hidden_size // hf_config.num_attention_heads, 32 | intermediate_dim=hf_config.intermediate_size, 33 | activation_function=hf_config.hidden_act, 34 | n_positions=hf_config.max_position_embeddings, 35 | layer_norm_epsilon=hf_config.rms_norm_eps, 36 | layer_norm_type="rms", 37 | 
tied_embedding=hf_config.tie_word_embeddings, 38 | mlp_type="llama", 39 | rotary_base=hf_config.rope_theta, 40 | apply_rotary=True, 41 | attn_pdrop=hf_config.attention_dropout, 42 | resid_pdrop=0.0, 43 | use_attention_bias=False, 44 | use_attn_proj_bias=False, 45 | embd_pdrop=0.0, 46 | sliding_window=hf_config.sliding_window, 47 | scale_attn_by_inverse_layer_idx=False, 48 | ) 49 | 50 | 51 | def config_to_mistral(config: ReaLModelConfig) -> transformers.MistralConfig: 52 | return transformers.MistralConfig( 53 | num_hidden_layers=config.n_layers, 54 | vocab_size=config.vocab_size, 55 | hidden_size=config.hidden_dim, 56 | num_attention_heads=config.n_q_heads, 57 | num_key_value_heads=config.n_kv_heads, 58 | intermediate_size=config.intermediate_dim, 59 | hidden_act=config.activation_function, 60 | max_position_embeddings=config.n_positions, 61 | rms_norm_eps=config.layer_norm_epsilon, 62 | tie_word_embeddings=False, 63 | rope_theta=config.rotary_base, 64 | attention_dropout=config.attn_pdrop, 65 | sliding_window=config.sliding_window, 66 | architectures=["MistralForCausalLM"], 67 | ) 68 | 69 | 70 | def get_real_config_mistral() -> ReaLModelConfig: 71 | hf_config = transformers.MistralConfig( 72 | vocab_size=TESTING_MODEL_VOCAB_SIZE, 73 | max_position_embeddings=TESTING_MODEL_N_POSITIONS, 74 | hidden_size=TESTING_MODEL_HIDDEN_SIZE, 75 | intermediate_size=TESTING_MODEL_INTERMEDIATE_SIZE, 76 | num_hidden_layers=TESTING_MODEL_N_LAYERS, 77 | num_attention_heads=TESTING_MODEL_N_HEADS, 78 | num_key_value_heads=2, 79 | ) 80 | return config_from_mistral(hf_config) 81 | 82 | 83 | register_hf_family( 84 | "mistral", 85 | "MistralForCausalLM", 86 | config_from_hf_converter=config_from_mistral, 87 | config_to_hf_converter=config_to_mistral, 88 | sd_from_hf_converter=convert_state_dict_llama, 89 | sd_to_hf_converter=to_llama_state_dict, 90 | embedding_param_names=llama_embedding_layer_names, 91 | tblock_param_names=llama_transformer_block_param_name, 92 | head_param_names=llama_output_head_param_name, 93 | real_config_maker=get_real_config_mistral, 94 | ) 95 | -------------------------------------------------------------------------------- /realhf/api/from_hf/qwen2.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import * 3 | 4 | import torch 5 | import transformers 6 | 7 | from realhf.api.core.model_api import ReaLModelConfig, register_hf_family 8 | from realhf.base.testing import ( 9 | TESTING_MODEL_HEAD_DIM, 10 | TESTING_MODEL_HIDDEN_SIZE, 11 | TESTING_MODEL_INTERMEDIATE_SIZE, 12 | TESTING_MODEL_N_HEADS, 13 | TESTING_MODEL_N_LAYERS, 14 | TESTING_MODEL_N_POSITIONS, 15 | TESTING_MODEL_VOCAB_SIZE, 16 | ) 17 | 18 | from .llama import ( 19 | convert_state_dict_llama, 20 | llama_embedding_layer_names, 21 | llama_output_head_param_name, 22 | llama_transformer_block_param_name, 23 | to_llama_state_dict, 24 | ) 25 | 26 | 27 | def convert_config_qwen2( 28 | hf_config: transformers.Qwen2Config, 29 | ) -> ReaLModelConfig: 30 | return ReaLModelConfig( 31 | n_layers=hf_config.num_hidden_layers, 32 | n_kv_heads=hf_config.num_key_value_heads, 33 | hidden_dim=hf_config.hidden_size, 34 | n_q_heads=hf_config.num_attention_heads, 35 | head_dim=hf_config.hidden_size // hf_config.num_attention_heads, 36 | intermediate_dim=hf_config.intermediate_size, 37 | vocab_size=hf_config.vocab_size, 38 | n_positions=hf_config.max_position_embeddings, 39 | embd_pdrop=0.0, 40 | attn_pdrop=( 41 | hf_config.attention_dropout 42 | if hasattr(hf_config, "attention_dropout") 43 
| else 0.1 44 | ), 45 | layer_norm_epsilon=hf_config.rms_norm_eps, 46 | activation_function=hf_config.hidden_act, 47 | use_attention_bias=True, 48 | use_attn_proj_bias=False, 49 | scale_attn_by_inverse_layer_idx=False, 50 | layer_norm_type="rms", 51 | mlp_type="llama", 52 | apply_rotary=True, 53 | rotary_base=hf_config.rope_theta, 54 | rotary_interleaved=False, 55 | tied_embedding=hf_config.tie_word_embeddings, 56 | ) 57 | 58 | 59 | def convert_config_back_qwen2( 60 | config: ReaLModelConfig, 61 | ) -> transformers.Qwen2Config: 62 | return transformers.Qwen2Config( 63 | vocab_size=config.vocab_size, 64 | hidden_size=config.hidden_dim, 65 | intermediate_size=config.intermediate_dim, 66 | num_hidden_layers=config.n_layers, 67 | num_key_value_heads=config.n_kv_heads, 68 | num_attention_heads=config.n_q_heads, 69 | max_position_embeddings=config.n_positions, 70 | rms_norm_eps=config.layer_norm_epsilon, 71 | hidden_act=config.activation_function, 72 | attention_dropout=config.attn_pdrop, 73 | rope_theta=config.rotary_base, 74 | tie_word_embeddings=config.tied_embedding, 75 | architectures=["Qwen2ForCausalLM"], 76 | ) 77 | 78 | 79 | def qwen2_config_maker(): 80 | hf_config = transformers.Qwen2Config( 81 | vocab_size=TESTING_MODEL_VOCAB_SIZE, 82 | max_position_embeddings=TESTING_MODEL_N_POSITIONS, 83 | hidden_size=TESTING_MODEL_HIDDEN_SIZE, 84 | intermediate_size=TESTING_MODEL_INTERMEDIATE_SIZE, 85 | num_hidden_layers=TESTING_MODEL_N_LAYERS, 86 | num_attention_heads=TESTING_MODEL_N_HEADS, 87 | num_key_value_heads=8, 88 | hidden_act="silu", 89 | rms_norm_eps=1e-5, 90 | ) 91 | return convert_config_qwen2(hf_config) 92 | 93 | 94 | register_hf_family( 95 | name="qwen2", 96 | hf_cls_name="Qwen2ForCausalLM", 97 | config_from_hf_converter=convert_config_qwen2, 98 | config_to_hf_converter=convert_config_back_qwen2, 99 | sd_from_hf_converter=convert_state_dict_llama, 100 | sd_to_hf_converter=to_llama_state_dict, 101 | embedding_param_names=llama_embedding_layer_names, 102 | tblock_param_names=llama_transformer_block_param_name, 103 | head_param_names=llama_output_head_param_name, 104 | real_config_maker=qwen2_config_maker, 105 | ) 106 | -------------------------------------------------------------------------------- /realhf/api/quickstart/__init__.py: -------------------------------------------------------------------------------- 1 | # NOTE: required by hydra 2 | -------------------------------------------------------------------------------- /realhf/api/quickstart/dataset.py: -------------------------------------------------------------------------------- 1 | import dataclasses 2 | 3 | 4 | @dataclasses.dataclass 5 | class PromptAnswerDatasetConfig: 6 | """Configuration for datasets used in Supervised Fine-Tuning (SFT). 7 | 8 | The raw data must be in a JSON or JSONL file format, where each entry is a dictionary 9 | with the keys `prompt` and `answer`. Both `prompt` and `answer` must be strings. 10 | 11 | :param train_path: Path to the training dataset. 12 | :type train_path: str 13 | :param valid_path: Path to the validation dataset. 14 | :type valid_path: str 15 | :param max_seqlen: Maximum sequence length (prompt + answer). Sequences longer than 16 | this will be truncated. 17 | :type max_seqlen: int 18 | :param train_bs_n_seqs: Number of sequences in each batch during training. 19 | :type train_bs_n_seqs: int 20 | :param valid_bs_n_seqs: Number of sequences in each batch during validation. 21 | :type valid_bs_n_seqs: int 22 | :param pad_to_max_length: Whether to pad sequences to the maximum length. 
If True, 23 | all mini-batches created by the DP balanced partitioning algorithm will have 24 | the same number of tokens, making MFC time predictable. This option is used 25 | only for benchmarking purposes. 26 | :type pad_to_max_length: bool 27 | """ 28 | 29 | train_path: str = "" 30 | valid_path: str = "" 31 | max_seqlen: int = 1024 32 | train_bs_n_seqs: int = 256 33 | valid_bs_n_seqs: int = 256 34 | pad_to_max_length: bool = False 35 | 36 | 37 | @dataclasses.dataclass 38 | class PairedComparisonDatasetConfig: 39 | """Configuration for datasets used in paired-comparison reward modeling, 40 | DPO, and SimPO. 41 | 42 | The raw data must be in a JSON or JSONL file format, where each entry is a dictionary 43 | with the keys `prompt`, `pos_answers`, and `neg_answers`. `prompt` is a string, while 44 | `pos_answers` and `neg_answers` are lists of strings. The lists must have the same length. 45 | 46 | The raw dataset may contain multiple answer pairs for each prompt. In each epoch, we will 47 | randomly sample `max_pairs_per_prompt` answer pairs for each prompt, so the maximum batch 48 | size (in terms of the number of sequences) per step is `train_bs_n_seqs` multiplied by 49 | `max_pairs_per_prompt`. 50 | 51 | :param train_path: Path to the training dataset. 52 | :type train_path: str 53 | :param valid_path: Path to the evaluation dataset. 54 | :type valid_path: str 55 | :param max_pairs_per_prompt: Maximum number of answer pairs per prompt. 56 | :type max_pairs_per_prompt: int 57 | :param max_seqlen: Maximum sequence length (prompt + answers). Sequences longer than 58 | this will be truncated. 59 | :type max_seqlen: int 60 | :param train_bs_n_seqs: Number of sequences in each batch during training. 61 | :type train_bs_n_seqs: int 62 | :param valid_bs_n_seqs: Number of sequences in each batch during validation. 63 | :type valid_bs_n_seqs: int 64 | """ 65 | 66 | train_path: str = "" 67 | valid_path: str = "" 68 | max_pairs_per_prompt: int = 2 69 | max_seqlen: int = 1024 70 | train_bs_n_seqs: int = 256 71 | valid_bs_n_seqs: int = 256 72 | 73 | 74 | @dataclasses.dataclass 75 | class PromptOnlyDatasetConfig: 76 | """Configuration for datasets used in PPO RLHF. 77 | 78 | The raw data must be in a JSON or JSONL file format, where each entry is a dictionary 79 | with a single key called `prompt`, which is a string. 80 | 81 | :param path: Path to the dataset. 82 | :type path: str 83 | :param max_prompt_len: Maximum length of the prompt. Prompts longer than this will 84 | be truncated. 85 | :type max_prompt_len: int 86 | :param train_bs_n_seqs: Number of prompts in each batch. 87 | :type train_bs_n_seqs: int 88 | :param pad_to_max_length: Whether to pad prompts to the maximum length. If True, 89 | all mini-batches created by the DP balanced partitioning algorithm will have 90 | the same number of tokens, making MFC time predictable. This option is used 91 | only for benchmarking purposes. 
92 | :type pad_to_max_length: bool 93 | """ 94 | 95 | path: str = "" 96 | max_prompt_len: int = 256 97 | train_bs_n_seqs: int = 256 98 | pad_to_max_length: bool = False 99 | -------------------------------------------------------------------------------- /realhf/api/quickstart/entrypoint.py: -------------------------------------------------------------------------------- 1 | import dataclasses 2 | import datetime 3 | import functools 4 | import inspect 5 | import json 6 | import os 7 | import pickle 8 | import subprocess 9 | from typing import Callable, Optional 10 | 11 | import hydra 12 | import omegaconf 13 | from hydra.core.config_store import ConfigStore 14 | from omegaconf import MISSING, OmegaConf 15 | 16 | import realhf.api.core.system_api as system_api 17 | from realhf.base.constants import LOG_ROOT, MODEL_SAVE_ROOT, QUICKSTART_EXPR_CACHE_PATH 18 | from realhf.base.ray_utils import check_ray_availability 19 | from realhf.base.slurm_utils import check_slurm_availability 20 | 21 | 22 | def kind_reminder(config_name, logger, args): 23 | logger.info(f"Running {config_name} experiment.") 24 | logger.info( 25 | f"Logs will be dumped to {os.path.join(LOG_ROOT, args.experiment_name, args.trial_name)}" 26 | ) 27 | logger.info( 28 | f"Model checkpoints will be saved to {os.path.join(MODEL_SAVE_ROOT, args.experiment_name, args.trial_name)}" 29 | ) 30 | 31 | if args.mode == "slurm": 32 | slurm_available = check_slurm_availability() 33 | if slurm_available: 34 | logger.info("Launching experiments with SLURM...") 35 | else: 36 | logger.warning("Slurm is not available. Using local mode.") 37 | args.mode = "local" 38 | elif args.mode == "ray": 39 | ray_available = check_ray_availability() 40 | if ray_available: 41 | logger.info("Launching experiments with RAY...") 42 | else: 43 | logger.warning("Ray is not available. 
Using local mode.") 44 | args.mode = "local" 45 | elif args.mode == "local": 46 | logger.info("Launching experiments locally.") 47 | else: 48 | raise ValueError(f"Invalid mode {args.mode}") 49 | 50 | 51 | cs = ConfigStore.instance() 52 | QUICKSTART_CONFIG_CLASSES = {} 53 | QUICKSTART_USERCODE_PATHS = {} 54 | QUICKSTART_FN = {} 55 | 56 | 57 | def register_quickstart_exp(config_name: str, exp_cls: Callable): 58 | usercode_path = os.path.abspath(inspect.getfile(inspect.currentframe().f_back)) 59 | 60 | @hydra.main(version_base=None, config_name=config_name) 61 | def run(args): 62 | # NOTE: we import logging here to avoid hydra logging overwrite 63 | import realhf.base.logging as logging 64 | 65 | logger = logging.getLogger("quickstart", "colored") 66 | 67 | exp_name = args.experiment_name 68 | if args.trial_name == MISSING: 69 | args.trial_name = trial_name = ( 70 | f"run{datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}" 71 | ) 72 | else: 73 | trial_name = args.trial_name 74 | from realhf.apps.main import main_start, main_stop 75 | 76 | kind_reminder(config_name, logger, args) 77 | 78 | exp_fn = functools.partial(exp_cls, **args) 79 | 80 | os.makedirs(os.path.dirname(QUICKSTART_EXPR_CACHE_PATH), exist_ok=True) 81 | cache_file = os.path.join( 82 | QUICKSTART_EXPR_CACHE_PATH, f"{exp_name}_{trial_name}.json" 83 | ) 84 | with open(cache_file, "w") as f: 85 | dict_args = OmegaConf.to_container(args) 86 | json.dump( 87 | dict( 88 | args=dict_args, 89 | usercode_path=usercode_path, 90 | config_name=config_name, 91 | ), 92 | f, 93 | indent=4, 94 | ensure_ascii=False, 95 | ) 96 | 97 | system_api.register_experiment(exp_name, exp_fn) 98 | 99 | try: 100 | main_start(args) 101 | except Exception as e: 102 | main_stop(args) 103 | logger.warning("Exception occurred. 
Stopping all workers.") 104 | raise e 105 | 106 | cs.store(name=config_name, node=exp_cls) 107 | 108 | # assert config_name not in QUICKSTART_CONFIG_CLASSES 109 | QUICKSTART_CONFIG_CLASSES[config_name] = exp_cls 110 | # assert config_name not in QUICKSTART_USERCODE_PATHS 111 | QUICKSTART_USERCODE_PATHS[config_name] = usercode_path 112 | # assert config_name not in QUICKSTART_FN 113 | QUICKSTART_FN[config_name] = run 114 | return run 115 | -------------------------------------------------------------------------------- /realhf/api/quickstart/search.py: -------------------------------------------------------------------------------- 1 | import dataclasses 2 | from typing import List, Optional 3 | 4 | from realhf.api.core.dfg import MFCDef 5 | from realhf.api.quickstart.device_mesh import DeviceMesh 6 | from realhf.api.quickstart.model import ParallelismConfig 7 | 8 | 9 | @dataclasses.dataclass 10 | class RPCExecution: 11 | rpc: MFCDef 12 | device_mesh: DeviceMesh 13 | parallel_strategy: ParallelismConfig 14 | time_cost: Optional[int] = None 15 | mem: Optional[int] = None 16 | static_mem: Optional[int] = None 17 | 18 | def __repr__(self): 19 | return f"RPCExecution({self.rpc}, {self.device_mesh}, {self.parallel_strategy})" 20 | 21 | def __hash__(self): 22 | return hash( 23 | ( 24 | self.rpc.name, 25 | self.device_mesh.cluster_mesh, 26 | self.device_mesh.device_mesh_name, 27 | str(self.parallel_strategy), 28 | ) 29 | ) 30 | 31 | 32 | @dataclasses.dataclass 33 | class RPCInstance: 34 | rpc: MFCDef 35 | iteration_id: int 36 | parents: List[MFCDef] 37 | children: List[MFCDef] 38 | 39 | @property 40 | def name(self): 41 | return f"{self.rpc.name}:{self.iteration_id}" 42 | 43 | def __repr__(self): 44 | if len(self.parents) == 0 and len(self.children) == 0: 45 | return f"RPCInstance({self.rpc.name}, {self.iteration_id})" 46 | else: 47 | return ( 48 | f"RPCInstance({self.rpc.name}, {self.iteration_id}, " 49 | f"{self.parents}, {self.children})" 50 | ) 51 | 52 | def __hash__(self): 53 | return hash((self.rpc.name, self.iteration_id)) 54 | -------------------------------------------------------------------------------- /realhf/apps/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openpsi-project/ReaLHF/be75fce9931acb9298270fdda08fdca46b6ee8ba/realhf/apps/__init__.py -------------------------------------------------------------------------------- /realhf/apps/profile_layers.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import itertools 3 | import time 4 | 5 | import realhf.base.testing as testing 6 | 7 | BATCH_SIZE_RANGE = [1, 2, 4, 8, 16, 32, 64, 128] 8 | SEQ_LEN_RANGE = [128, 256, 512] 9 | 10 | 11 | def profile_layer_func( 12 | world_size, 13 | model_path, 14 | model_name, 15 | warm_up_rounds, 16 | profile_rounds, 17 | batch_size_range, 18 | seq_len_range, 19 | use_sequence_parallel=False, 20 | use_gradient_checkpointing=False, 21 | ): 22 | # FIXME: use_sequence_parallel=True and use_gradient_checkpointing=True will cause bugs 23 | import torch 24 | 25 | import realhf.base.constants as constants 26 | 27 | testing.init_global_constants( 28 | 1, world_size, 1, sequence_parallel=False, gradient_checkpointing=False 29 | ) 30 | device = torch.device("cuda") 31 | with constants.model_scope(testing.MODEL_NAME): 32 | from realhf.search_engine.layers import make_profile_layers 33 | 34 | profile_layers = make_profile_layers(device, model_path, model_name) 35 | 36 | st = 
time.monotonic_ns() 37 | for i in range(warm_up_rounds + profile_rounds): 38 | for bs, seq_len in itertools.product(batch_size_range, seq_len_range): 39 | profile_layers.fwd_gen(bs, seq_len) 40 | profile_layers.fwd_bwd_opt(bs, seq_len) 41 | 42 | if i < warm_up_rounds: 43 | profile_layers.reset_stats() 44 | profile_layers.make_dataframe_and_print() 45 | profile_layers.dump_stats(world_size) 46 | t = (time.monotonic_ns() - st) / int(1e9) 47 | print(f"profile world size {world_size} cost {t:4f} seconds") 48 | 49 | 50 | if __name__ == "__main__": 51 | st = time.monotonic_ns() 52 | parser = argparse.ArgumentParser(prog="profile_layers") 53 | parser.add_argument( 54 | "--model_path", 55 | type=str, 56 | required=True, 57 | ) 58 | parser.add_argument("--expr_name", type=str, default="profile") 59 | parser.add_argument("--trial_name", type=str, default="profile") 60 | parser.add_argument("--model_name", type=str, default="Llama-2-70b") 61 | parser.add_argument("--warm_up_rounds", type=int, default=1) 62 | parser.add_argument("--profile_rounds", type=int, default=3) 63 | # parser.add_argument("--use_sequence_parallel", action="store_true") 64 | # parser.add_argument("--use_gradient_checkpointing", action="store_true") 65 | args = parser.parse_args() 66 | 67 | world_sizes = [1, 2, 4, 8] 68 | 69 | for world_size in world_sizes: 70 | testing.clear_name_resolve(args.expr_name, args.trial_name) 71 | mp = testing.LocalMultiProcessTest( 72 | world_size, 73 | profile_layer_func, 74 | world_size, 75 | args.model_path, 76 | args.model_name, 77 | args.warm_up_rounds, 78 | args.profile_rounds, 79 | BATCH_SIZE_RANGE, 80 | SEQ_LEN_RANGE, 81 | expr_name=args.expr_name, 82 | trial_name=args.trial_name, 83 | ) 84 | mp.launch() 85 | 86 | t = (time.monotonic_ns() - st) / int(1e9) 87 | print(f"profile model {args.model_name} time cost {t:4f} seconds") 88 | -------------------------------------------------------------------------------- /realhf/apps/quickstart.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | import getpass 4 | import pathlib 5 | import re 6 | import sys 7 | 8 | import hydra 9 | 10 | from realhf.api.quickstart.entrypoint import QUICKSTART_FN 11 | from realhf.base.cluster import spec as cluster_spec 12 | from realhf.base.importing import import_module 13 | 14 | # NOTE: Register all implemented experiments inside ReaL. 15 | import_module( 16 | str(pathlib.Path(__file__).resolve().parent.parent / "experiments" / "common"), 17 | re.compile(r".*_exp\.py$"), 18 | ) 19 | import realhf.experiments.benchmark.profile_exp 20 | 21 | 22 | def main(): 23 | parser = argparse.ArgumentParser(prog="ReaL Quickstart") 24 | subparsers = parser.add_subparsers(dest="cmd", help="sub-command help") 25 | subparsers.required = True 26 | for k, v in QUICKSTART_FN.items(): 27 | subparser = subparsers.add_parser(k) 28 | subparser.set_defaults(func=v) 29 | args = parser.parse_known_args()[0] 30 | 31 | launch_hydra_task(args.cmd, QUICKSTART_FN[args.cmd]) 32 | 33 | 34 | def launch_hydra_task(name: str, func: hydra.TaskFunction): 35 | # Disable hydra logging. 
36 | if not any("hydra/job_logging=disabled" in x for x in sys.argv): 37 | sys.argv += ["hydra/job_logging=disabled"] 38 | 39 | if any("experiment_name=" in x for x in sys.argv): 40 | experiment_name = next(x for x in sys.argv if "experiment_name=" in x).split( 41 | "=" 42 | )[1] 43 | if "_" in experiment_name: 44 | raise RuntimeError("experiment_name should not contain `_`.") 45 | else: 46 | experiment_name = f"quickstart-{name}" 47 | print(f"Experiment name not manually set. Default to {experiment_name}.") 48 | sys.argv += [f"experiment_name={experiment_name}"] 49 | 50 | if ( 51 | "--multirun" in sys.argv 52 | or "hydra.mode=MULTIRUN" in sys.argv 53 | or "-m" in sys.argv 54 | ): 55 | raise NotImplementedError("Hydra multi-run is not supported.") 56 | # non-multirun mode, add trial_name and hydra run dir 57 | if any("trial_name=" in x for x in sys.argv): 58 | trial_name = next(x for x in sys.argv if "trial_name=" in x).split("=")[1] 59 | else: 60 | trial_name = f"run{datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}" 61 | sys.argv += [f"trial_name={trial_name}"] 62 | if "_" in trial_name: 63 | raise RuntimeError("trial_name should not contain `_`.") 64 | sys.argv += [ 65 | f"hydra.run.dir={cluster_spec.fileroot}/logs/{getpass.getuser()}/" 66 | f"{experiment_name}/{trial_name}/hydra-outputs/" 67 | ] 68 | 69 | sys.argv.pop(1) 70 | 71 | func() 72 | 73 | 74 | if __name__ == "__main__": 75 | main() 76 | -------------------------------------------------------------------------------- /realhf/base/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openpsi-project/ReaLHF/be75fce9931acb9298270fdda08fdca46b6ee8ba/realhf/base/__init__.py -------------------------------------------------------------------------------- /realhf/base/asyncio_utils.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import dataclasses 3 | import sys 4 | import threading 5 | from asyncio.base_events import _run_until_complete_cb 6 | 7 | 8 | @dataclasses.dataclass 9 | class AsyncRunUntilCompleteContext: 10 | loop: asyncio.BaseEventLoop 11 | future: asyncio.Future 12 | new_task: bool 13 | 14 | 15 | def setup_run_until_complete( 16 | loop: asyncio.BaseEventLoop, 17 | future: asyncio.Future, 18 | ) -> AsyncRunUntilCompleteContext: 19 | loop._check_closed() 20 | loop._check_running() 21 | 22 | new_task = not asyncio.futures.isfuture(future) 23 | future = asyncio.tasks.ensure_future(future, loop=loop) 24 | if new_task: 25 | # An exception is raised if the future didn't complete, so there 26 | # is no need to log the "destroy pending task" message 27 | future._log_destroy_pending = False 28 | 29 | future.add_done_callback(_run_until_complete_cb) 30 | 31 | # set up run forever 32 | loop._set_coroutine_origin_tracking(loop._debug) 33 | 34 | loop._old_agen_hooks = sys.get_asyncgen_hooks() 35 | loop._thread_id = threading.get_ident() 36 | sys.set_asyncgen_hooks( 37 | firstiter=loop._asyncgen_firstiter_hook, 38 | finalizer=loop._asyncgen_finalizer_hook, 39 | ) 40 | asyncio.events._set_running_loop(loop) 41 | return AsyncRunUntilCompleteContext(loop=loop, future=future, new_task=new_task) 42 | 43 | 44 | def teardown_run_util_complete(ctx: AsyncRunUntilCompleteContext): 45 | ctx.loop._stopping = False 46 | ctx.loop._thread_id = None 47 | asyncio.events._set_running_loop(None) 48 | ctx.loop._set_coroutine_origin_tracking(False) 49 | # Restore any pre-existing async generator hooks. 
50 | if ctx.loop._old_agen_hooks is not None: 51 | sys.set_asyncgen_hooks(*ctx.loop._old_agen_hooks) 52 | ctx.loop._old_agen_hooks = None 53 | 54 | ctx.future.remove_done_callback(_run_until_complete_cb) 55 | 56 | if not ctx.future.done(): 57 | raise RuntimeError("Event loop stopped before Future completed.") 58 | 59 | 60 | def raise_asyncio_exception( 61 | ctx: AsyncRunUntilCompleteContext, raise_error: bool = True 62 | ): 63 | if ctx.new_task and ctx.future.done() and not ctx.future.cancelled(): 64 | # The coroutine raised a BaseException. Consume the exception 65 | # to not log a warning, the caller doesn't have access to the 66 | # local task. 67 | ctx.future.exception() 68 | 69 | try: 70 | teardown_run_util_complete(ctx) 71 | except RuntimeError as e: 72 | if raise_error: 73 | raise e 74 | 75 | if raise_error: 76 | raise 77 | -------------------------------------------------------------------------------- /realhf/base/cluster.py: -------------------------------------------------------------------------------- 1 | import getpass 2 | import json 3 | import os 4 | import re 5 | from typing import Dict, List, Optional, Union 6 | 7 | CLUSTER_SPEC_PATH = os.environ.get("CLUSTER_SPEC_PATH", "") 8 | 9 | 10 | def get_user_tmp(): 11 | user = getpass.getuser() 12 | user_tmp = os.path.join("/home", user, ".cache", "realhf") 13 | os.makedirs(user_tmp, exist_ok=True) 14 | return user_tmp 15 | 16 | 17 | class ClusterSpec: 18 | def __init__(self): 19 | self.__loaded = False 20 | 21 | def load_spec_from_file(self, file_path: str): 22 | try: 23 | with open(file_path, "r") as f: 24 | spec: Dict = json.load(f) 25 | except FileNotFoundError: 26 | if file_path == "": 27 | spec = dict( 28 | cluster_type="local", 29 | cluster_name="local", 30 | fileroot=get_user_tmp(), 31 | ) 32 | else: 33 | raise FileNotFoundError(f"Cluster spec file not found: {file_path}") 34 | 35 | self.__cluster_type = spec["cluster_type"] 36 | self.__cluster_name = spec["cluster_name"] 37 | self.__fileroot = spec["fileroot"] 38 | self.__node_type_from_node_name_re = spec.get("node_type_from_node_name", None) 39 | self.__gpu_type_from_node_name_re = spec.get("gpu_type_from_node_name", None) 40 | self.__default_mount = spec.get("default_mount", None) 41 | self.__gpu_image = spec.get("gpu_image", None) 42 | self.__cpu_image = spec.get("cpu_image", None) 43 | self.__node_name_prefix = spec.get("node_name_prefix", "NODE") 44 | 45 | self.__loaded = True 46 | 47 | @property 48 | def name(self): 49 | assert self.__loaded 50 | return self.__cluster_name 51 | 52 | def node_type_from_node_name(self, node_name: str) -> str: 53 | """Mapping nodename to slurm node type, including "g1", "g2", "g8", 54 | "a100".""" 55 | if self.__cluster_type != "slurm": 56 | raise NotImplementedError( 57 | "Only slurm cluster uses node_type_from_node_name." 58 | ) 59 | assert self.__loaded 60 | for regex, node_type in self.__node_type_from_node_name_re.items(): 61 | if re.match(regex, node_name): 62 | return node_type 63 | raise NotImplementedError() 64 | 65 | def gpu_type_from_node_name(self, node_name: str) -> str: 66 | """Mapping nodename to slurm GPU type, including "geforce" and 67 | "tesla".""" 68 | if self.__cluster_type != "slurm": 69 | raise NotImplementedError( 70 | "Only slurm cluster uses gpu_type_from_node_name." 
71 | ) 72 | assert self.__loaded 73 | for regex, gpu_type in self.__gpu_type_from_node_name_re.items(): 74 | if re.match(regex, node_name): 75 | return gpu_type 76 | raise NotImplementedError() 77 | 78 | @property 79 | def fileroot(self) -> str: 80 | """Return the root directory of the file system in the cluster. 81 | 82 | When running experiments, files such as logs, checkpoints, 83 | caches will be saved under this directory. 84 | """ 85 | assert self.__loaded 86 | return self.__fileroot 87 | 88 | @property 89 | def default_mount(self) -> str: 90 | """Directories that should be mounted to container that runs 91 | workers.""" 92 | assert self.__loaded 93 | return self.__default_mount 94 | 95 | @property 96 | def gpu_image(self) -> str: 97 | """Return the default image for containers of GPU workers.""" 98 | assert self.__loaded 99 | return self.__gpu_image 100 | 101 | @property 102 | def cpu_image(self) -> str: 103 | """Return the default image for containers of CPU workers.""" 104 | assert self.__loaded 105 | return self.__cpu_image 106 | 107 | @property 108 | def node_name_prefix(self) -> str: 109 | """Return the prefix of node names in slurm format.""" 110 | assert self.__loaded 111 | return self.__node_name_prefix 112 | 113 | 114 | def node_name_is_node_type( 115 | node_name: str, node_type: Optional[Union[List[str], str]] = None 116 | ) -> bool: 117 | assert spec is not None 118 | if node_type is None: 119 | return True 120 | if not isinstance(node_type, list): 121 | node_type = [node_type] 122 | nt_condition = [] 123 | for nt in node_type: 124 | if nt not in ["g1", "g2", "g8", "a100"]: 125 | raise ValueError(f"Unknown node type {nt}.") 126 | else: 127 | cond = spec.node_type_from_node_name(node_name) == nt 128 | nt_condition.append(cond) 129 | return any(nt_condition) 130 | 131 | 132 | spec = ClusterSpec() 133 | spec.load_spec_from_file(CLUSTER_SPEC_PATH) 134 | -------------------------------------------------------------------------------- /realhf/base/importing.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import importlib.util 3 | import os 4 | import re 5 | import sys 6 | from pathlib import Path 7 | 8 | from .logging import getLogger 9 | 10 | logger = getLogger("importing") 11 | 12 | 13 | def import_module(path: str, pattern: re.Pattern): 14 | dirname = Path(path) 15 | for x in os.listdir(dirname.absolute()): 16 | if not pattern.match(x): 17 | continue 18 | module_path = os.path.splitext(os.path.join(dirname, x))[0] 19 | assert "realhf" in module_path 20 | start_idx = path.rindex("realhf") 21 | module_path = module_path[start_idx:] 22 | module_path = "realhf." 
+ module_path.replace(os.sep, ".").replace( 23 | "realhf.", "" 24 | ) 25 | # logger.info(f"Automatically importing module {module_path}.") 26 | importlib.import_module(module_path) 27 | 28 | 29 | def import_usercode(module_path: str, module_name: str): 30 | # Create a module spec 31 | spec = importlib.util.spec_from_file_location(module_name, module_path) 32 | # Create a module object 33 | module = importlib.util.module_from_spec(spec) 34 | # Add the module to sys.modules 35 | sys.modules[module_name] = module 36 | # Execute the module in its own namespace 37 | spec.loader.exec_module(module) 38 | -------------------------------------------------------------------------------- /realhf/base/names.py: -------------------------------------------------------------------------------- 1 | # This file standardizes the name-resolve names used by different components of the system. 2 | import getpass 3 | 4 | USER_NAMESPACE = getpass.getuser() 5 | 6 | 7 | def registry_root(user): 8 | return f"trial_registry/{user}" 9 | 10 | 11 | def trial_registry(experiment_name, trial_name): 12 | return f"trial_registry/{USER_NAMESPACE}/{experiment_name}/{trial_name}" 13 | 14 | 15 | def trial_root(experiment_name, trial_name): 16 | return f"{USER_NAMESPACE}/{experiment_name}/{trial_name}" 17 | 18 | 19 | def worker_status(experiment_name, trial_name, worker_name): 20 | return f"{USER_NAMESPACE}/{experiment_name}/{trial_name}/status/{worker_name}" 21 | 22 | 23 | def worker_root(experiment_name, trial_name): 24 | return f"{USER_NAMESPACE}/{experiment_name}/{trial_name}/worker/" 25 | 26 | 27 | def worker(experiment_name, trial_name, worker_name): 28 | return f"{USER_NAMESPACE}/{experiment_name}/{trial_name}/worker/{worker_name}" 29 | 30 | 31 | def worker_key(experiment_name, trial_name, key): 32 | return f"{USER_NAMESPACE}/{experiment_name}/{trial_name}/worker_key/{key}" 33 | 34 | 35 | def request_reply_stream(experiment_name, trial_name, stream_name): 36 | return f"{USER_NAMESPACE}/{experiment_name}/{trial_name}/request_reply_stream/{stream_name}" 37 | 38 | 39 | def request_reply_stream_root(experiment_name, trial_name): 40 | return f"{USER_NAMESPACE}/{experiment_name}/{trial_name}/request_reply_stream/" 41 | 42 | 43 | def distributed_root(experiment_name, trial_name): 44 | return f"{USER_NAMESPACE}/{experiment_name}/{trial_name}/distributed/" 45 | 46 | 47 | def distributed_peer(experiment_name, trial_name, model_name): 48 | return ( 49 | f"{USER_NAMESPACE}/{experiment_name}/{trial_name}/distributed/peer/{model_name}" 50 | ) 51 | 52 | 53 | def distributed_local_peer(experiment_name, trial_name, host_name, model_name): 54 | return f"{USER_NAMESPACE}/{experiment_name}/{trial_name}/distributed/local_peer/{host_name}/{model_name}" 55 | 56 | 57 | def distributed_master(experiment_name, trial_name, model_name): 58 | return f"{USER_NAMESPACE}/{experiment_name}/{trial_name}/distributed/master/{model_name}" 59 | -------------------------------------------------------------------------------- /realhf/base/network.py: -------------------------------------------------------------------------------- 1 | import socket 2 | from contextlib import closing 3 | 4 | 5 | def find_free_port(): 6 | """From, stackoverflow Issue 1365265.""" 7 | with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s: 8 | s.bind(("", 0)) 9 | s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) 10 | return s.getsockname()[1] 11 | 12 | 13 | def gethostname(): 14 | return socket.gethostname() 15 | 16 | 17 | def gethostip(): 18 | return 
socket.gethostbyname(socket.gethostname()) 19 | -------------------------------------------------------------------------------- /realhf/base/numpy_utils.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List, Tuple 2 | 3 | import numpy as np 4 | 5 | 6 | def shape_leq(shape1: Tuple, shape2: Tuple) -> bool: 7 | assert len(shape1) == len(shape2) 8 | return all(x1 <= x2 for x1, x2 in zip(shape1, shape2)) 9 | 10 | 11 | def shape_union(*shapes: List[Tuple]) -> Tuple: 12 | if len(shapes) == 1: 13 | return shapes[0] 14 | for s in shapes: 15 | assert len(s) == len(shapes[0]) 16 | return tuple(max(*dims) for dims in zip(*shapes)) 17 | 18 | 19 | def split_to_shapes(x: np.ndarray, shapes: Dict, axis: int = -1): 20 | """Split an array and reshape to desired shapes. 21 | 22 | Args: 23 | x (np.ndarray): The array to be splitted 24 | shapes (Dict): Dict of shapes (tuples) specifying how to split. 25 | axis (int): Split dimension. 26 | 27 | Returns: 28 | List: Splitted observations. 29 | """ 30 | axis = len(x.shape) + axis if axis < 0 else axis 31 | split_lengths = [np.prod(shape) for shape in shapes.values()] 32 | assert x.shape[axis] == sum(split_lengths) 33 | accum_split_lengths = [sum(split_lengths[:i]) for i in range(1, len(split_lengths))] 34 | splitted_x = np.split(x, accum_split_lengths, axis) 35 | return { 36 | k: x.reshape(*x.shape[:axis], *shape, *x.shape[axis + 1 :]) 37 | for x, (k, shape) in zip(splitted_x, shapes.items()) 38 | } 39 | -------------------------------------------------------------------------------- /realhf/base/ray_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | 4 | 5 | def check_ray_availability(): 6 | return ( 7 | int( 8 | subprocess.run( 9 | ["ray", "--help"], 10 | stdout=open(os.devnull, "wb"), 11 | stderr=open(os.devnull, "wb"), 12 | ).returncode 13 | ) 14 | == 0 15 | ) 16 | -------------------------------------------------------------------------------- /realhf/base/recover.py: -------------------------------------------------------------------------------- 1 | import dataclasses 2 | import os 3 | import pickle 4 | from typing import Optional, Set 5 | 6 | import realhf.base.constants as constants 7 | 8 | RECOVER_INFO_PATH = None 9 | 10 | 11 | @dataclasses.dataclass 12 | class StepInfo: 13 | epoch: int 14 | epoch_step: int 15 | global_step: int 16 | 17 | 18 | @dataclasses.dataclass 19 | class RecoverInfo: 20 | recover_start: StepInfo 21 | last_step_info: StepInfo 22 | hash_vals_to_ignore: Set[int] = dataclasses.field(default_factory=set) 23 | 24 | 25 | def dump_recover_info(recover_info: RecoverInfo): 26 | global RECOVER_INFO_PATH 27 | if RECOVER_INFO_PATH is None: 28 | RECOVER_INFO_PATH = os.path.join( 29 | constants.RECOVER_ROOT, 30 | constants.experiment_name(), 31 | constants.trial_name(), 32 | "recover_info.pkl", 33 | ) 34 | with open(RECOVER_INFO_PATH, "wb") as f: 35 | pickle.dump(recover_info, f) 36 | 37 | 38 | def load_recover_info() -> Optional[RecoverInfo]: 39 | global RECOVER_INFO_PATH 40 | if RECOVER_INFO_PATH is None: 41 | RECOVER_INFO_PATH = os.path.join( 42 | constants.RECOVER_ROOT, 43 | constants.experiment_name(), 44 | constants.trial_name(), 45 | "recover_info.pkl", 46 | ) 47 | try: 48 | with open(RECOVER_INFO_PATH, "rb") as f: 49 | return pickle.load(f) 50 | except FileNotFoundError: 51 | raise FileNotFoundError( 52 | f"Resume info not found at {RECOVER_INFO_PATH}. 
" 53 | f"This should not be a resumed experiment!" 54 | ) 55 | -------------------------------------------------------------------------------- /realhf/base/saveload_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | from typing import Dict 4 | 5 | import torch 6 | import tqdm 7 | from safetensors import safe_open 8 | 9 | from realhf.base import logging 10 | 11 | logger = logging.getLogger("SaveLoad") 12 | 13 | 14 | def split_state_dict_into_shards(state_dict: Dict, n_shards: int) -> Dict: 15 | if n_shards == 1: 16 | return [state_dict] 17 | 18 | keys = list(state_dict.keys()) 19 | if len(keys) < n_shards: 20 | raise ValueError(f"state_dict has {len(keys)} keys, but n_shards={n_shards}") 21 | 22 | shard_size = len(keys) // n_shards 23 | extra = len(keys) % n_shards 24 | shard_size_list = [shard_size for _ in range(n_shards)] 25 | shard_size_list[-1] = shard_size + extra 26 | start, shards = 0, [] 27 | for i, size in enumerate( 28 | tqdm.tqdm( 29 | shard_size_list, 30 | desc=f"Splitting state dict into {len(shard_size_list)} shards...", 31 | ) 32 | ): 33 | shard = {} 34 | for j in range(start, start + size): 35 | shard[keys[j]] = state_dict[keys[j]] 36 | start += size 37 | shards.append(shard) 38 | return shards 39 | 40 | 41 | HF_MODEL_CONFIG_FILES = [ 42 | "config.json", 43 | "generation_config.json", 44 | "tokenizer_config.json", 45 | "vocab.json", 46 | "merges.txt", 47 | "special_tokens_map.json", 48 | "tokenizer.json", 49 | ] 50 | 51 | 52 | def copy_hf_configs(src_model_dir, dst_model_dir): 53 | for file in HF_MODEL_CONFIG_FILES: 54 | try: 55 | shutil.copy( 56 | os.path.join(src_model_dir, file), 57 | os.path.join(dst_model_dir, file), 58 | ) 59 | logger.info(f"copied {file} from {src_model_dir} to {dst_model_dir}") 60 | except FileNotFoundError: 61 | logger.info(f"{file} not exist in {src_model_dir} skipping.") 62 | 63 | 64 | def load_safetensor(fn: str) -> Dict[str, torch.Tensor]: 65 | assert fn.endswith(".safetensors") 66 | state_dict = {} 67 | with safe_open(fn, framework="pt", device="cpu") as f: 68 | for key in f.keys(): 69 | state_dict[key] = f.get_tensor(key) 70 | return state_dict 71 | -------------------------------------------------------------------------------- /realhf/base/security.py: -------------------------------------------------------------------------------- 1 | def read_key(service, name="default"): 2 | with open(f"/data/marl/keys/{service}/{name}", "r") as f: 3 | return f.read().strip() 4 | -------------------------------------------------------------------------------- /realhf/base/seeding.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import numpy as np 4 | import torch 5 | import transformers 6 | 7 | 8 | def set_random_seed(seed): 9 | transformers.set_seed(seed) 10 | random.seed(seed) 11 | np.random.seed(seed) 12 | torch.manual_seed(seed) 13 | if torch.cuda.is_available(): 14 | torch.cuda.manual_seed_all(seed) 15 | -------------------------------------------------------------------------------- /realhf/base/slurm_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import subprocess 4 | from typing import List 5 | 6 | import numpy as np 7 | 8 | 9 | def parse_node_id(node_name: str, prefix: str) -> int: 10 | return int(node_name.split(prefix)[-1]) 11 | 12 | 13 | def parse_nodelist(nodelist: str, prefix: str) -> List[str]: 14 | if not 
nodelist.startswith(prefix): 15 | raise ValueError( 16 | f"Node list `{nodelist}` does not start with hostname prefix `{prefix}`." 17 | ) 18 | nodelist = nodelist.replace(prefix, "") 19 | if "[" not in nodelist: 20 | return [prefix + nodelist] 21 | else: 22 | nodelist = nodelist.strip("[]") 23 | node_ids = [] 24 | nodelist = nodelist.split(",") 25 | for node_repr in nodelist: 26 | if "-" not in node_repr: 27 | node_ids.append(int(node_repr)) 28 | else: 29 | start, end = map(int, node_repr.split("-")) 30 | node_ids += list(range(start, end + 1)) 31 | return [f"{prefix}{node_id:02d}" for node_id in node_ids] 32 | 33 | 34 | def nodelist_from_nodes(nodes: List[str], prefix: str) -> str: 35 | node_ids = sorted([parse_node_id(node, prefix) for node in nodes]) 36 | assert len(node_ids) > 0 37 | if len(node_ids) == 1: 38 | return f"{prefix}{node_ids[0]:02d}" 39 | else: 40 | node_reprs = [] 41 | start, end = node_ids[0], node_ids[0] 42 | for i in range(len(node_ids)): 43 | node_id = node_ids[i] 44 | next_node_id = node_ids[i + 1] if i + 1 < len(node_ids) else -1 45 | if node_id + 1 == next_node_id: 46 | end = next_node_id 47 | else: 48 | if start == end: 49 | node_reprs.append(f"{start:02d}") 50 | else: 51 | node_reprs.append(f"{start:02d}-{end:02d}") 52 | start = next_node_id 53 | end = next_node_id 54 | return f"{prefix}[{','.join(node_reprs)}]" 55 | 56 | 57 | def are_ones_contiguous(binary_array: np.ndarray): 58 | one_indices = np.where(binary_array == 1)[0] 59 | if len(one_indices) == 0: 60 | return False 61 | return np.all(np.diff(one_indices) == 1) 62 | 63 | 64 | def slurm_hostname_key(hostname): 65 | """Custom sorting key function to sort Slurm hostnames.""" 66 | # Extract node number from hostname 67 | match = re.match(r"(\D+)(\d+)", hostname) 68 | if match: 69 | prefix, number = match.groups() 70 | return (prefix, int(number)) 71 | else: 72 | return (hostname,) 73 | 74 | 75 | def check_slurm_availability(): 76 | 77 | slurm_available = ( 78 | int( 79 | subprocess.run( 80 | "squeue", 81 | shell=True, 82 | stdout=open(os.devnull, "wb"), 83 | stderr=open(os.devnull, "wb"), 84 | ).returncode 85 | ) 86 | == 0 87 | ) 88 | return slurm_available 89 | -------------------------------------------------------------------------------- /realhf/experiments/common/check.py: -------------------------------------------------------------------------------- 1 | import realhf.api.core.model_api as model_api 2 | 3 | 4 | def check_is_realhf_native_impl(_cls): 5 | return _cls.__module__.startswith("realhf") 6 | 7 | 8 | def check_is_realhf_native_model_interface(name): 9 | # NOTE: we should not use auto-importing here, 10 | # because the user may write customized interfaces under this folder. 
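# Importing these modules has the side effect of registering the built-in
# interface implementations, which is what lets the ALL_INTERFACE_CLASSES
# lookup below resolve them by name.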
11 | import realhf.impl.model.interface.dpo_interface 12 | import realhf.impl.model.interface.gen_interface 13 | import realhf.impl.model.interface.ppo_interface 14 | import realhf.impl.model.interface.rw_interface 15 | import realhf.impl.model.interface.sft_interface 16 | 17 | _cls = model_api.ALL_INTERFACE_CLASSES.get(name) 18 | return _cls and check_is_realhf_native_impl(_cls) 19 | -------------------------------------------------------------------------------- /realhf/experiments/common/rw_exp.py: -------------------------------------------------------------------------------- 1 | import dataclasses 2 | from typing import List, Optional 3 | 4 | from realhf.api.core.config import ( 5 | DataLoaderAbstraction, 6 | DatasetAbstraction, 7 | ModelInterfaceAbstraction, 8 | ModelInterfaceType, 9 | ModelName, 10 | ) 11 | from realhf.api.core.dfg import MFCDef 12 | from realhf.api.quickstart.dataset import PairedComparisonDatasetConfig 13 | from realhf.api.quickstart.device_mesh import MFCConfig 14 | from realhf.api.quickstart.entrypoint import register_quickstart_exp 15 | from realhf.api.quickstart.model import ModelTrainEvalConfig 16 | from realhf.experiments.common.common import CommonExperimentConfig 17 | 18 | 19 | @dataclasses.dataclass 20 | class RWConfig(CommonExperimentConfig): 21 | """Configuration for pairwise reward modeling experiments. 22 | 23 | This class is a subclass of :class:`CommonExperimentConfig`, 24 | so all CLI options from the base class are available. 25 | 26 | :param is_sft_lora: Whether LoRA was used for SFT. 27 | If LoRA was used, the saved SFT model should only contain LoRA parameters. 28 | Since LoRA is currently not supported for SFT, this option is not utilized at present. 29 | :type is_sft_lora: bool 30 | :param sft_lora_path: Path to the LoRA model for SFT. 31 | Since LoRA is currently not supported for SFT, this option is not utilized at present. 32 | :type sft_lora_path: str or None 33 | :param model: Configuration for model runtime. 34 | :type model: ModelTrainEvalConfig 35 | :param allocation: Configuration for device allocation and parallelism. 36 | :type allocation: MFCConfig 37 | :param dataset: Configuration for the dataset. 38 | :type dataset: PairedComparisonDatasetConfig 39 | """ 40 | 41 | is_sft_lora: bool = False 42 | sft_lora_path: Optional[str] = None 43 | model: ModelTrainEvalConfig = dataclasses.field( 44 | default_factory=ModelTrainEvalConfig 45 | ) 46 | allocation: MFCConfig = dataclasses.field(default_factory=MFCConfig) 47 | 48 | dataset: PairedComparisonDatasetConfig = dataclasses.field( 49 | default_factory=PairedComparisonDatasetConfig 50 | ) 51 | 52 | def __post_init__(self): 53 | assert ( 54 | not self.is_sft_lora and self.sft_lora_path is None 55 | ), "LoRA is not supported for now." 
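# The pairwise reward model is a critic-style model with a scalar output head,
# so its transformer backbone is initialized from the actor (SFT) checkpoint: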
56 | self.model.init_critic_from_actor = True 57 | 58 | @property 59 | def models(self): 60 | return { 61 | "default": self.model, 62 | } 63 | 64 | @property 65 | def rpcs(self): 66 | interface = ModelInterfaceAbstraction("paired_rw") 67 | rpc = MFCDef( 68 | name="rwTrain", 69 | n_mbs=self.allocation.n_mbs, 70 | model_name=ModelName("default", 0), 71 | interface_type=ModelInterfaceType.TRAIN_STEP, 72 | interface_impl=interface, 73 | model_type=self.model.type, 74 | model_path=self.model.path, 75 | input_keys=["packed_input_ids"], 76 | log_return_value=True, 77 | n_seqs=self.dataset.train_bs_n_seqs, 78 | ) 79 | return {"rwTrain": rpc} 80 | 81 | @property 82 | def allocations(self): 83 | return {"rwTrain": self.allocation} 84 | 85 | @property 86 | def datasets(self): 87 | return [ 88 | DatasetAbstraction( 89 | "rw_pair", 90 | args=dict( 91 | max_length=self.dataset.max_seqlen, 92 | max_pairs_per_prompt=self.dataset.max_pairs_per_prompt, 93 | dataset_path=self.dataset.train_path, 94 | ), 95 | ) 96 | ] 97 | 98 | @property 99 | def eval_datasets(self): 100 | return [ 101 | DatasetAbstraction( 102 | "rw_pair", 103 | args=dict( 104 | max_length=self.dataset.max_seqlen, 105 | max_pairs_per_prompt=self.dataset.max_pairs_per_prompt, 106 | dataset_path=self.dataset.valid_path, 107 | ), 108 | ) 109 | ] 110 | 111 | @property 112 | def eval_dataloader(self): 113 | return DataLoaderAbstraction( 114 | "packed_eval", args=dict(batch_size=self.dataset.valid_bs_n_seqs) 115 | ) 116 | 117 | @property 118 | def tokenizer_name_or_path(self): 119 | return self.model.path 120 | 121 | 122 | register_quickstart_exp("rw", RWConfig) 123 | -------------------------------------------------------------------------------- /realhf/experiments/common/sft_exp.py: -------------------------------------------------------------------------------- 1 | import dataclasses 2 | 3 | from realhf.api.core.config import ( 4 | DataLoaderAbstraction, 5 | DatasetAbstraction, 6 | ModelInterfaceAbstraction, 7 | ModelInterfaceType, 8 | ModelName, 9 | ) 10 | from realhf.api.core.dfg import MFCDef 11 | from realhf.api.quickstart.dataset import PromptAnswerDatasetConfig 12 | from realhf.api.quickstart.device_mesh import MFCConfig 13 | from realhf.api.quickstart.entrypoint import register_quickstart_exp 14 | from realhf.api.quickstart.model import ModelTrainEvalConfig 15 | from realhf.experiments.common.common import CommonExperimentConfig 16 | 17 | 18 | @dataclasses.dataclass 19 | class SFTConfig(CommonExperimentConfig): 20 | """Configuration for SFT experiments. 21 | 22 | This class is a subclass of :class:`CommonExperimentConfig`, 23 | so all CLI options from the base class are available. 24 | 25 | :param model: Configuration for model runtime. 26 | :type model: ModelTrainEvalConfig 27 | :param allocation: Configuration for device allocation and parallelism. 28 | :type allocation: MFCConfig 29 | :param dataset: Configuration for the dataset. 
30 | :type dataset: PromptAnswerDatasetConfig 31 | """ 32 | 33 | model: ModelTrainEvalConfig = dataclasses.field( 34 | default_factory=ModelTrainEvalConfig 35 | ) 36 | allocation: MFCConfig = dataclasses.field(default_factory=MFCConfig) 37 | dataset: PromptAnswerDatasetConfig = dataclasses.field( 38 | default_factory=PromptAnswerDatasetConfig 39 | ) 40 | 41 | @property 42 | def models(self): 43 | return { 44 | "default": self.model, 45 | } 46 | 47 | @property 48 | def rpcs(self): 49 | rpc = MFCDef( 50 | n_seqs=self.dataset.train_bs_n_seqs, 51 | name="trainDefault", 52 | n_mbs=self.allocation.n_mbs, 53 | interface_type=ModelInterfaceType.TRAIN_STEP, 54 | interface_impl=ModelInterfaceAbstraction("sft"), 55 | model_name="default", 56 | input_keys=["packed_input_ids", "prompt_mask"], 57 | log_return_value=True, 58 | model_type=self.model.type, 59 | model_path=self.model.path, 60 | ) 61 | return {"trainDefault": rpc} 62 | 63 | @property 64 | def allocations(self): 65 | return {"trainDefault": self.allocation} 66 | 67 | @property 68 | def datasets(self): 69 | return [ 70 | DatasetAbstraction( 71 | "prompt_answer", 72 | args=dict( 73 | max_length=self.dataset.max_seqlen, 74 | dataset_path=self.dataset.train_path, 75 | pad_to_max_length=self.dataset.pad_to_max_length, 76 | ), 77 | ) 78 | ] 79 | 80 | @property 81 | def eval_datasets(self): 82 | return [ 83 | DatasetAbstraction( 84 | "prompt_answer", 85 | args=dict( 86 | max_length=self.dataset.max_seqlen, 87 | dataset_path=self.dataset.valid_path, 88 | ), 89 | ) 90 | ] 91 | 92 | @property 93 | def eval_dataloader(self): 94 | return DataLoaderAbstraction( 95 | "packed_eval", args=dict(batch_size=self.dataset.valid_bs_n_seqs) 96 | ) 97 | 98 | @property 99 | def tokenizer_name_or_path(self): 100 | return self.model.path 101 | 102 | 103 | register_quickstart_exp("sft", SFTConfig) 104 | -------------------------------------------------------------------------------- /realhf/impl/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | 4 | from realhf.base.importing import import_module 5 | 6 | # Import all dataset implementations. 7 | _p = re.compile(r"^(?!.*__init__).*\.py$") 8 | _filepath = os.path.dirname(__file__) 9 | import_module(_filepath, _p) 10 | -------------------------------------------------------------------------------- /realhf/impl/dataset/prompt_answer_dataset.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import Callable, Dict, List, Optional 3 | 4 | import numpy as np 5 | import torch 6 | import torch.utils.data 7 | 8 | from realhf.api.core import data_api 9 | from realhf.base import logging 10 | 11 | logger = logging.getLogger("Prompt Answer Dataset") 12 | 13 | 14 | class PromptAnswerDataset(torch.utils.data.Dataset): 15 | 16 | def __init__( 17 | self, 18 | util: data_api.DatasetUtility, 19 | max_length: int, 20 | dataset_path: Optional[str] = None, 21 | dataset_builder: Optional[Callable[[], List[Dict]]] = None, 22 | pad_to_max_length: bool = False, 23 | ): 24 | """A dataset with prompts and corresponding answers. Usually used for 25 | SFT. 26 | 27 | Args: 28 | util (api.data.DatasetUtility): . 29 | max_length (Optional[int], optional): The maximum length of each sequence in the batch. 30 | dataset_path (Optional[str], optional): Path to the dataset json/jsonl file. 31 | The json/jsonl file should be a list of dictionary. 
Each element in the list should have 32 | a key "prompt" and a key "answer". Defaults to None. 33 | dataset_builder (Optional[Callable[[], List[Dict]]], optional): Alternative to dataset_path. 34 | A callable that returns a list of dictionary. Defaults to None. 35 | pad_to_max_length (bool): Whether to pad sequences to the maximum length. 36 | Used only for benchmarking. If True, all mini-batches created by the DP balanced partition 37 | algorithm will have the same number of tokens, making MFC time predictable. Defaults to False. 38 | """ 39 | self._util = util 40 | tokenizer = self.util.tokenizer 41 | 42 | data = data_api.load_shuffle_split_dataset(util, dataset_path, dataset_builder) 43 | 44 | seqs = [x["prompt"] + x["answer"] + tokenizer.eos_token for x in data] 45 | self.ids = [x["id"] for x in data] 46 | prompts = [x["prompt"] for x in data] 47 | 48 | self.tokens = tokenizer( 49 | seqs, 50 | truncation=True, 51 | max_length=max_length, 52 | return_length=True, 53 | return_attention_mask=False, 54 | padding="max_length" if pad_to_max_length else False, 55 | ) 56 | prompt_tokens = tokenizer( 57 | prompts, 58 | padding=False, 59 | truncation=True, 60 | return_length=True, 61 | max_length=max_length, 62 | return_attention_mask=False, 63 | ) 64 | 65 | prompt_lengths = prompt_tokens["length"] 66 | seq_lengths = self.tokens["length"] 67 | prompt_masks = [] 68 | for i in range(len(self)): 69 | prompt_len = prompt_lengths[i] 70 | seqlen = self.tokens["length"][i] 71 | # seq = self.tokens["input_ids"][i] 72 | # prompt = prompt_tokens["input_ids"][i] 73 | # assert seq[:prompt_len] == prompt, (seq, prompt, prompt_len, seqlen) 74 | assert seqlen >= prompt_len, (seqlen, prompt_len) 75 | prompt_mask = [1] * prompt_len + [0] * (seqlen - prompt_len) 76 | prompt_masks.append(prompt_mask) 77 | 78 | self.prompt_masks = prompt_masks 79 | 80 | logger.info( 81 | f"Loaded Prompt Answer Dataset with INFO: " 82 | f"#seqs={len(self)}, " 83 | f"truncation length={max_length}, " 84 | f"avg prompt length={np.mean(prompt_lengths):.1f}, " 85 | f"avg answer length={np.mean(seq_lengths) - np.mean(prompt_lengths):.1f}", 86 | ) 87 | 88 | @property 89 | def util(self): 90 | return self._util 91 | 92 | def __len__(self): 93 | return len(self.tokens["input_ids"]) 94 | 95 | def __getitem__(self, idx): 96 | d = { 97 | "packed_input_ids": torch.tensor( 98 | self.tokens["input_ids"][idx], dtype=torch.long 99 | ), 100 | "prompt_mask": torch.tensor(self.prompt_masks[idx], dtype=torch.bool), 101 | } 102 | assert len(d["packed_input_ids"]) == len(d["prompt_mask"]) 103 | seqlen = [len(d["packed_input_ids"])] 104 | x = data_api.SequenceSample.from_default( 105 | ids=[self.ids[idx]], 106 | seqlens=seqlen, 107 | data=d, 108 | ) 109 | return x 110 | 111 | 112 | data_api.register_dataset("prompt_answer", PromptAnswerDataset) 113 | -------------------------------------------------------------------------------- /realhf/impl/dataset/prompt_dataset.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | from typing import Callable, Dict, List, Optional 3 | 4 | import torch.utils.data 5 | 6 | from realhf.api.core import data_api 7 | from realhf.base import logging 8 | 9 | logger = logging.getLogger("Prompt Dataset") 10 | 11 | 12 | class PromptDataset(torch.utils.data.Dataset): 13 | 14 | def __init__( 15 | self, 16 | util: data_api.DatasetUtility, 17 | max_length: Optional[int] = None, 18 | dataset_path: Optional[str] = None, 19 | dataset_builder: Optional[Callable[[], List[Dict]]] = None, 20 
| pad_to_max_length: bool = False, 21 | ): 22 | """A dataset with prompts. Usually used for PPO. 23 | 24 | Args: 25 | util (api.data.DatasetUtility): . 26 | max_length (Optional[int], optional): The maximum length of each sequence in the batch. 27 | dataset_path (Optional[str], optional): Path to the dataset json/jsonl file. 28 | The json/jsonl file should be a list of dictionary. Each element in the list should have 29 | a key "prompt". Defaults to None. 30 | dataset_builder (Optional[Callable[[], List[Dict]]], optional): Alternative to dataset_path. 31 | A callable that returns a list of dictionary. Defaults to None. 32 | pad_to_max_length (bool): Whether to pad prompts to the maximum length. 33 | Used only for benchmarking. If True, all mini-batches created by the DP balanced partition 34 | algorithm will have the same number of tokens, making MFC time predictable. Defaults to False. 35 | """ 36 | self._util = util 37 | self.max_length = max_length 38 | 39 | data = data_api.load_shuffle_split_dataset(util, dataset_path, dataset_builder) 40 | 41 | prompts_str = [x["prompt"] for x in data] 42 | self.ids = [x["id"] for x in data] 43 | util.tokenizer.padding_side = "left" 44 | prompt_encodings = util.tokenizer( 45 | prompts_str, 46 | truncation=True, 47 | max_length=max_length, 48 | padding="max_length" if pad_to_max_length else False, 49 | return_length=True, 50 | return_attention_mask=False, 51 | ) 52 | 53 | self.prompt_lengths = prompt_encodings["length"] 54 | self.prompts = prompt_encodings["input_ids"] 55 | assert all(len(x) == l for x, l in zip(self.prompts, self.prompt_lengths)) 56 | 57 | logger.info(f"Number of prompts in the dataset: {len(self.prompts)}") 58 | 59 | @property 60 | def util(self): 61 | return self._util 62 | 63 | def __len__(self): 64 | return len(self.prompts) 65 | 66 | def __getitem__(self, idx): 67 | return data_api.SequenceSample.from_default( 68 | ids=[self.ids[idx]], 69 | seqlens=[self.prompt_lengths[idx]], 70 | data=dict(packed_prompts=torch.tensor(self.prompts[idx], dtype=torch.long)), 71 | metadata=dict(random_id=[uuid.uuid4()]), 72 | ) 73 | 74 | 75 | data_api.register_dataset("prompt", PromptDataset) 76 | -------------------------------------------------------------------------------- /realhf/impl/model/__init__.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import os 3 | import re 4 | 5 | import torch 6 | 7 | # Import all HuggingFace model implementations. 8 | import realhf.api.from_hf 9 | import realhf.base.logging as logging 10 | from realhf.api.core.model_api import HF_MODEL_FAMILY_REGISTRY 11 | from realhf.base.importing import import_module 12 | from realhf.impl.model.conversion.hf_registry import HFModelRegistry 13 | from realhf.impl.model.nn.real_llm_api import ReaLModel 14 | 15 | logger = logging.getLogger("model init") 16 | 17 | # Import all model implementations. 18 | _p = re.compile(r"^(?!.*__init__).*\.py$") 19 | _filepath = os.path.dirname(__file__) 20 | import_module(os.path.join(_filepath, "backend"), _p) 21 | import_module(os.path.join(_filepath, "interface"), _p) 22 | import_module(os.path.join(_filepath, "nn"), _p) 23 | 24 | # Set PyTorch JIT options, following Megatron-LM. 
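# (Keeping the profiling executor enabled while disabling the legacy CPU/GPU and
# TensorExpr fusers mirrors Megatron-LM's defaults; the nvFuser toggle below is
# left commented out because it triggers a deprecation warning on recent PyTorch.)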
25 | if torch.cuda.is_available(): 26 | torch._C._jit_set_profiling_executor(True) 27 | torch._C._jit_set_profiling_mode(True) 28 | torch._C._jit_override_can_fuse_on_cpu(False) 29 | torch._C._jit_override_can_fuse_on_gpu(False) 30 | torch._C._jit_set_texpr_fuser_enabled(False) 31 | # torch._C._jit_set_nvfuser_enabled(True) # disable the deprecated warning 32 | torch._C._debug_set_autodiff_subgraph_inlining(False) 33 | 34 | # Add HuggingFace hooks to ReaLModel. 35 | _HF_REGISTRIES = {} 36 | 37 | 38 | def _load_from_hf( 39 | model: ReaLModel, registry_name, load_dir: str, init_critic_from_actor: bool 40 | ): 41 | r = _HF_REGISTRIES[registry_name] 42 | setattr( 43 | model, 44 | "save_to_hf", 45 | functools.partial(_save_to_hf, model, registry_name), 46 | ) 47 | return r.load(model, load_dir, init_critic_from_actor) 48 | 49 | 50 | def _save_to_hf(model: ReaLModel, registry_name, tokenizer, save_dir: str): 51 | r = _HF_REGISTRIES[registry_name] 52 | r.save(model, tokenizer, save_dir) 53 | 54 | 55 | def _config_from_hf(registry_name, hf_config=None, model_path=None, is_critic=False): 56 | r = _HF_REGISTRIES[registry_name] 57 | return r.config_from_hf(hf_config, model_path, is_critic) 58 | 59 | 60 | def _config_to_hf(registry_name, config): 61 | r = _HF_REGISTRIES[registry_name] 62 | return r.config_to_hf(config) 63 | 64 | 65 | def _make_real_config(registry_name): 66 | r = _HF_REGISTRIES[registry_name] 67 | if r.real_config_maker is not None: 68 | return r.real_config_maker() 69 | raise NotImplementedError( 70 | f"`real_config_maker` not implemented for {registry_name}. " 71 | f"Please implement and register `real_config_maker` " 72 | f"in realhf.api.from_hf.{registry_name} to make customized ReaLModelConfig." 73 | ) 74 | 75 | 76 | for name, helpers in HF_MODEL_FAMILY_REGISTRY.items(): 77 | _HF_REGISTRIES[name] = r = HFModelRegistry(**helpers) 78 | 79 | _load_from_hf_ = functools.partialmethod(_load_from_hf, name) 80 | setattr(ReaLModel, f"from_{name}", _load_from_hf_) 81 | 82 | _save_to_hf_ = functools.partialmethod(_save_to_hf, name) 83 | setattr(ReaLModel, f"to_{name}", _save_to_hf_) 84 | 85 | _config_from_hf_ = functools.partial(_config_from_hf, name) 86 | setattr(ReaLModel, f"config_from_{name}", staticmethod(_config_from_hf_)) 87 | 88 | _config_to_hf_ = functools.partial(_config_to_hf, name) 89 | setattr(ReaLModel, f"config_to_{name}", staticmethod(_config_to_hf_)) 90 | 91 | # make a ReaLModelConfig from only parameters related to model size, used for testing 92 | _make_real_config_ = functools.partial(_make_real_config, name) 93 | setattr(ReaLModel, f"make_{name}_config", staticmethod(_make_real_config_)) 94 | -------------------------------------------------------------------------------- /realhf/impl/model/modules/__init__.py: -------------------------------------------------------------------------------- 1 | from .activations import * 2 | from .attn import * 3 | from .embedding import * 4 | from .mlp import * 5 | from .moe import * 6 | from .rms import * 7 | from .rotary import * 8 | -------------------------------------------------------------------------------- /realhf/impl/model/modules/embedding.py: -------------------------------------------------------------------------------- 1 | from typing import * 2 | 3 | import torch 4 | import torch.nn as nn 5 | from torch.nn import init 6 | 7 | from realhf.impl.model.parallelism.model_parallel.modules import ParallelEmbedding 8 | 9 | 10 | class OffsetPositionalEmbedding(nn.Embedding): 11 | 12 | def __init__( 13 | self, 14 | 
num_embeddings: int, 15 | embedding_dim: int, 16 | offset: int, 17 | dtype: Optional[torch.dtype] = None, 18 | device: Optional[Union[str, torch.device]] = None, 19 | ): 20 | # OPT is set up so that if padding_idx is specified then offset the embedding ids by 2 21 | # and adjust num_embeddings appropriately. Other models don't have this hack 22 | self.__offset = offset 23 | super().__init__( 24 | num_embeddings + self.__offset, 25 | embedding_dim, 26 | dtype=dtype, 27 | device=device, 28 | ) 29 | 30 | def forward(self, position_ids: torch.LongTensor): 31 | return super().forward(position_ids + self.__offset) 32 | 33 | 34 | class OffsetParallelPositionalEmbedding(ParallelEmbedding): 35 | def __init__( 36 | self, 37 | num_embeddings: int, 38 | embedding_dim: int, 39 | offset: int, 40 | init_method=init.xavier_normal_, 41 | # params_dtype: torch.dtype=torch.float32, 42 | perform_initialization: bool = True, 43 | dtype: Optional[torch.dtype] = None, 44 | device: Optional[Union[str, torch.device]] = None, 45 | ): 46 | self.__offset = offset 47 | super(OffsetParallelPositionalEmbedding, self).__init__( 48 | num_embeddings=num_embeddings + offset, 49 | embedding_dim=embedding_dim, 50 | init_method=init_method, 51 | perform_initialization=perform_initialization, 52 | dtype=dtype, 53 | device=device, 54 | ) 55 | 56 | def forward(self, input_: torch.LongTensor) -> torch.Tensor: 57 | return super().forward(input_ + self.__offset) 58 | -------------------------------------------------------------------------------- /realhf/impl/model/modules/moe/__init__.py: -------------------------------------------------------------------------------- 1 | from .experts import * 2 | from .layer import * 3 | from .router import * 4 | from .token_dispatcher import * 5 | -------------------------------------------------------------------------------- /realhf/impl/model/modules/moe/layer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
2 | # adopted from megatron 3 | from typing import Optional, Union 4 | 5 | import torch 6 | import torch.nn as nn 7 | 8 | import realhf.base.constants as constants 9 | import realhf.base.logging as logging 10 | from realhf.api.core.model_api import ReaLModelConfig 11 | from realhf.impl.model.modules.mlp import GemmaRMSNorm, LlamaRMSNorm 12 | from realhf.impl.model.modules.moe.experts import GroupedMLP, SequentialMLP 13 | from realhf.impl.model.modules.moe.router import TopKRouter 14 | from realhf.impl.model.modules.moe.token_dispatcher import MoETokenDispatcher 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | 19 | class LayerNormMoELayer(torch.nn.Module): 20 | 21 | def __init__( 22 | self, 23 | config: ReaLModelConfig, 24 | layer_idx: int, 25 | dtype: Optional[torch.dtype] = None, 26 | device: Optional[Union[str, torch.device]] = None, 27 | ): 28 | super(LayerNormMoELayer, self).__init__() 29 | 30 | self.config = config 31 | self.dtype = dtype 32 | self.device = device 33 | self.num_experts = self.config.moe.num_experts 34 | 35 | if config.layer_norm_type is None: 36 | layer_norm_fn = nn.LayerNorm 37 | elif config.layer_norm_type == "rms": 38 | layer_norm_fn = LlamaRMSNorm 39 | elif config.layer_norm_type == "gemma": 40 | layer_norm_fn = GemmaRMSNorm 41 | self.ln = layer_norm_fn( 42 | config.hidden_dim, eps=config.layer_norm_epsilon, dtype=dtype, device=device 43 | ) 44 | 45 | self.router = TopKRouter(config=self.config, layer_idx=layer_idx) 46 | self.token_dispatcher = MoETokenDispatcher(config=self.config) 47 | if config.moe.use_grouped_gemm and dtype == torch.bfloat16: 48 | self.experts = GroupedMLP(self.config, dtype=dtype, device=device) 49 | else: 50 | if config.moe.use_grouped_gemm: 51 | logger.warning( 52 | "GroupedGemm only supports bfloat16. Fallback to SequentialMLP." 
53 | ) 54 | self.experts = SequentialMLP(self.config, dtype=dtype, device=device) 55 | 56 | def forward(self, hidden_states: torch.Tensor): 57 | hidden_states = self.ln(hidden_states) 58 | probs, indices = self.router(hidden_states) 59 | (dispatched_input, tokens_per_expert) = self.token_dispatcher.token_permutation( 60 | hidden_states, probs, indices 61 | ) 62 | expert_output = self.experts(dispatched_input, tokens_per_expert) 63 | output = self.token_dispatcher.token_unpermutation( 64 | expert_output, 65 | ) 66 | return output 67 | -------------------------------------------------------------------------------- /realhf/impl/model/parallelism/pipeline_parallel/p2p.py: -------------------------------------------------------------------------------- 1 | # Copied from https://github.com/microsoft/DeepSpeed 2 | import torch 3 | import torch.distributed as dist 4 | from packaging.version import Version 5 | 6 | import realhf.base.constants as constants 7 | 8 | ID_TO_DTYPE = [ 9 | torch.float32, 10 | torch.float64, 11 | torch.complex64, 12 | torch.complex128, 13 | torch.float16, 14 | torch.bfloat16, 15 | torch.uint8, 16 | torch.int8, 17 | torch.int16, 18 | torch.int32, 19 | torch.int64, 20 | torch.bool, 21 | ] 22 | DTYPE_TO_ID = {dtype: id_ for id_, dtype in enumerate(ID_TO_DTYPE)} 23 | 24 | 25 | def _tensor_bytes(tensor): 26 | return tensor.numel() * tensor.element_size() 27 | 28 | 29 | def can_send_recv() -> bool: 30 | # torch_version = Version(torch_info["version"]) 31 | torch_version = Version(torch.__version__) 32 | sendrecv_min = Version("1.8") 33 | return torch_version >= sendrecv_min 34 | 35 | 36 | assert can_send_recv() 37 | 38 | 39 | def _is_valid_send_recv(src_stage, dest_stage): 40 | first_stage = 0 41 | last_stage = constants.grid().pipe_parallel_size - 1 42 | assert ( 43 | abs(src_stage - dest_stage) == 1 44 | or (src_stage == first_stage and dest_stage == last_stage) 45 | or (src_stage == last_stage and dest_stage == first_stage) 46 | ), f"Functionality currently limited to send and receive between adjacent ranks only (src={src_stage}, dst={dest_stage})" 47 | 48 | 49 | def send(tensor, dest_stage, async_op=False): 50 | # NOTE: The input is the stage id rather than the global rank 51 | src_stage = constants.grid().get_stage_id() 52 | _is_valid_send_recv(src_stage, dest_stage) 53 | 54 | dest_rank = constants.grid().stage_to_global(stage_id=dest_stage) 55 | send_method = dist.isend if async_op else dist.send 56 | return send_method(tensor, constants.to_global_pg_rank(dest_rank)) 57 | 58 | 59 | def recv(tensor, src_stage, async_op=False): 60 | # NOTE: The input is the stage id rather than the global rank 61 | dest_stage = constants.grid().get_stage_id() 62 | _is_valid_send_recv(src_stage, dest_stage) 63 | 64 | src_rank = constants.grid().stage_to_global(stage_id=src_stage) 65 | recv_method = dist.irecv if async_op else dist.recv 66 | return recv_method(tensor, constants.to_global_pg_rank(src_rank)) 67 | -------------------------------------------------------------------------------- /realhf/impl/model/parallelism/pipeline_parallel/tensor_storage.py: -------------------------------------------------------------------------------- 1 | # Copied from https://github.com/microsoft/DeepSpeed 2 | from collections import defaultdict 3 | from typing import Any, List, Optional, Tuple 4 | 5 | import torch 6 | 7 | import realhf.base.logging as logging 8 | import realhf.impl.model.parallelism.pipeline_parallel.p2p as p2p 9 | 10 | logger = logging.getLogger("tensor_utils") 11 | 12 | 13 | def 
get_shape(tensor): 14 | return tensor.shape if torch.is_tensor(tensor) else None 15 | 16 | 17 | def print_data_shapes(name, rank, mbid, x, ys): 18 | if rank == 0: 19 | logger.debug(f"{name}: rank {rank} mbid {mbid}") 20 | logger.debug( 21 | f"shapes: x.pp_input {get_shape(x.pp_input)}, x.pp_output {get_shape(x.pp_output)}," 22 | f" x.cu_seqlens {get_shape(x.cu_seqlens)}" 23 | ) 24 | for i, y in enumerate(ys): 25 | logger.debug( 26 | f"shapes: ys[{i}].input_ids {get_shape(y.packed_input_ids)}, " 27 | f"ys[{i}].k_cache {get_shape(y.k_cache)}, ys[{i}].v_cache {get_shape(y.v_cache)}, " 28 | f"ys[{i}].cache_seqlens {get_shape(y.cache_seqlens)}" 29 | ) 30 | 31 | 32 | class TensorBuffer: 33 | # could store both tensors and other data 34 | 35 | def __init__(self): 36 | self.tensors = defaultdict(dict) 37 | 38 | def put(self, name: str, mbid: int, x: torch.Tensor): 39 | self.tensors[name][mbid] = x 40 | 41 | def alloc( 42 | self, 43 | name: str, 44 | mbid: int, 45 | shape: Tuple[int], 46 | dtype: torch.dtype, 47 | device: torch.device, 48 | require_grads: bool = False, 49 | ): 50 | self.tensors[name][mbid] = torch.zeros( 51 | shape, dtype=dtype, device=device, requires_grad=require_grads 52 | ) 53 | return self.tensors[name][mbid] 54 | 55 | def get( 56 | self, 57 | name: str, 58 | mbid: int, 59 | remove: bool = False, 60 | raise_error: bool = True, 61 | ): 62 | try: 63 | if remove: 64 | return self.tensors[name].pop(mbid) 65 | else: 66 | return self.tensors[name][mbid] 67 | except KeyError as e: 68 | if raise_error: 69 | raise e 70 | else: 71 | return None 72 | 73 | def remove(self, name: str, mbid: Optional[int] = None, check_exists: bool = False): 74 | try: 75 | if mbid is None: 76 | del self.tensors[name] 77 | else: 78 | self.tensors[name].pop(mbid) 79 | except KeyError: 80 | if not check_exists: 81 | return 82 | raise KeyError(f"TensorBuffer.remove: key {name} mbid {mbid} not found") 83 | 84 | def check_name(self, name: str): 85 | return name in self.tensors 86 | 87 | def check_mbid(self, name: str, mbid: int): 88 | if name not in self.tensors: 89 | return False 90 | return mbid in self.tensors[name] 91 | 92 | def clear(self): 93 | self.tensors = defaultdict(dict) 94 | -------------------------------------------------------------------------------- /realhf/impl/model/utils/dpo_functional.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | 6 | 7 | def dpo_loss( 8 | pi_logps: torch.Tensor, 9 | ref_logps: torch.Tensor, 10 | beta: float, 11 | ): 12 | assert len(pi_logps.shape) == 1 and pi_logps.shape[0] % 2 == 0, ( 13 | pi_logps.shape, 14 | ref_logps.shape, 15 | ) 16 | assert len(ref_logps.shape) == 1 and ref_logps.shape[0] % 2 == 0, ( 17 | pi_logps.shape, 18 | ref_logps.shape, 19 | ) 20 | pi_logps = pi_logps.view(-1, 2) 21 | ref_logps = ref_logps.view(-1, 2) 22 | pi_yw_logps, pi_yl_logps = pi_logps[:, 0], pi_logps[:, 1] 23 | ref_yw_logps, ref_yl_logps = ref_logps[:, 0], ref_logps[:, 1] 24 | pi_logratios = pi_yw_logps - pi_yl_logps 25 | ref_logratios = ref_yw_logps - ref_yl_logps 26 | losses = -F.logsigmoid(beta * (pi_logratios - ref_logratios)).mean() 27 | pos_score = beta * (pi_yw_logps - ref_yw_logps).detach().sum() 28 | neg_score = beta * (pi_yl_logps - ref_yl_logps).detach().sum() 29 | kl = -(pi_logps - ref_logps).detach().sum() 30 | return losses, pos_score, neg_score, kl 31 | -------------------------------------------------------------------------------- 
/realhf/search_engine/__init__.py: -------------------------------------------------------------------------------- 1 | def import_profiler_registers(): 2 | import realhf.search_engine.enumerate 3 | import realhf.search_engine.estimate 4 | import realhf.search_engine.layers 5 | import realhf.search_engine.param_realloc 6 | import realhf.search_engine.search 7 | import realhf.search_engine.utils 8 | -------------------------------------------------------------------------------- /realhf/search_engine/utils.py: -------------------------------------------------------------------------------- 1 | from realhf.api.core.model_api import ReaLModelConfig 2 | 3 | 4 | def find_factors(n): 5 | factors = [] 6 | for i in range(1, n + 1): 7 | if n % i == 0: 8 | factors.append(i) 9 | return factors 10 | 11 | 12 | def make_stats_key(rpc_name, bs, seq_len): 13 | return f"{rpc_name}|{bs}|{seq_len}" 14 | 15 | 16 | def parse_stats_key(key): 17 | rpc_name, bs, seq_len = key.split("|") 18 | return rpc_name, int(bs), int(seq_len) 19 | 20 | 21 | def load_model_config(model_class: str, model_path: str) -> ReaLModelConfig: 22 | from realhf.impl.model.nn.real_llm_api import ReaLModel 23 | 24 | return getattr(ReaLModel, f"config_from_{model_class}")(model_path=model_path) 25 | -------------------------------------------------------------------------------- /realhf/system/__init__.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import os 3 | import traceback 4 | from typing import Type 5 | 6 | import realhf.api.core.system_api 7 | import realhf.base.logging as logging 8 | 9 | logger = logging.getLogger("system") 10 | 11 | # NOTE: Workers are configured in the following order. 12 | # Take special care when adding a new worker type. 13 | WORKER_TYPES = ["model_worker", "master_worker"] 14 | 15 | 16 | def load_worker(worker_type: str) -> Type: 17 | assert worker_type in WORKER_TYPES, f"Invalid worker type {worker_type}" 18 | module = importlib.import_module(worker_type_to_module(worker_type)) 19 | class_name = worker_type_to_class_name(worker_type) 20 | return getattr(module, class_name) 21 | 22 | 23 | def worker_type_to_module(worker_type: str): 24 | return "realhf.system." + worker_type 25 | 26 | 27 | def worker_type_to_class_name(worker_type: str): 28 | return "".join([w.capitalize() for w in worker_type.split("_")]) 29 | 30 | 31 | def run_worker( 32 | worker_type, experiment_name, trial_name, worker_name, worker_server_type 33 | ): 34 | """Run one worker 35 | Args: 36 | worker_type: string, one of the worker types listed above, 37 | experiment_name: string, the experiment this worker belongs to, 38 | trial_name: string, the specific trial this worker belongs to, 39 | worker_name: name given to the worker, typically "/" 40 | worker_server_type: string, either 'zmq' or 'ray'. 
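Example (illustrative only; the names are placeholders and assume a worker control server of the given type is reachable for this trial): run_worker("model_worker", "my_exp", "trial0", "model_worker/0", "zmq")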
41 | """ 42 | worker_class = load_worker(worker_type) 43 | make_server_fn = getattr( 44 | importlib.import_module("realhf.system.worker_control"), "make_server" 45 | ) 46 | server = make_server_fn( 47 | type_=worker_server_type, 48 | experiment_name=experiment_name, 49 | trial_name=trial_name, 50 | worker_name=worker_name, 51 | ) 52 | worker = worker_class(server=server) 53 | try: 54 | worker.run() 55 | except Exception as e: 56 | logger.error("Worker %s failed with exception: %s", worker_name, e) 57 | logger.error(traceback.format_exc()) 58 | raise e 59 | 60 | 61 | def make_controller(type_, experiment_name, trial_name): 62 | module = importlib.import_module("realhf.system.controller") 63 | if type_ == "zmq": 64 | control_module = importlib.import_module("realhf.system.worker_control") 65 | panel = getattr(control_module, "make_control")( 66 | "zmq", experiment_name, trial_name 67 | ) 68 | return getattr(module, "Controller")(experiment_name, trial_name, panel) 69 | elif type_ == "ray": 70 | return getattr(module, "RayController")(experiment_name, trial_name) 71 | else: 72 | raise NotImplementedError(f"Unknown controller type {type_}.") 73 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx-nefertiti 2 | sphinx 3 | build>=1.2.1 4 | wheel>=0.43.0 5 | distro-info>=1.0 6 | python-debian>=0.1.49 7 | huggingface_hub 8 | datasets 9 | accelerate 10 | ninja 11 | matplotlib 12 | ipython 13 | megatron_core==0.6.0 14 | deepspeed==0.14.0 15 | h5py 16 | nltk 17 | sentencepiece 18 | wandb 19 | tensorboardx 20 | blosc 21 | colorama 22 | colorlog 23 | einops 24 | hydra-core 25 | matplotlib 26 | numba 27 | omegaconf 28 | packaging 29 | pandas 30 | pybind11>=2.10.0 31 | numpy<2.0.0 32 | psutil 33 | pynvml 34 | pytest 35 | PyYAML 36 | pyzmq 37 | ray 38 | redis 39 | scipy 40 | seaborn 41 | setuptools>=61.0 42 | tqdm 43 | transformers==4.42.3 44 | networkx==3.3 45 | matplotlib 46 | tabulate 47 | aiofiles -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # NOTE: This file is required for importing. 
2 | -------------------------------------------------------------------------------- /tests/cpp_extensions/test_interval_ops.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import time 4 | import uuid 5 | from typing import * 6 | 7 | import numpy as np 8 | import pytest 9 | import torch 10 | 11 | from realhf.impl.model.nn.flatten_param import ( 12 | _set_intervals_py, 13 | _slice_intervals_py, 14 | set_intervals, 15 | slice_intervals, 16 | ) 17 | 18 | 19 | def make_intervals(maxsize, n_intervals): 20 | assert maxsize // n_intervals > 1 21 | s = maxsize // n_intervals 22 | intervals = [] 23 | interval_size = 0 24 | max_interval_size = 0 25 | for i in range(n_intervals): 26 | intervals.append((i * s, i * s + s // 2)) 27 | interval_size += s // 2 28 | max_interval_size = max(max_interval_size, s // 2) 29 | np.random.shuffle(intervals) 30 | return np.array(intervals, dtype=np.int64), interval_size, max_interval_size 31 | 32 | 33 | def maybe_synchronize_cuda(): 34 | if torch.cuda.is_available(): 35 | torch.cuda.synchronize() 36 | 37 | 38 | @pytest.mark.skipif(not torch.cuda.is_available(), reason="This test requires a GPU.") 39 | @pytest.mark.parametrize( 40 | "n_intervals", list(reversed([1, 100, 500, 1000, 2000, 4000, 10000, 100000])) 41 | ) 42 | @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float32, torch.float16]) 43 | def test_get(n_intervals: int, dtype: torch.dtype): 44 | device = torch.device("cuda") 45 | 46 | input_tensor = torch.randn(int(1e8), device=device, dtype=dtype) 47 | intervals, output_size, max_interval_size = make_intervals( 48 | input_tensor.size(0), n_intervals 49 | ) 50 | intervals_cuda = torch.tensor(intervals, dtype=torch.long, device="cuda") 51 | 52 | # warmup 53 | slice_intervals( 54 | input_tensor, 55 | intervals, 56 | intervals_cuda=intervals_cuda, 57 | output_size=output_size, 58 | max_interval_size=max_interval_size, 59 | ) 60 | _slice_intervals_py(input_tensor, intervals) 61 | 62 | maybe_synchronize_cuda() 63 | tik = time.perf_counter() 64 | for _ in range(10): 65 | output_tensor = slice_intervals( 66 | input_tensor, 67 | intervals, 68 | intervals_cuda=intervals_cuda, 69 | output_size=output_size, 70 | max_interval_size=max_interval_size, 71 | ) 72 | maybe_synchronize_cuda() 73 | t1 = time.perf_counter() - tik 74 | 75 | maybe_synchronize_cuda() 76 | tik = time.perf_counter() 77 | for _ in range(10): 78 | o2 = _slice_intervals_py(input_tensor, intervals) 79 | maybe_synchronize_cuda() 80 | t2 = time.perf_counter() - tik 81 | assert torch.allclose(output_tensor, o2) 82 | print( 83 | f"slice_interval, Success! #intervals: {n_intervals} C++ ext time: {t1:.4f}, PyTorch time: {t2:.4f}" 84 | ) 85 | 86 | 87 | @pytest.mark.skipif(not torch.cuda.is_available(), reason="This test requires a GPU.") 88 | @pytest.mark.parametrize( 89 | "n_intervals", list(reversed([1, 10, 100, 500, 1000, 1000, 10000, 100000])) 90 | ) 91 | @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16, torch.float32]) 92 | @pytest.mark.gpu 93 | def test_set(n_intervals: int, dtype: torch.dtype): 94 | # NOTE: Since the set_intervals degenerate to the python implementation with CPU tensors, 95 | # We don't need to test it with CPU tensors. 
96 | 97 | x = torch.randn(int(1e8), device="cuda", dtype=dtype) 98 | intervals, interval_size, max_interval_size = make_intervals(x.size(0), n_intervals) 99 | intervals_cuda = torch.tensor(intervals, dtype=torch.long, device="cuda") 100 | src = torch.randn(interval_size, device="cuda", dtype=dtype) 101 | 102 | # warmup 103 | input_tensor1 = x.clone() 104 | set_intervals( 105 | src, 106 | input_tensor1, 107 | intervals, 108 | intervals_cuda=intervals_cuda, 109 | max_interval_size=max_interval_size, 110 | ) 111 | input_tensor2 = x.clone() 112 | _set_intervals_py(src, input_tensor2, intervals) 113 | 114 | input_tensor1 = x.clone() 115 | maybe_synchronize_cuda() 116 | tik = time.perf_counter() 117 | for _ in range(10): 118 | set_intervals( 119 | src, 120 | input_tensor1, 121 | intervals, 122 | intervals_cuda=intervals_cuda, 123 | max_interval_size=max_interval_size, 124 | ) 125 | maybe_synchronize_cuda() 126 | t1 = time.perf_counter() - tik 127 | 128 | input_tensor2 = x.clone() 129 | maybe_synchronize_cuda() 130 | tik = time.perf_counter() 131 | for _ in range(10): 132 | _set_intervals_py(src, input_tensor2, intervals) 133 | maybe_synchronize_cuda() 134 | t2 = time.perf_counter() - tik 135 | 136 | assert torch.allclose(input_tensor1, input_tensor2) 137 | print( 138 | f"set_interval, Success! #intervals: {n_intervals}, C++ ext time: {t1:.4f}, PyTorch time: {t2:.4f}" 139 | ) 140 | -------------------------------------------------------------------------------- /tests/model/test_cpu_inference.py: -------------------------------------------------------------------------------- 1 | import dataclasses 2 | from typing import * 3 | 4 | import pytest 5 | import torch 6 | import torch.distributed as dist 7 | import transformers 8 | 9 | from realhf.base import constants, logging, testing 10 | from realhf.impl.model.nn.real_llm_api import add_helper_functions 11 | 12 | logger = logging.getLogger("tests.test_cpu") 13 | 14 | 15 | # NOTE: To run test for a new model class, please implement and register `real_config_maker` 16 | # in realhf.api.from_hf. and add the model class name to the 17 | # `model_class` fixture in this file. 
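# Each name below yields an independent parametrized test run; the `mconfig`
# fixture resolves it to the corresponding `make_<model_class>_config`
# constructor on ReaLModel.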
18 | @pytest.fixture(params=["llama", "gpt2", "qwen2", "gemma", "mistral", "mixtral"]) 19 | def model_class(request): 20 | return request.param 21 | 22 | 23 | def maybe_prepare_cpu_env(max_prompt_len: int): 24 | if not dist.is_initialized(): 25 | # for parametrized runs 26 | dist.init_process_group( 27 | "gloo", rank=0, world_size=1, init_method="tcp://localhost:7777" 28 | ) 29 | import deepspeed 30 | 31 | deepspeed.init_distributed() 32 | testing.init_global_constants( 33 | num_dp=1, 34 | num_mp=1, 35 | num_pp=1, 36 | sequence_parallel=False, 37 | max_prompt_len=max_prompt_len, 38 | ) 39 | assert dist.get_world_size() == 1, dist.get_world_size() 40 | 41 | 42 | @pytest.fixture 43 | def mconfig(model_class): 44 | from realhf.impl.model.nn.real_llm_api import ReaLModel 45 | 46 | mconfig = getattr(ReaLModel, f"make_{model_class}_config")() 47 | return mconfig 48 | 49 | 50 | @pytest.fixture 51 | def save_path(tmpdir_factory: pytest.TempdirFactory): 52 | return tmpdir_factory.mktemp("save_path") 53 | 54 | 55 | @pytest.fixture 56 | def cpu_real_model(model_class, mconfig, save_path): 57 | max_prompt_len = mconfig.n_positions 58 | maybe_prepare_cpu_env(max_prompt_len) 59 | with constants.model_scope(testing.MODEL_NAME): 60 | from realhf.impl.model.nn.real_llm_api import ReaLModel 61 | 62 | model = ReaLModel(mconfig, dtype=torch.float32, device="cpu") 63 | add_helper_functions(model) 64 | model.instantiate() 65 | model.eval() 66 | getattr(model, f"to_{model_class}")(None, save_path) 67 | return model 68 | 69 | 70 | @pytest.fixture 71 | def cpu_hf_model(save_path): 72 | hf_model = transformers.AutoModelForCausalLM.from_pretrained(save_path).to( 73 | torch.float32 74 | ) 75 | hf_model.eval() 76 | return hf_model 77 | 78 | 79 | @torch.no_grad() 80 | def test_inference_cpu_consistency(cpu_real_model, cpu_hf_model, model_class, mconfig): 81 | max_prompt_len = mconfig.n_positions 82 | with constants.model_scope(testing.MODEL_NAME): 83 | bs = 10 84 | torch.manual_seed(1) 85 | input_ids = torch.randint( 86 | 0, mconfig.vocab_size, (bs, max_prompt_len), dtype=torch.long 87 | ) 88 | input_lens = torch.full((bs,), max_prompt_len, dtype=torch.int32) 89 | attention_mask = torch.arange(max_prompt_len)[None, :] < input_lens[:, None] 90 | 91 | logits1 = cpu_hf_model( 92 | input_ids=input_ids, attention_mask=attention_mask 93 | ).logits * attention_mask.unsqueeze(-1) 94 | logits2 = cpu_real_model( 95 | input_ids=input_ids, attention_mask=attention_mask 96 | ).logits * attention_mask.unsqueeze(-1) 97 | 98 | assert torch.allclose(logits1, logits2, atol=1e-4), ( 99 | model_class, 100 | (logits1 - logits2).abs().max(), 101 | ) 102 | --------------------------------------------------------------------------------