├── modes
├── __init__.py
├── speculative_decoding.py
├── placeholder.py
├── big_model_only.py
└── small_model_only.py
├── training
├── open-r1
│ ├── tests
│ │ ├── __init__.py
│ │ └── slow
│ │ │ └── test_code_reward.py
│ ├── assets
│ │ └── plan-of-attack.png
│ ├── src
│ │ └── open_r1
│ │ │ ├── utils
│ │ │ ├── __init__.py
│ │ │ ├── ioi
│ │ │ │ ├── __init__.py
│ │ │ │ └── utils.py
│ │ │ ├── wandb_logging.py
│ │ │ ├── import_utils.py
│ │ │ └── model_utils.py
│ │ │ └── __init__.py
│ ├── .github
│ │ ├── dependabot.yml
│ │ └── workflows
│ │ │ └── tests.yml
│ ├── recipes
│ │ ├── accelerate_configs
│ │ │ ├── ddp.yaml
│ │ │ ├── zero2.yaml
│ │ │ ├── zero3.yaml
│ │ │ └── fsdp.yaml
│ │ ├── README.md
│ │ ├── Qwen2.5-1.5B-Instruct
│ │ │ ├── sft
│ │ │ │ └── config_demo.yaml
│ │ │ └── grpo
│ │ │ │ ├── config_demo.yaml
│ │ │ │ ├── config_demo_code.yaml
│ │ │ │ └── config_demo_code_ioi.yaml
│ │ ├── OlympicCoder-7B
│ │ │ └── sft
│ │ │ │ └── config_v00.00.yaml
│ │ ├── Mistral-Small-24B-Instruct-2501
│ │ │ └── sft
│ │ │ │ └── config_openr1_math.yaml
│ │ ├── OpenR1-Qwen-7B
│ │ │ └── sft
│ │ │ │ └── config.yaml
│ │ ├── SmolLM2-1.7B
│ │ │ └── sft
│ │ │ │ └── config.yaml
│ │ ├── SmolLM2-1.7B-Instruct
│ │ │ └── sft
│ │ │ │ └── config.yaml
│ │ ├── OlympicCoder-32B
│ │ │ └── sft
│ │ │ │ └── config_v00.00.yaml
│ │ ├── Qwen2.5-Math-7B
│ │ │ └── grpo
│ │ │ │ └── config_simple_rl.yaml
│ │ ├── Qwen2.5-7B-Instruct
│ │ │ └── grpo
│ │ │ │ └── config_demo.yaml
│ │ └── DeepSeek-R1-Distill-Qwen-1.5B
│ │ │ └── sft
│ │ │ └── config_demo.yaml
│ ├── slurm
│ │ ├── piston
│ │ │ ├── launch_piston_workers.sh
│ │ │ └── launch_single_piston.sh
│ │ ├── README.md
│ │ ├── serve_router.slurm
│ │ └── train.slurm
│ ├── setup.cfg
│ ├── base_training.sh
│ ├── scripts
│ │ ├── get_tensor_parallel_size.py
│ │ ├── upload_details.py
│ │ └── run_benchmarks.py
│ ├── Makefile
│ └── offload_read_graph.py
└── trl
│ ├── setup.cfg
│ ├── requirements.txt
│ ├── examples
│ ├── README.md
│ ├── research_projects
│ │ ├── stack_llama_2
│ │ │ └── scripts
│ │ │ │ ├── requirements.txt
│ │ │ │ └── README.md
│ │ ├── toxicity
│ │ │ └── README.md
│ │ ├── README.md
│ │ └── stack_llama
│ │ │ └── scripts
│ │ │ ├── README.md
│ │ │ └── merge_peft_adapter.py
│ ├── accelerate_configs
│ │ ├── single_gpu.yaml
│ │ ├── multi_gpu.yaml
│ │ ├── deepspeed_zero1.yaml
│ │ ├── deepspeed_zero2.yaml
│ │ ├── deepspeed_zero3.yaml
│ │ └── fsdp_qlora.yaml
│ ├── cli_configs
│ │ └── example_config.yaml
│ ├── notebooks
│ │ └── README.md
│ └── scripts
│ │ ├── dpo.py
│ │ ├── sft.py
│ │ └── sft_gemma3.py
│ ├── docs
│ └── source
│ │ ├── unsloth_integration.md
│ │ ├── liger_kernel_integration.md
│ │ ├── others.md
│ │ ├── script_utils.md
│ │ ├── callbacks.md
│ │ ├── data_utils.md
│ │ ├── installation.md
│ │ ├── models.md
│ │ ├── iterative_sft_trainer.md
│ │ ├── deepspeed_integration.md
│ │ ├── judges.md
│ │ ├── use_model.md
│ │ ├── distributing_training.md
│ │ ├── sentiment_tuning.md
│ │ ├── best_of_n.md
│ │ └── _toctree.yml
│ ├── MANIFEST.in
│ ├── .github
│ ├── workflows
│ │ ├── trufflehog.yml
│ │ ├── issue_auto_labeller.yml
│ │ ├── upload_pr_documentation.yml
│ │ ├── build_documentation.yml
│ │ ├── build_pr_documentation.yml
│ │ ├── codeQL.yml
│ │ ├── clear_cache.yml
│ │ ├── tests_latest.yml
│ │ ├── slow-tests.yml
│ │ └── docker-build.yml
│ ├── codeql
│ │ └── custom-queries.qls
│ ├── ISSUE_TEMPLATE
│ │ ├── feature-request.yml
│ │ ├── new-trainer-addition.yml
│ │ └── bug-report.yml
│ └── PULL_REQUEST_TEMPLATE.md
│ ├── .pre-commit-config.yaml
│ ├── pyproject.toml
│ ├── tests
│ ├── __init__.py
│ ├── slow
│ │ ├── __init__.py
│ │ └── testing_constants.py
│ ├── testing_constants.py
│ ├── test_core.py
│ ├── test_rich_progress_callback.py
│ └── test_modeling_geometric_mixture_wrapper.py
│ ├── trl
│ ├── extras
│ │ ├── __init__.py
│ │ └── profiling.py
│ ├── environment
│ │ └── __init__.py
│ ├── scripts
│ │ └── __init__.py
│ ├── trainer
│ │ ├── xpo_config.py
│ │ └── nash_md_config.py
│ ├── templates
│ │ └── lm_model_card.md
│ └── models
│ │ └── __init__.py
│ ├── Makefile
│ ├── CITATION.cff
│ ├── commands
│ ├── run_sft.sh
│ └── run_dpo.sh
│ ├── docker
│ ├── trl-latest-gpu
│ │ └── Dockerfile
│ └── trl-source-gpu
│ │ └── Dockerfile
│ └── .gitignore
├── figs
├── image.png
├── image_seq_parl.png
└── accuracy_to_latency_teaser_main.png
├── annotated_dataset
├── switch_behavior.pdf
├── dataset_analysis.pdf
├── normalized_offloading_deepseek.png
├── offload_percentage_by_qid_deepseek.png
├── strlen_analysis.py
├── hf_Dataset
│ └── proprocess.py
└── annotation_statistics.csv
├── requirements.txt
├── lm_eval_files
├── aime
│ ├── aime25_nofigures.yaml
│ ├── aime_figures.yaml
│ ├── aime_nofigures.yaml
│ ├── aime24_nofigures.yaml
│ ├── aime24_figures.yaml
│ ├── aime_2024_agg8.yaml
│ ├── aime_2024_rebase.yaml
│ ├── aime25_nofigures_agg64.yaml
│ ├── aime24_figures_agg64.yaml
│ ├── aime24_nofigures_agg64.yaml
│ ├── aime25_nofigures_maj8cov8.yaml
│ ├── aime24_nofigures_maj8cov8.yaml
│ └── README.md
├── vllm_speculative_init.py
├── openai_math
│ ├── openai_math.yaml
│ ├── openai_math_train.yaml
│ ├── openai_math_cov64.yaml
│ ├── openai_math_cov64_train.yaml
│ ├── openai_math_maj64_cov64_train.yaml
│ ├── openai_math_agg64.yaml
│ └── openai_math_maj64_cov64.yaml
└── openai
│ ├── gpqa_diamond_openai.yaml
│ └── gpqa_diamond_openai_maj64_cov64.yaml
└── logging_config.json
/modes/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /training/open-r1/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /training/trl/setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | license_file = LICENSE -------------------------------------------------------------------------------- /training/trl/requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate 2 | datasets 3 | rich 4 | transformers>=4.46.0 -------------------------------------------------------------------------------- /figs/image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abdelfattah-lab/SplitReason/HEAD/figs/image.png -------------------------------------------------------------------------------- /figs/image_seq_parl.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abdelfattah-lab/SplitReason/HEAD/figs/image_seq_parl.png -------------------------------------------------------------------------------- /annotated_dataset/switch_behavior.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abdelfattah-lab/SplitReason/HEAD/annotated_dataset/switch_behavior.pdf -------------------------------------------------------------------------------- /annotated_dataset/dataset_analysis.pdf:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/abdelfattah-lab/SplitReason/HEAD/annotated_dataset/dataset_analysis.pdf -------------------------------------------------------------------------------- /figs/accuracy_to_latency_teaser_main.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abdelfattah-lab/SplitReason/HEAD/figs/accuracy_to_latency_teaser_main.png -------------------------------------------------------------------------------- /training/open-r1/assets/plan-of-attack.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abdelfattah-lab/SplitReason/HEAD/training/open-r1/assets/plan-of-attack.png -------------------------------------------------------------------------------- /training/trl/examples/README.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | 3 | Please check out https://huggingface.co/docs/trl/example_overview for documentation on our examples. -------------------------------------------------------------------------------- /annotated_dataset/normalized_offloading_deepseek.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abdelfattah-lab/SplitReason/HEAD/annotated_dataset/normalized_offloading_deepseek.png -------------------------------------------------------------------------------- /training/trl/examples/research_projects/stack_llama_2/scripts/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers 2 | trl 3 | peft 4 | accelerate 5 | datasets 6 | bitsandbytes 7 | wandb 8 | -------------------------------------------------------------------------------- /annotated_dataset/offload_percentage_by_qid_deepseek.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abdelfattah-lab/SplitReason/HEAD/annotated_dataset/offload_percentage_by_qid_deepseek.png -------------------------------------------------------------------------------- /training/trl/docs/source/unsloth_integration.md: -------------------------------------------------------------------------------- 1 | # Unsloth Integration 2 | 3 | 4 | 5 | Section under construction. Feel free to contribute! 6 | 7 | -------------------------------------------------------------------------------- /training/trl/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include settings.ini 2 | include LICENSE 3 | include CONTRIBUTING.md 4 | include README.md 5 | recursive-exclude * __pycache__ 6 | include trl/templates/*.md -------------------------------------------------------------------------------- /training/trl/docs/source/liger_kernel_integration.md: -------------------------------------------------------------------------------- 1 | # Liger Kernel Integration 2 | 3 | 4 | 5 | Section under construction. Feel free to contribute! 
6 | 7 | -------------------------------------------------------------------------------- /training/open-r1/src/open_r1/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .import_utils import is_e2b_available 2 | from .model_utils import get_tokenizer 3 | 4 | 5 | __all__ = ["get_tokenizer", "is_e2b_available"] 6 | -------------------------------------------------------------------------------- /training/trl/docs/source/others.md: -------------------------------------------------------------------------------- 1 | # Other 2 | 3 | ## profiling_decorator 4 | 5 | [[autodoc]] extras.profiling.profiling_decorator 6 | 7 | ## profiling_context 8 | 9 | [[autodoc]] extras.profiling.profiling_context 10 | -------------------------------------------------------------------------------- /training/open-r1/.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "pip" 4 | directory: "/" 5 | schedule: 6 | interval: "weekly" 7 | - package-ecosystem: "github-actions" 8 | directory: "/" 9 | schedule: 10 | interval: "weekly" 11 | -------------------------------------------------------------------------------- /training/trl/docs/source/script_utils.md: -------------------------------------------------------------------------------- 1 | # Scripts Utilities 2 | 3 | ## ScriptArguments 4 | 5 | [[autodoc]] ScriptArguments 6 | 7 | ## TrlParser 8 | 9 | [[autodoc]] TrlParser 10 | - parse_args_and_config 11 | - parse_args_into_dataclasses 12 | - set_defaults_with_config 13 | -------------------------------------------------------------------------------- /training/trl/examples/research_projects/toxicity/README.md: -------------------------------------------------------------------------------- 1 | # Detoxifying language models 2 | 3 | To run this code, do the following: 4 | 5 | ```shell 6 | ACCELERATE_LOG_LEVEL=info accelerate launch --config_file {CONFIG} examples/research_projects/toxicity/scripts/gpt-j-6b-toxicity.py --log_with wandb 7 | ``` 8 | -------------------------------------------------------------------------------- /training/trl/.github/workflows/trufflehog.yml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | 4 | name: Secret Leaks 5 | 6 | jobs: 7 | trufflehog: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - name: Checkout code 11 | uses: actions/checkout@v4 12 | with: 13 | fetch-depth: 0 14 | - name: Secret Scanning 15 | uses: trufflesecurity/trufflehog@main 16 | -------------------------------------------------------------------------------- /training/open-r1/src/open_r1/utils/ioi/__init__.py: -------------------------------------------------------------------------------- 1 | from .piston_client import get_piston_client_from_env, get_slurm_piston_endpoints 2 | from .scoring import SubtaskResult, score_subtask 3 | from .utils import add_includes 4 | 5 | 6 | __all__ = [ 7 | "get_piston_client_from_env", 8 | "get_slurm_piston_endpoints", 9 | "score_subtask", 10 | "add_includes", 11 | "SubtaskResult", 12 | ] 13 | -------------------------------------------------------------------------------- /training/trl/.github/workflows/issue_auto_labeller.yml: -------------------------------------------------------------------------------- 1 | name: "Hugging Face Issue Labeler" 2 | on: 3 | issues: 4 | types: opened 5 | 6 | jobs: 7 | triage: 8 | runs-on: ubuntu-latest 9 | permissions: 10 | issues: write 11 |
steps: 12 | - uses: actions/checkout@v3 13 | - uses: August-murr/auto-labeler@main 14 | with: 15 | hf-api-key: ${{ secrets.CI_HF_API_TOKEN }} 16 | -------------------------------------------------------------------------------- /training/trl/docs/source/callbacks.md: -------------------------------------------------------------------------------- 1 | # Callbacks 2 | 3 | ## SyncRefModelCallback 4 | 5 | [[autodoc]] SyncRefModelCallback 6 | 7 | ## RichProgressCallback 8 | 9 | [[autodoc]] RichProgressCallback 10 | 11 | ## WinRateCallback 12 | 13 | [[autodoc]] WinRateCallback 14 | 15 | ## LogCompletionsCallback 16 | 17 | [[autodoc]] LogCompletionsCallback 18 | 19 | ## MergeModelCallback 20 | 21 | [[autodoc]] MergeModelCallback -------------------------------------------------------------------------------- /training/open-r1/recipes/accelerate_configs/ddp.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | distributed_type: MULTI_GPU 4 | downcast_bf16: 'no' 5 | gpu_ids: all 6 | machine_rank: 0 7 | main_training_function: main 8 | mixed_precision: bf16 9 | num_machines: 1 10 | num_processes: 8 11 | rdzv_backend: static 12 | same_network: true 13 | tpu_env: [] 14 | tpu_use_cluster: false 15 | tpu_use_sudo: false 16 | use_cpu: false 17 | -------------------------------------------------------------------------------- /training/trl/examples/accelerate_configs/single_gpu.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | distributed_type: "NO" 4 | downcast_bf16: 'no' 5 | gpu_ids: all 6 | machine_rank: 0 7 | main_training_function: main 8 | mixed_precision: 'bf16' 9 | num_machines: 1 10 | num_processes: 8 11 | rdzv_backend: static 12 | same_network: true 13 | tpu_env: [] 14 | tpu_use_cluster: false 15 | tpu_use_sudo: false 16 | use_cpu: false 17 | -------------------------------------------------------------------------------- /training/trl/examples/accelerate_configs/multi_gpu.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | distributed_type: MULTI_GPU 4 | downcast_bf16: 'no' 5 | gpu_ids: all 6 | machine_rank: 0 7 | main_training_function: main 8 | mixed_precision: 'bf16' 9 | num_machines: 1 10 | num_processes: 8 11 | rdzv_backend: static 12 | same_network: true 13 | tpu_env: [] 14 | tpu_use_cluster: false 15 | tpu_use_sudo: false 16 | use_cpu: false 17 | -------------------------------------------------------------------------------- /training/open-r1/recipes/README.md: -------------------------------------------------------------------------------- 1 | # Post-training recipes 2 | 3 | ## OlympicCoder 4 | 5 | To train the OlympicCoder models, run: 6 | 7 | ``` 8 | # 7B 9 | sbatch --nodes=1 slurm/train.slurm OlympicCoder-7B sft v00.00 zero3 10 | 11 | # 32B 12 | sbatch --nodes=16 slurm/train.slurm OlympicCoder-32B sft v00.00 fsdp 13 | ``` 14 | 15 | Note that we found it necessary to switch to FSDP1 and paged AdamW 8-bit for the 32B model in order to fit the largest possible context size. 
-------------------------------------------------------------------------------- /training/trl/.github/workflows/upload_pr_documentation.yml: -------------------------------------------------------------------------------- 1 | name: Upload PR Documentation 2 | 3 | on: 4 | workflow_run: 5 | workflows: ["Build PR Documentation"] 6 | types: 7 | - completed 8 | 9 | jobs: 10 | build: 11 | uses: huggingface/doc-builder/.github/workflows/upload_pr_documentation.yml@main 12 | with: 13 | package_name: trl 14 | secrets: 15 | hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }} 16 | comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }} -------------------------------------------------------------------------------- /training/trl/examples/cli_configs/example_config.yaml: -------------------------------------------------------------------------------- 1 | # This is an example configuration file of TRL CLI, you can use it for 2 | # SFT like that: `trl sft --config config.yaml --output_dir test-sft` 3 | # The YAML file supports environment variables by adding an `env` field 4 | # as below 5 | 6 | # env: 7 | # CUDA_VISIBLE_DEVICES: 0 8 | 9 | model_name_or_path: 10 | Qwen/Qwen2.5-0.5B 11 | dataset_name: 12 | stanfordnlp/imdb 13 | report_to: 14 | none 15 | learning_rate: 16 | 0.0001 17 | lr_scheduler_type: 18 | cosine 19 | -------------------------------------------------------------------------------- /training/trl/.github/workflows/build_documentation.yml: -------------------------------------------------------------------------------- 1 | name: Build documentation 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - doc-builder* 8 | - v*-release 9 | 10 | jobs: 11 | build: 12 | uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@main 13 | with: 14 | commit_sha: ${{ github.sha }} 15 | package: trl 16 | version_tag_suffix: "" 17 | custom_container: huggingface/transformers-doc-builder 18 | secrets: 19 | hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }} 20 | -------------------------------------------------------------------------------- /training/open-r1/src/open_r1/utils/wandb_logging.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | def init_wandb_training(training_args): 5 | """ 6 | Helper function for setting up Weights & Biases logging tools. 
7 | """ 8 | if training_args.wandb_entity is not None: 9 | os.environ["WANDB_ENTITY"] = training_args.wandb_entity 10 | if training_args.wandb_project is not None: 11 | os.environ["WANDB_PROJECT"] = training_args.wandb_project 12 | if training_args.wandb_run_group is not None: 13 | os.environ["WANDB_RUN_GROUP"] = training_args.wandb_run_group 14 | -------------------------------------------------------------------------------- /training/trl/.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/astral-sh/ruff-pre-commit 3 | rev: v0.9.7 4 | hooks: 5 | - id: ruff 6 | types_or: [ python, pyi ] 7 | args: [ --fix ] 8 | - id: ruff-format 9 | types_or: [ python, pyi ] 10 | 11 | # - repo: https://github.com/codespell-project/codespell 12 | # rev: v2.1.0 13 | # hooks: 14 | # - id: codespell 15 | # args: 16 | # - --ignore-words-list=nd,reacher,thist,ths,magent,ba 17 | # - --skip=docs/css/termynal.css,docs/js/termynal.js 18 | -------------------------------------------------------------------------------- /training/trl/examples/accelerate_configs/deepspeed_zero1.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | deepspeed_multinode_launcher: standard 5 | gradient_accumulation_steps: 1 6 | zero3_init_flag: false 7 | zero_stage: 1 8 | distributed_type: DEEPSPEED 9 | downcast_bf16: 'no' 10 | machine_rank: 0 11 | main_training_function: main 12 | mixed_precision: 'bf16' 13 | num_machines: 1 14 | num_processes: 8 15 | rdzv_backend: static 16 | same_network: true 17 | tpu_env: [] 18 | tpu_use_cluster: false 19 | tpu_use_sudo: false 20 | use_cpu: false 21 | -------------------------------------------------------------------------------- /training/open-r1/recipes/accelerate_configs/zero2.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | deepspeed_multinode_launcher: standard 5 | offload_optimizer_device: none 6 | offload_param_device: none 7 | zero3_init_flag: false 8 | zero_stage: 2 9 | distributed_type: DEEPSPEED 10 | downcast_bf16: 'no' 11 | machine_rank: 0 12 | main_training_function: main 13 | mixed_precision: bf16 14 | num_machines: 1 15 | num_processes: 8 16 | rdzv_backend: static 17 | same_network: true 18 | tpu_env: [] 19 | tpu_use_cluster: false 20 | tpu_use_sudo: false 21 | use_cpu: false -------------------------------------------------------------------------------- /training/trl/.github/workflows/build_pr_documentation.yml: -------------------------------------------------------------------------------- 1 | name: Build PR Documentation 2 | 3 | on: 4 | pull_request: 5 | 6 | concurrency: 7 | group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} 8 | cancel-in-progress: true 9 | 10 | jobs: 11 | build: 12 | uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main 13 | with: 14 | commit_sha: ${{ github.event.pull_request.head.sha }} 15 | pr_number: ${{ github.event.number }} 16 | package: trl 17 | version_tag_suffix: "" 18 | custom_container: huggingface/transformers-doc-builder 19 | -------------------------------------------------------------------------------- /training/trl/examples/accelerate_configs/deepspeed_zero2.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: 
LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | deepspeed_multinode_launcher: standard 5 | offload_optimizer_device: none 6 | offload_param_device: none 7 | zero3_init_flag: false 8 | zero_stage: 2 9 | distributed_type: DEEPSPEED 10 | downcast_bf16: 'no' 11 | machine_rank: 0 12 | main_training_function: main 13 | mixed_precision: 'bf16' 14 | num_machines: 1 15 | num_processes: 8 16 | rdzv_backend: static 17 | same_network: true 18 | tpu_env: [] 19 | tpu_use_cluster: false 20 | tpu_use_sudo: false 21 | use_cpu: false 22 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aiofiles==24.1.0 2 | aiohttp==3.11.13 3 | datasets==3.5.0 4 | distilabel==1.5.3 5 | e2b_code_interpreter==1.2.0 6 | Flask==3.1.0 7 | hf_transfer==0.1.9 8 | httpx==0.28.1 9 | huggingface_hub==0.30.2 10 | latex2sympy2_extended==1.10.1 11 | lighteval==0.8.1 12 | lm_eval==0.4.8 13 | math_verify==0.7.0 14 | matplotlib==3.10.1 15 | more_itertools==10.6.0 16 | numpy 17 | pytablewriter==1.2.1 18 | python-dotenv==1.1.0 19 | Requests==2.32.3 20 | scipy==1.15.2 21 | setuptools==75.8.0 22 | spacy==3.8.5 23 | tabulate==0.9.0 24 | torch==2.6.0 25 | tqdm==4.67.1 26 | transformers==4.51.2 27 | trl==0.16.1 28 | uvloop==0.21.0 29 | vllm==0.8.3 -------------------------------------------------------------------------------- /training/open-r1/recipes/accelerate_configs/zero3.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | deepspeed_multinode_launcher: standard 5 | offload_optimizer_device: none 6 | offload_param_device: none 7 | zero3_init_flag: true 8 | zero3_save_16bit_model: true 9 | zero_stage: 3 10 | distributed_type: DEEPSPEED 11 | downcast_bf16: 'no' 12 | machine_rank: 0 13 | main_training_function: main 14 | mixed_precision: bf16 15 | num_machines: 1 16 | num_processes: 8 17 | rdzv_backend: static 18 | same_network: true 19 | tpu_env: [] 20 | tpu_use_cluster: false 21 | tpu_use_sudo: false 22 | use_cpu: false 23 | -------------------------------------------------------------------------------- /training/trl/examples/accelerate_configs/deepspeed_zero3.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | deepspeed_multinode_launcher: standard 5 | offload_optimizer_device: none 6 | offload_param_device: none 7 | zero3_init_flag: true 8 | zero3_save_16bit_model: true 9 | zero_stage: 3 10 | distributed_type: DEEPSPEED 11 | downcast_bf16: 'no' 12 | machine_rank: 0 13 | main_training_function: main 14 | mixed_precision: bf16 15 | num_machines: 1 16 | num_processes: 8 17 | rdzv_backend: static 18 | same_network: true 19 | tpu_env: [] 20 | tpu_use_cluster: false 21 | tpu_use_sudo: false 22 | use_cpu: false 23 | -------------------------------------------------------------------------------- /training/trl/examples/research_projects/README.md: -------------------------------------------------------------------------------- 1 | # Research projects that use TRL 2 | 3 | Welcome to the research projects folder! Here you can find the scripts used for some research projects that use TRL and are maintained by the developers and the community (LM de-toxification, Stack-Llama, etc.). Check out the READMEs in the subfolders for more information!
4 | 5 | - [Detoxifying language models](https://github.com/huggingface/trl/tree/main/examples/research_projects/toxicity) 6 | - [Stack-Llama](https://github.com/huggingface/trl/tree/main/examples/research_projects/stack_llama) 7 | - [Stack-Llama-2](https://github.com/huggingface/trl/tree/main/examples/research_projects/stack_llama_2) -------------------------------------------------------------------------------- /training/trl/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.ruff] 2 | target-version = "py37" 3 | line-length = 119 4 | 5 | [tool.ruff.lint] 6 | ignore = [ 7 | "B028", # warning without explicit stacklevel 8 | "C408", # dict() calls (stylistic) 9 | "C901", # function complexity 10 | "E501", 11 | ] 12 | extend-select = ["E", "F", "I", "W", "UP", "B", "T", "C"] 13 | 14 | [tool.ruff.lint.per-file-ignores] 15 | # Allow prints in auxiliary scripts 16 | "examples/**.py" = ["T201"] 17 | "scripts/**.py" = ["T201"] 18 | # Ignore import violations in all `__init__.py` files. 19 | "__init__.py" = ["F401"] 20 | 21 | [tool.ruff.lint.isort] 22 | lines-after-imports = 2 23 | known-first-party = ["trl"] 24 | -------------------------------------------------------------------------------- /training/trl/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /training/trl/tests/slow/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /training/open-r1/src/open_r1/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /training/open-r1/slurm/piston/launch_piston_workers.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # this simple script will launch a bunch of piston workers on the HF science cluster 4 | 5 | N_INSTANCES=${1:-5} # Default to 5 instances 6 | 7 | for i in $(seq 1 $N_INSTANCES); do 8 | # Find random (hopefully) available port 9 | PORT=$(comm -23 <(seq 2000 10000 | sort) <(ss -tan | awk '{print $4}' | cut -d':' -f2 | sort -u) | shuf | head -n1) 10 | 11 | # the job name format is important for the code to then be able to get a list of workers. `piston-worker-` 12 | sbatch \ 13 | --job-name="piston-worker-$PORT" \ 14 | --export=ALL,PORT=$PORT \ 15 | slurm/piston/launch_single_piston.sh 16 | done -------------------------------------------------------------------------------- /lm_eval_files/aime/aime25_nofigures.yaml: -------------------------------------------------------------------------------- 1 | tag: 2 | - math_word_problems 3 | task: aime25_nofigures 4 | dataset_path: TIGER-Lab/AIME25 5 | dataset_name: default 6 | process_docs: !function utils.process_docs 7 | output_type: generate_until 8 | test_split: train 9 | doc_to_text: !function utils.doc_to_text 10 | doc_to_target: answer 11 | process_results: !function utils.process_results 12 | generation_kwargs: 13 | until: [] 14 | do_sample: false 15 | temperature: 0 16 | max_gen_toks: 32768 17 | metric_list: 18 | - metric: exact_match 19 | aggregation: mean 20 | higher_is_better: true 21 | - metric: extracted_answers 22 | aggregation: bypass 23 | higher_is_better: true 24 | metadata: 25 | version: 1.0 -------------------------------------------------------------------------------- /lm_eval_files/aime/aime_figures.yaml: -------------------------------------------------------------------------------- 1 | tag: 2 | - math_word_problems 3 | task: aime_figures 4 | dataset_path: simplescaling/aime_figures 5 | dataset_name: default 6 | process_docs: !function utils.process_docs 7 | output_type: generate_until 8 | test_split: train 9 | doc_to_text: !function utils.doc_to_text 10 | doc_to_target: answer 11 | process_results: !function utils.process_results 12 | generation_kwargs: 13 | until: [] 14 | do_sample: false 15 | temperature: 0 16 | max_gen_toks: 32768 17 | metric_list: 18 | - metric: exact_match 19 | aggregation: mean 20 | higher_is_better: true 21 | - metric: extracted_answers 22 | aggregation: bypass 23 | higher_is_better: true 24 | metadata: 25 | version: 1.0 -------------------------------------------------------------------------------- /lm_eval_files/vllm_speculative_init.py: -------------------------------------------------------------------------------- 1 | from . 
import ( 2 | anthropic_llms, 3 | api_models, 4 | dummy, 5 | gguf, 6 | hf_vlms, 7 | huggingface, 8 | mamba_lm, 9 | nemo_lm, 10 | neuralmagic, 11 | neuron_optimum, 12 | openai_completions, 13 | optimum_lm, 14 | textsynth, 15 | vllm_causallms, 16 | vllm_speculative, 17 | vllm_vlms, 18 | ) 19 | 20 | 21 | # TODO: implement __all__ 22 | 23 | 24 | try: 25 | # enable hf hub transfer if available 26 | import hf_transfer # type: ignore # noqa 27 | import huggingface_hub.constants # type: ignore 28 | 29 | huggingface_hub.constants.HF_HUB_ENABLE_HF_TRANSFER = True 30 | except ImportError: 31 | pass 32 | -------------------------------------------------------------------------------- /lm_eval_files/aime/aime_nofigures.yaml: -------------------------------------------------------------------------------- 1 | tag: 2 | - math_word_problems 3 | task: aime_nofigures 4 | dataset_path: simplescaling/aime_nofigures 5 | dataset_name: default 6 | process_docs: !function utils.process_docs 7 | output_type: generate_until 8 | test_split: train 9 | doc_to_text: !function utils.doc_to_text 10 | doc_to_target: answer 11 | process_results: !function utils.process_results 12 | generation_kwargs: 13 | until: [] 14 | do_sample: false 15 | temperature: 0 16 | max_gen_toks: 32768 17 | metric_list: 18 | - metric: exact_match 19 | aggregation: mean 20 | higher_is_better: true 21 | - metric: extracted_answers 22 | aggregation: bypass 23 | higher_is_better: true 24 | metadata: 25 | version: 1.0 -------------------------------------------------------------------------------- /lm_eval_files/aime/aime24_nofigures.yaml: -------------------------------------------------------------------------------- 1 | tag: 2 | - math_word_problems 3 | task: aime24_nofigures 4 | dataset_path: simplescaling/aime24_nofigures 5 | dataset_name: default 6 | process_docs: !function utils.process_docs 7 | output_type: generate_until 8 | test_split: train 9 | doc_to_text: !function utils.doc_to_text 10 | doc_to_target: answer 11 | process_results: !function utils.process_results 12 | generation_kwargs: 13 | until: [] 14 | do_sample: false 15 | temperature: 0 16 | max_gen_toks: 32768 17 | metric_list: 18 | - metric: exact_match 19 | aggregation: mean 20 | higher_is_better: true 21 | - metric: extracted_answers 22 | aggregation: bypass 23 | higher_is_better: true 24 | metadata: 25 | version: 1.0 -------------------------------------------------------------------------------- /training/trl/.github/workflows/codeQL.yml: -------------------------------------------------------------------------------- 1 | name: "CodeQL Analysis - Workflows" 2 | 3 | on: 4 | workflow_dispatch: 5 | 6 | jobs: 7 | analyze: 8 | name: "Analyze GitHub Workflows" 9 | runs-on: ubuntu-latest 10 | permissions: 11 | security-events: write 12 | actions: read 13 | contents: read 14 | 15 | steps: 16 | - name: "Checkout repository" 17 | uses: actions/checkout@v4 18 | 19 | - name: "Initialize CodeQL" 20 | uses: github/codeql-action/init@v2 21 | with: 22 | languages: "yaml" 23 | queries: +security-and-quality, ./.github/codeql/custom-queries.qls 24 | 25 | - name: "Perform CodeQL Analysis" 26 | uses: github/codeql-action/analyze@v2 27 | -------------------------------------------------------------------------------- /lm_eval_files/aime/aime24_figures.yaml: -------------------------------------------------------------------------------- 1 | tag: 2 | - math_word_problems 3 | task: aime24_figures 4 | dataset_path: simplescaling/aime24_figures 5 | dataset_name: default 6 | process_docs: !function 
utils.process_docs 7 | output_type: generate_until 8 | test_split: train 9 | doc_to_text: !function utils.doc_to_text 10 | doc_to_target: answer 11 | process_results: !function utils.process_results 12 | generation_kwargs: 13 | until: [] 14 | do_sample: false 15 | temperature: 0 16 | max_gen_toks: 32768 17 | metric_list: 18 | - metric: exact_match 19 | aggregation: mean 20 | higher_is_better: true 21 | - metric: exact_match_aime24 22 | aggregation: mean_last30 23 | higher_is_better: true 24 | - metric: extracted_answers 25 | aggregation: bypass 26 | higher_is_better: true 27 | metadata: 28 | version: 1.0 -------------------------------------------------------------------------------- /lm_eval_files/openai_math/openai_math.yaml: -------------------------------------------------------------------------------- 1 | tag: 2 | - math_word_problems 3 | task: openai_math 4 | dataset_path: simplescaling/openaimath 5 | process_docs: !function utils.process_docs 6 | output_type: generate_until 7 | test_split: test 8 | doc_to_text: !function utils.doc_to_text 9 | doc_to_target: answer 10 | process_results: !function utils.process_results 11 | generation_kwargs: 12 | until: [] 13 | do_sample: false 14 | temperature: 0 15 | max_gen_toks: 2048 # 2x of https://github.com/openai/simple-evals/blob/c0dba4c7bfbc17f786aec7bd7c3585a36ad81f23/sampler/chat_completion_sampler.py#L27 16 | metric_list: 17 | - metric: exact_match 18 | aggregation: mean 19 | higher_is_better: true 20 | - metric: extracted_answers 21 | aggregation: bypass 22 | higher_is_better: true 23 | metadata: 24 | version: 1.0 -------------------------------------------------------------------------------- /training/trl/tests/testing_constants.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | CI_HUB_USER = "__DUMMY_TRANSFORMERS_USER__" 16 | CI_HUB_USER_FULL_NAME = "Dummy User" 17 | 18 | CI_HUB_ENDPOINT = "https://hub-ci.huggingface.co" 19 | -------------------------------------------------------------------------------- /training/trl/examples/notebooks/README.md: -------------------------------------------------------------------------------- 1 | # Notebooks 2 | 3 | This directory contains a collection of Jupyter notebooks that demonstrate how to use the TRL library in different applications. 4 | 5 | - [`best_of_n.ipynb`](https://github.com/huggingface/trl/tree/main/examples/notebooks/best_of_n.ipynb): This notebook demonstrates how to use the "Best of N" sampling strategy using TRL when fine-tuning your model with PPO. 6 | - [`gpt2-sentiment.ipynb`](https://github.com/huggingface/trl/tree/main/examples/notebooks/gpt2-sentiment.ipynb): This notebook demonstrates how to reproduce the GPT2 imdb sentiment tuning example on a jupyter notebook. 
7 | - [`gpt2-control.ipynb`](https://github.com/huggingface/trl/tree/main/examples/notebooks/gpt2-sentiment-control.ipynb): This notebook demonstrates how to reproduce the GPT2 sentiment control example on a jupyter notebook. 8 | -------------------------------------------------------------------------------- /training/open-r1/.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - v*-release 8 | pull_request: 9 | branches: 10 | - main 11 | 12 | jobs: 13 | 14 | tests: 15 | name: Run tests and quality checks 16 | runs-on: ubuntu-latest 17 | steps: 18 | - name: Checkout code 19 | uses: actions/checkout@v4 20 | - name: Setup Python environment 21 | uses: actions/setup-python@v5 22 | with: 23 | python-version: 3.10.10 24 | - name: Install dependencies 25 | run: | 26 | python -m pip install --upgrade pip 27 | python -m pip install ".[quality,tests]" 28 | - name: Code quality 29 | run: | 30 | make quality 31 | - name: Run tests 32 | run: | 33 | make test 34 | 35 | -------------------------------------------------------------------------------- /training/open-r1/setup.cfg: -------------------------------------------------------------------------------- 1 | [isort] 2 | default_section = FIRSTPARTY 3 | ensure_newline_before_comments = True 4 | force_grid_wrap = 0 5 | include_trailing_comma = True 6 | known_first_party = open_r1 7 | known_third_party = 8 | transformers 9 | datasets 10 | fugashi 11 | git 12 | h5py 13 | matplotlib 14 | nltk 15 | numpy 16 | packaging 17 | pandas 18 | psutil 19 | pytest 20 | rouge_score 21 | sacrebleu 22 | seqeval 23 | sklearn 24 | streamlit 25 | torch 26 | tqdm 27 | 28 | line_length = 119 29 | lines_after_imports = 2 30 | multi_line_output = 3 31 | use_parentheses = True 32 | 33 | [flake8] 34 | ignore = E203, E501, E741, W503, W605 35 | max-line-length = 119 36 | per-file-ignores = 37 | # imported but unused 38 | __init__.py: F401 39 | 40 | [tool:pytest] 41 | doctest_optionflags=NUMBER NORMALIZE_WHITESPACE ELLIPSIS -------------------------------------------------------------------------------- /lm_eval_files/openai_math/openai_math_train.yaml: -------------------------------------------------------------------------------- 1 | group: 2 | - math_word_problems 3 | task: openai_math_train 4 | dataset_path: simplescaling/openaimath 5 | process_docs: !function utils.process_docs 6 | output_type: generate_until 7 | test_split: train 8 | doc_to_text: !function utils.doc_to_text 9 | doc_to_target: answer 10 | process_results: !function utils.process_results 11 | generation_kwargs: 12 | until: 13 | - "Problem:" 14 | skip_special_tokens: false 15 | do_sample: false 16 | temperature: 0.0 17 | max_gen_toks: 2048 # 2x of https://github.com/openai/simple-evals/blob/c0dba4c7bfbc17f786aec7bd7c3585a36ad81f23/sampler/chat_completion_sampler.py#L27 18 | metric_list: 19 | - metric: exact_match 20 | aggregation: mean 21 | higher_is_better: true 22 | - metric: extracted_answers 23 | aggregation: bypass 24 | higher_is_better: true 25 | metadata: 26 | version: 1.0 -------------------------------------------------------------------------------- /training/trl/docs/source/data_utils.md: -------------------------------------------------------------------------------- 1 | # Data Utilities 2 | 3 | ## is_conversational 4 | 5 | [[autodoc]] is_conversational 6 | 7 | ## apply_chat_template 8 | 9 | [[autodoc]] apply_chat_template 10 | 11 | ## maybe_apply_chat_template 12 | 13 | 
[[autodoc]] maybe_apply_chat_template 14 | 15 | ## maybe_convert_to_chatml 16 | 17 | [[autodoc]] maybe_convert_to_chatml 18 | 19 | ## extract_prompt 20 | 21 | [[autodoc]] extract_prompt 22 | 23 | ## maybe_extract_prompt 24 | 25 | [[autodoc]] maybe_extract_prompt 26 | 27 | ## unpair_preference_dataset 28 | 29 | [[autodoc]] unpair_preference_dataset 30 | 31 | ## maybe_unpair_preference_dataset 32 | 33 | [[autodoc]] maybe_unpair_preference_dataset 34 | 35 | ## pack_examples 36 | 37 | [[autodoc]] pack_examples 38 | 39 | ## pack_dataset 40 | 41 | [[autodoc]] pack_dataset 42 | 43 | ## truncate_dataset 44 | 45 | [[autodoc]] truncate_dataset 46 | -------------------------------------------------------------------------------- /training/open-r1/recipes/accelerate_configs/fsdp.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | distributed_type: FSDP 4 | downcast_bf16: 'no' 5 | enable_cpu_affinity: false 6 | fsdp_config: 7 | fsdp_activation_checkpointing: false # Need fix from: https://github.com/huggingface/transformers/pull/36610 8 | fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP 9 | fsdp_backward_prefetch: BACKWARD_PRE 10 | fsdp_cpu_ram_efficient_loading: true 11 | fsdp_forward_prefetch: true 12 | fsdp_offload_params: false 13 | fsdp_sharding_strategy: FULL_SHARD 14 | fsdp_state_dict_type: FULL_STATE_DICT 15 | fsdp_sync_module_states: true 16 | fsdp_use_orig_params: true 17 | machine_rank: 0 18 | main_training_function: main 19 | mixed_precision: bf16 20 | num_machines: 1 21 | num_processes: 8 22 | rdzv_backend: static 23 | same_network: true 24 | tpu_env: [] 25 | tpu_use_cluster: false 26 | tpu_use_sudo: false 27 | use_cpu: false -------------------------------------------------------------------------------- /training/open-r1/base_training.sh: -------------------------------------------------------------------------------- 1 | 2 | 3 | ##### TRAINING SFT SCRIPT ##### 4 | 5 | ACCELERATE_LOG_LEVEL=info MASTER_PORT=29501 accelerate launch --main_process_port 29502 --config_file recipes/accelerate_configs/zero3.yaml \ 6 | src/open_r1/sft.py \ 7 | --config recipes/DeepSeek-R1-Distill-Qwen-1.5B/sft/config_demo.yaml --wandb_project SpeculativeReasoning --run_name DeepSeek-R1-Distill-Qwen-1.5B-SpeculativeReasoning 8 | 9 | ##### GRPO SCRIPT ##### 10 | 11 | CUDA_VISIBLE_DEVICES=0 trl vllm-serve --model akhauriyash/DeepSeek-R1-Distill-Qwen-1.5B-SpeculativeReasoner 12 | 13 | CUDA_VISIBLE_DEVICES=1,2,3,4,5,6,7 ACCELERATE_LOG_LEVEL=info \ 14 | accelerate launch --config_file recipes/accelerate_configs/zero2.yaml --num_processes 7 \ 15 | src/open_r1/grpo.py --config recipes/DeepSeek-R1-Distill-Qwen-1.5B/grpo/config_demo.yaml --wandb_project SpeculativeReasoning --run_name DeepSeek-R1-Distill-Qwen-1.5B-GRPO-SpeculativeReasoner 16 | 17 | 18 | -------------------------------------------------------------------------------- /training/open-r1/src/open_r1/utils/import_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from transformers.utils.import_utils import _is_package_available 16 | 17 | 18 | # Use same as transformers.utils.import_utils 19 | _e2b_available = _is_package_available("e2b") 20 | 21 | 22 | def is_e2b_available() -> bool: 23 | return _e2b_available 24 | -------------------------------------------------------------------------------- /lm_eval_files/openai_math/openai_math_cov64.yaml: -------------------------------------------------------------------------------- 1 | group: 2 | - math_word_problems 3 | task: openai_math_cov64 4 | dataset_path: simplescaling/openaimath 5 | process_docs: !function utils.process_docs 6 | output_type: generate_until 7 | test_split: test 8 | doc_to_text: !function utils.doc_to_text 9 | doc_to_target: answer 10 | process_results: !function utils.process_results 11 | generation_kwargs: 12 | until: [] 13 | do_sample: true 14 | temperature: 0.5 15 | max_gen_toks: 2048 # 2x of https://github.com/openai/simple-evals/blob/c0dba4c7bfbc17f786aec7bd7c3585a36ad81f23/sampler/chat_completion_sampler.py#L27 16 | metric_list: 17 | - metric: exact_match 18 | aggregation: mean 19 | higher_is_better: true 20 | repeats: 64 21 | filter_list: 22 | - name: "cov@64" # get coverage@64 , via allowing all 64 samples and then picking only the correct one in the evaluator. 23 | filter: 24 | - function: "take_first_k" 25 | k: 64 26 | metadata: 27 | version: 1.0 -------------------------------------------------------------------------------- /logging_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 1, 3 | "formatters": { 4 | "default": { 5 | "class": "vllm.logging_utils.NewLineFormatter", 6 | "datefmt": "%m-%d %H:%M:%S", 7 | "format": "%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s" 8 | } 9 | }, 10 | "handlers": { 11 | "console_error": { 12 | "class": "logging.StreamHandler", 13 | "formatter": "default", 14 | "level": "ERROR", 15 | "stream": "ext://sys.stdout" 16 | }, 17 | "console_info": { 18 | "class": "logging.StreamHandler", 19 | "formatter": "default", 20 | "level": "INFO", 21 | "stream": "ext://sys.stdout" 22 | } 23 | }, 24 | "loggers": { 25 | "vllm": { 26 | "handlers": ["console_error"], 27 | "level": "ERROR", 28 | "propagate": false 29 | }, 30 | "vllm.metrics": { 31 | "handlers": ["console_info"], 32 | "level": "INFO", 33 | "propagate": false 34 | } 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /training/trl/examples/scripts/dpo.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | ############################################################################################### 16 | # This file has been moved to https://github.com/huggingface/trl/blob/main/trl/scripts/dpo.py # 17 | ############################################################################################### 18 | -------------------------------------------------------------------------------- /training/trl/examples/scripts/sft.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | ############################################################################################### 16 | # This file has been moved to https://github.com/huggingface/trl/blob/main/trl/scripts/sft.py # 17 | ############################################################################################### 18 | -------------------------------------------------------------------------------- /lm_eval_files/openai_math/openai_math_cov64_train.yaml: -------------------------------------------------------------------------------- 1 | group: 2 | - math_word_problems 3 | task: openai_math_cov64_train 4 | dataset_path: simplescaling/openaimath 5 | process_docs: !function utils.process_docs 6 | output_type: generate_until 7 | test_split: train 8 | doc_to_text: !function utils.doc_to_text 9 | doc_to_target: answer 10 | process_results: !function utils.process_results 11 | generation_kwargs: 12 | until: 13 | - "Problem:" 14 | do_sample: true 15 | temperature: 0.5 16 | max_gen_toks: 2048 # 2x of https://github.com/openai/simple-evals/blob/c0dba4c7bfbc17f786aec7bd7c3585a36ad81f23/sampler/chat_completion_sampler.py#L27 17 | metric_list: 18 | - metric: exact_match 19 | aggregation: mean 20 | higher_is_better: true 21 | repeats: 64 22 | filter_list: 23 | - name: "cov@64" # get coverage@64 , via allowing all 64 samples and then picking only the correct one in the evaluator. 
24 | filter: 25 | - function: "take_first_k" 26 | k: 64 27 | metadata: 28 | version: 1.0 -------------------------------------------------------------------------------- /training/trl/.github/workflows/clear_cache.yml: -------------------------------------------------------------------------------- 1 | name: "Cleanup Cache" 2 | 3 | on: 4 | workflow_dispatch: 5 | schedule: 6 | - cron: "0 0 * * *" 7 | 8 | jobs: 9 | cleanup: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Check out code 13 | uses: actions/checkout@v4 14 | 15 | - name: Cleanup 16 | run: | 17 | gh extension install actions/gh-actions-cache 18 | 19 | REPO=${{ github.repository }} 20 | 21 | echo "Fetching list of cache key" 22 | cacheKeysForPR=$(gh actions-cache list -R $REPO | cut -f 1 ) 23 | 24 | ## Setting this to not fail the workflow while deleting cache keys. 25 | set +e 26 | echo "Deleting caches..." 27 | for cacheKey in $cacheKeysForPR 28 | do 29 | gh actions-cache delete $cacheKey -R $REPO --confirm 30 | done 31 | echo "Done" 32 | env: 33 | GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} 34 | -------------------------------------------------------------------------------- /training/trl/docs/source/installation.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | You can install TRL either from PyPI or from source: 3 | 4 | ## PyPI 5 | Install the library with pip or [uv](https://docs.astral.sh/uv/): 6 | 7 | 8 | 9 | 10 | uv is a fast Rust-based Python package and project manager. Refer to [Installation](https://docs.astral.sh/uv/getting-started/installation/) for installation instructions. 11 | 12 | ```bash 13 | uv pip install trl 14 | ``` 15 | 16 | 17 | 18 | 19 | ```bash 20 | pip install trl 21 | ``` 22 | 23 | 24 | 25 | 26 | ## Source 27 | You can also install the latest version from source. First clone the repo and then run the installation with `pip`: 28 | 29 | ```bash 30 | git clone https://github.com/huggingface/trl.git 31 | cd trl/ 32 | pip install -e . 33 | ``` 34 | 35 | If you want the development install you can replace the pip install with the following: 36 | 37 | ```bash 38 | pip install -e ".[dev]" 39 | ``` 40 | -------------------------------------------------------------------------------- /training/trl/docs/source/models.md: -------------------------------------------------------------------------------- 1 | # Models 2 | 3 | With the `AutoModelForCausalLMWithValueHead` class TRL supports all decoder model architectures in transformers such as GPT-2, OPT, and GPT-Neo. In addition, with `AutoModelForSeq2SeqLMWithValueHead` you can use encoder-decoder architectures such as T5. TRL also requires reference models which are frozen copies of the model that is trained. With `create_reference_model` you can easily create a frozen copy and also share layers between the two models to save memory.
4 | 5 | ## PreTrainedModelWrapper 6 | 7 | [[autodoc]] PreTrainedModelWrapper 8 | 9 | ## AutoModelForCausalLMWithValueHead 10 | 11 | 12 | [[autodoc]] AutoModelForCausalLMWithValueHead 13 | - __init__ 14 | - forward 15 | - generate 16 | - _init_weights 17 | 18 | ## AutoModelForSeq2SeqLMWithValueHead 19 | 20 | [[autodoc]] AutoModelForSeq2SeqLMWithValueHead 21 | - __init__ 22 | - forward 23 | - generate 24 | - _init_weights 25 | 26 | ## create_reference_model 27 | 28 | [[autodoc]] create_reference_model -------------------------------------------------------------------------------- /training/open-r1/slurm/README.md: -------------------------------------------------------------------------------- 1 | ## Serving DeepSeek-R1 on 2x8 H100 SLURM nodes with SGLang 2 | 3 | 1. Set up the environment (adjust for your cuda version): 4 | ```bash 5 | conda create -n sglang124 python=3.11 6 | conda activate sglang124 7 | 8 | pip install torch==2.5.1 --index-url https://download.pytorch.org/whl/cu124 9 | 10 | pip install sgl-kernel --force-reinstall --no-deps 11 | pip install "sglang[all]>=0.4.2.post4" --find-links https://flashinfer.ai/whl/cu124/torch2.5/flashinfer/ 12 | ``` 13 | 14 | 2. Run the server and wait for the model to load: 15 | ```bash 16 | sbatch slurm/serve_r1.slurm -m "/fsx/deepseek-r1-checkpoint" -e "sglang124" 17 | ``` 18 | 19 | 3. Run the data generation script: 20 | ```bash 21 | python scripts/generate_reasoning.py \ 22 | --dataset-name "AI-MO/NuminaMath-1.5" \ 23 | --output-file "numinamath_r1_generations.jsonl" \ 24 | --prompt-column "problem" \ 25 | --uuid-column "problem" \ 26 | --api-addr ":39877" \ 27 | --num-generations 2 \ 28 | --max-tokens 16384 \ 29 | --max-concurrent 200 30 | ``` -------------------------------------------------------------------------------- /training/trl/examples/accelerate_configs/fsdp_qlora.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | distributed_type: FSDP 4 | downcast_bf16: 'no' 5 | fsdp_config: 6 | fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP 7 | fsdp_backward_prefetch: BACKWARD_PRE 8 | fsdp_cpu_ram_efficient_loading: true 9 | fsdp_forward_prefetch: false 10 | fsdp_offload_params: true 11 | fsdp_sharding_strategy: FULL_SHARD 12 | fsdp_state_dict_type: SHARDED_STATE_DICT 13 | fsdp_sync_module_states: true 14 | fsdp_use_orig_params: false 15 | machine_rank: 0 16 | main_training_function: main 17 | mixed_precision: 'bf16' 18 | num_machines: 1 19 | num_processes: 8 20 | rdzv_backend: static 21 | same_network: true 22 | tpu_env: [] 23 | tpu_use_cluster: false 24 | tpu_use_sudo: false 25 | use_cpu: false -------------------------------------------------------------------------------- /training/trl/trl/extras/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from typing import TYPE_CHECKING 16 | 17 | from ..import_utils import _LazyModule 18 | 19 | 20 | _import_structure = { 21 | "best_of_n_sampler": ["BestOfNSampler"], 22 | } 23 | 24 | if TYPE_CHECKING: 25 | from .best_of_n_sampler import BestOfNSampler 26 | else: 27 | import sys 28 | 29 | sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) 30 | -------------------------------------------------------------------------------- /training/trl/.github/codeql/custom-queries.qls: -------------------------------------------------------------------------------- 1 | import codeql 2 | 3 | from WorkflowString interpolation, Workflow workflow, Step step 4 | where 5 | ( 6 | interpolation.getStringValue().matches("${{ github.event.issue.title }}") or 7 | interpolation.getStringValue().matches("${{ github.event.issue.body }}") or 8 | interpolation.getStringValue().matches("${{ github.event.pull_request.title }}") or 9 | interpolation.getStringValue().matches("${{ github.event.pull_request.body }}") or 10 | interpolation.getStringValue().matches("${{ github.event.review.body }}") or 11 | interpolation.getStringValue().matches("${{ github.event.comment.body }}") or 12 | interpolation.getStringValue().matches("${{ github.event.inputs.* }}") or 13 | interpolation.getStringValue().matches("${{ github.event.head_commit.message }}") or 14 | interpolation.getStringValue().matches("${{ github.event.* }}") 15 | ) and 16 | ( 17 | step.getKey() = "run" or // Injection in run 18 | step.getKey() = "env" or // Injection via env 19 | step.getKey() = "with" // Injection via with 20 | ) 21 | select workflow, "🚨 Do not use event data directly as input to an action" 22 | -------------------------------------------------------------------------------- /training/trl/trl/environment/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from typing import TYPE_CHECKING 16 | 17 | from ..import_utils import _LazyModule 18 | 19 | 20 | _import_structure = { 21 | "base_environment": ["TextEnvironment", "TextHistory"], 22 | } 23 | 24 | if TYPE_CHECKING: 25 | from .base_environment import TextEnvironment, TextHistory 26 | else: 27 | import sys 28 | 29 | sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) 30 | -------------------------------------------------------------------------------- /training/trl/trl/scripts/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from typing import TYPE_CHECKING 16 | 17 | from ..import_utils import _LazyModule 18 | 19 | 20 | _import_structure = { 21 | "utils": ["init_zero_verbose", "ScriptArguments", "TrlParser"], 22 | } 23 | 24 | if TYPE_CHECKING: 25 | from .utils import ScriptArguments, TrlParser, init_zero_verbose 26 | else: 27 | import sys 28 | 29 | sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) 30 | -------------------------------------------------------------------------------- /training/trl/tests/slow/testing_constants.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | MODELS_TO_TEST = [ 16 | "trl-internal-testing/tiny-LlamaForCausalLM-3.2", 17 | "trl-internal-testing/tiny-MistralForCausalLM-0.2", 18 | ] 19 | 20 | # We could have also not declared these variables but let's be verbose 21 | PACKING_OPTIONS = [True, False] 22 | GRADIENT_CHECKPOINTING_KWARGS = [None, {"use_reentrant": False}, {"use_reentrant": True}] 23 | DEVICE_MAP_OPTIONS = [{"": 0}, "auto"] 24 | 25 | DPO_LOSS_TYPES = ["sigmoid", "ipo"] 26 | DPO_PRECOMPUTE_LOGITS = [True, False] 27 | -------------------------------------------------------------------------------- /training/open-r1/recipes/Qwen2.5-1.5B-Instruct/sft/config_demo.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: Qwen/Qwen2.5-1.5B-Instruct 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: open-r1/OpenR1-Math-220k 9 | dataset_num_proc: 48 10 | 11 | # SFT trainer config 12 | bf16: true 13 | do_eval: false 14 | eval_strategy: 'no' 15 | gradient_accumulation_steps: 1 16 | gradient_checkpointing: true 17 | gradient_checkpointing_kwargs: 18 | use_reentrant: false 19 | hub_model_id: Qwen2.5-1.5B-Open-R1-Distill 20 | hub_strategy: every_save 21 | learning_rate: 5.0e-05 22 | log_level: info 23 | logging_steps: 5 24 | logging_strategy: steps 25 | lr_scheduler_type: cosine_with_min_lr 26 | lr_scheduler_kwargs: 27 | min_lr_rate: 0.1 28 | packing: true 29 | max_length: 16384 30 | max_steps: -1 31 | num_train_epochs: 1 32 | output_dir: data/Qwen2.5-1.5B-Open-R1-Distill 33 | overwrite_output_dir: true 34 | per_device_eval_batch_size: 16 35 | per_device_train_batch_size: 16 36 | push_to_hub: true 37 | report_to: 38 | - 
wandb 39 | save_strategy: "steps" 40 | save_steps: 100 41 | save_total_limit: 1 42 | seed: 42 43 | use_liger: true 44 | warmup_ratio: 0.05 -------------------------------------------------------------------------------- /training/trl/.github/ISSUE_TEMPLATE/feature-request.yml: -------------------------------------------------------------------------------- 1 | name: "\U0001F680 Feature request" 2 | description: Submit a proposal/request for a new TRL feature 3 | labels: [ "Feature request" ] 4 | body: 5 | - type: textarea 6 | id: feature-request 7 | validations: 8 | required: true 9 | attributes: 10 | label: Feature request 11 | description: | 12 | A clear and concise description of the feature proposal. Please provide a link to the paper and code in case they exist. 13 | 14 | - type: textarea 15 | id: motivation 16 | validations: 17 | required: true 18 | attributes: 19 | label: Motivation 20 | description: | 21 | Please outline the motivation for the proposal. Is your feature request related to a problem? e.g., I'm always frustrated when [...]. If this is related to another GitHub issue, please link it here too. 22 | 23 | 24 | - type: textarea 25 | id: contribution 26 | validations: 27 | required: true 28 | attributes: 29 | label: Your contribution 30 | description: | 31 | Is there any way that you could help, e.g. by submitting a PR? Make sure to read the CONTRIBUTING.md [readme](https://github.com/huggingface/trl/blob/main/CONTRIBUTING.md) 32 | -------------------------------------------------------------------------------- /training/trl/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: test precommit common_tests slow_tests test_examples tests_gpu 2 | 3 | check_dirs := examples tests trl 4 | 5 | ACCELERATE_CONFIG_PATH = `pwd`/examples/accelerate_configs 6 | COMMAND_FILES_PATH = `pwd`/commands 7 | 8 | test: 9 | python -m pytest -n auto --dist=loadfile -s -v --reruns 5 --reruns-delay 1 --only-rerun '(OSError|Timeout|HTTPError.*502|HTTPError.*504|not less than or equal to 0.01)' ./tests/ 10 | 11 | precommit: 12 | python scripts/add_copyrights.py 13 | pre-commit run --all-files 14 | 15 | tests_gpu: 16 | python -m pytest tests/test_* $(if $(IS_GITHUB_CI),--report-log "common_tests.log",) 17 | 18 | slow_tests: 19 | python -m pytest tests/slow/test_* $(if $(IS_GITHUB_CI),--report-log "slow_tests.log",) 20 | 21 | test_examples: 22 | touch temp_results_sft_tests.txt 23 | for file in $(ACCELERATE_CONFIG_PATH)/*.yaml; do \ 24 | TRL_ACCELERATE_CONFIG=$${file} bash $(COMMAND_FILES_PATH)/run_sft.sh; \ 25 | echo $$?','$${file} >> temp_results_sft_tests.txt; \ 26 | done 27 | 28 | touch temp_results_dpo_tests.txt 29 | for file in $(ACCELERATE_CONFIG_PATH)/*.yaml; do \ 30 | TRL_ACCELERATE_CONFIG=$${file} bash $(COMMAND_FILES_PATH)/run_dpo.sh; \ 31 | echo $$?','$${file} >> temp_results_dpo_tests.txt; \ 32 | done 33 | -------------------------------------------------------------------------------- /training/trl/CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | title: 'TRL: Transformer Reinforcement Learning' 3 | message: >- 4 | If you use this software, please cite it using the 5 | metadata from this file.
6 | type: software 7 | authors: 8 | - given-names: Leandro 9 | family-names: von Werra 10 | - given-names: Younes 11 | family-names: Belkada 12 | - given-names: Lewis 13 | family-names: Tunstall 14 | - given-names: Edward 15 | family-names: Beeching 16 | - given-names: Tristan 17 | family-names: Thrush 18 | - given-names: Nathan 19 | family-names: Lambert 20 | - given-names: Shengyi 21 | family-names: Huang 22 | - given-names: Kashif 23 | family-names: Rasul 24 | - given-names: Quentin 25 | family-names: Gallouédec 26 | repository-code: 'https://github.com/huggingface/trl' 27 | abstract: "With trl you can train transformer language models with Proximal Policy Optimization (PPO). The library is built on top of the transformers library by \U0001F917 Hugging Face. Therefore, pre-trained language models can be directly loaded via transformers. At this point, most decoder and encoder-decoder architectures are supported." 28 | keywords: 29 | - rlhf 30 | - deep-learning 31 | - pytorch 32 | - transformers 33 | license: Apache-2.0 34 | version: 0.16 35 | -------------------------------------------------------------------------------- /training/open-r1/recipes/OlympicCoder-7B/sft/config_v00.00.yaml: -------------------------------------------------------------------------------- 1 | # Config for 1 node of 8 H100s with DeepSpeed ZeRO-3 2 | # Model arguments 3 | model_name_or_path: Qwen/Qwen2.5-Coder-7B-Instruct 4 | model_revision: main 5 | torch_dtype: bfloat16 6 | attn_implementation: flash_attention_2 7 | 8 | # Data training arguments 9 | dataset_name: open-r1/codeforces-cots 10 | dataset_config: solutions_decontaminated 11 | dataset_num_proc: 48 12 | 13 | # SFT trainer config 14 | bf16: true 15 | do_eval: false 16 | eval_strategy: 'no' 17 | gradient_accumulation_steps: 8 18 | gradient_checkpointing: true 19 | gradient_checkpointing_kwargs: 20 | use_reentrant: false 21 | hub_model_id: open-r1/OlympicCoder-7B 22 | hub_strategy: every_save 23 | learning_rate: 1.0e-05 24 | log_level: info 25 | logging_steps: 1 26 | logging_strategy: steps 27 | lr_scheduler_type: cosine_with_min_lr 28 | lr_scheduler_kwargs: 29 | min_lr_rate: 0.1 30 | packing: false 31 | max_grad_norm: 0.2 32 | max_length: 32768 33 | max_steps: -1 34 | num_train_epochs: 10 35 | output_dir: data/OlympicCoder-7B 36 | overwrite_output_dir: true 37 | per_device_eval_batch_size: 1 38 | per_device_train_batch_size: 2 39 | push_to_hub: true 40 | report_to: 41 | - wandb 42 | save_strategy: epoch 43 | save_total_limit: 1 44 | seed: 42 45 | use_liger: true 46 | warmup_ratio: 0.03 -------------------------------------------------------------------------------- /training/open-r1/scripts/get_tensor_parallel_size.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from transformers import AutoConfig 3 | from math import gcd 4 | 5 | def get_tensor_parallel_size(model_name: str, revision: str = None, default_tp: int = 8) -> int: 6 | try: 7 | config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=True) 8 | num_heads = getattr(config, 'num_attention_heads', None) 9 | 10 | if num_heads is not None and num_heads % default_tp != 0: 11 | tp = gcd(num_heads, default_tp) 12 | return max(tp, 1) 13 | else: 14 | return default_tp 15 | except Exception as e: 16 | print(f"Warning: Failed to fetch config for {model_name}@{revision}: {e}") 17 | return default_tp 18 | 19 | if __name__ == "__main__": 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument("--model_name", 
type=str, required=True, help="Hugging Face model name or path") 22 | parser.add_argument("--revision", type=str, default=None, help="Model revision if applicable") 23 | parser.add_argument("--default_tp", type=int, default=8, help="Default TP size (usually GPUs per node)") 24 | 25 | args = parser.parse_args() 26 | 27 | tp = get_tensor_parallel_size(args.model_name, args.revision, args.default_tp) 28 | print(tp) 29 | -------------------------------------------------------------------------------- /annotated_dataset/strlen_analysis.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | from datasets import load_dataset 3 | from tqdm import tqdm 4 | 5 | # Load the dataset 6 | ds = load_dataset("open-r1/OpenR1-Math-220k", "default") 7 | train_ds = ds['train'] 8 | 9 | # Collect data 10 | generation_lengths = [] 11 | generation_ids = [] 12 | gen_id = 0 13 | for item in tqdm(train_ds): 14 | for gen in item['generations']: 15 | generation_lengths.append(len(gen)) 16 | generation_ids.append(gen_id) 17 | gen_id += 1 18 | 19 | # Plot: Generation ID vs. String Length 20 | plt.figure(figsize=(10, 6), dpi=600) 21 | plt.plot(generation_ids, generation_lengths, linewidth=0.5) 22 | plt.xlabel("Generation ID") 23 | plt.ylabel("String Length") 24 | plt.title("Generation ID vs. String Length") 25 | plt.tight_layout() 26 | plt.savefig("genid_vs_strlen.png") 27 | plt.close() 28 | 29 | # Plot: Histogram of String Lengths (bin size 500) 30 | plt.figure(figsize=(10, 6), dpi=600) 31 | bins = range(0, max(generation_lengths) + 500, 500) 32 | plt.hist(generation_lengths, bins=bins, edgecolor='black') 33 | plt.xlabel("String Length") 34 | plt.ylabel("Frequency") 35 | plt.title("Histogram of Generation String Lengths") 36 | plt.tight_layout() 37 | plt.savefig("strlen_hist.png") 38 | plt.close() 39 | 40 | print("Plots saved as 'genid_vs_strlen.png' and 'strlen_hist.png'") 41 | -------------------------------------------------------------------------------- /training/trl/.github/ISSUE_TEMPLATE/new-trainer-addition.yml: -------------------------------------------------------------------------------- 1 | name: "\U0001F31F New trainer addition" 2 | description: Submit a proposal/request to implement a new trainer for a post-training method 3 | labels: [ "New trainer" ] 4 | 5 | body: 6 | - type: textarea 7 | id: description-request 8 | validations: 9 | required: true 10 | attributes: 11 | label: Method description 12 | description: | 13 | Put any and all important information relative to the method 14 | 15 | - type: checkboxes 16 | id: information-tasks 17 | attributes: 18 | label: Open source status 19 | description: | 20 | Please note that if the method implementation isn't available or model weights with training datasets aren't available, we are less likely to implement it in `trl`. 21 | options: 22 | - label: "The method implementation is available" 23 | - label: "The model weights are available" 24 | - label: "The training datasets are available" 25 | 26 | - type: textarea 27 | id: additional-info 28 | attributes: 29 | label: Provide useful links for the implementation 30 | description: | 31 | Please provide information regarding the implementation, the weights, and the authors. 32 | Please mention the authors by @gh-username if you're aware of their usernames.
33 | -------------------------------------------------------------------------------- /training/open-r1/recipes/Mistral-Small-24B-Instruct-2501/sft/config_openr1_math.yaml: -------------------------------------------------------------------------------- 1 | # To start the training, run the following command: 2 | # sbatch -N 4 --job-name=mistral_sft slurm/train.slurm Mistral-Small-24B-Instruct-2501 sft numina zero3 3 | 4 | model_name_or_path: mistralai/Mistral-Small-24B-Instruct-2501 5 | model_revision: main 6 | torch_dtype: bfloat16 7 | attn_implementation: flash_attention_2 8 | 9 | # Data training arguments 10 | # dataset_name: yentinglin/s1K-1.1-trl-format 11 | dataset_name: yentinglin/OpenR1-Math-220k-trl-format 12 | preprocessing_num_workers: 8 13 | 14 | # SFT trainer config 15 | bf16: true 16 | do_eval: true 17 | eval_strategy: 'no' 18 | gradient_accumulation_steps: 4 19 | gradient_checkpointing: true 20 | gradient_checkpointing_kwargs: 21 | use_reentrant: false 22 | hub_model_id: Mistral-Small-24B-Instruct-2501-Open-R1-Distill 23 | hub_strategy: every_save 24 | learning_rate: 2.0e-05 25 | log_level: info 26 | logging_steps: 1 27 | logging_strategy: steps 28 | lr_scheduler_type: cosine 29 | packing: true 30 | max_length: 32768 31 | max_steps: -1 32 | num_train_epochs: 5 33 | output_dir: data/Mistral-Small-24B-Instruct-2501-Open-R1-Distill 34 | overwrite_output_dir: true 35 | per_device_eval_batch_size: 1 36 | per_device_train_batch_size: 1 37 | push_to_hub: true 38 | report_to: 39 | - wandb 40 | save_strategy: epoch 41 | seed: 42 42 | warmup_ratio: 0.1 43 | -------------------------------------------------------------------------------- /training/open-r1/slurm/serve_router.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=r1-router 3 | #SBATCH --partition=hopper-cpu 4 | #SBATCH --qos=high 5 | #SBATCH --nodes=1 6 | #SBATCH --cpus-per-task=8 7 | #SBATCH --mem-per-cpu=1875m 8 | #SBATCH --output=./logs/%x_%j_%n.out 9 | #SBATCH --error=./logs/%x_%j_%n.err 10 | #SBATCH --time=30-00:00:00 11 | #SBATCH --requeue 12 | 13 | set -exuo pipefail 14 | 15 | # TODO: Adjust these variables to your cluster configuration 16 | CONDA_ENV="sglang124" 17 | ROUTER_PORT=39876 18 | 19 | trap 'scontrol requeue ${SLURM_JOB_ID}; exit 15' SIGUSR1 20 | 21 | while getopts "e:h" opt; do 22 | case $opt in 23 | e) CONDA_ENV="$OPTARG" ;; 24 | h|?) echo "Usage: sbatch $0 [-e CONDA_ENV]"; exit 1 ;; 25 | esac 26 | done 27 | 28 | # TODO: Environment setup, adjust to your cluster configuration 29 | source ~/.bashrc 30 | source "$CONDA_PREFIX/etc/profile.d/conda.sh" 31 | conda activate "$CONDA_ENV" || { echo "Failed to activate conda env $CONDA_ENV"; exit 1; } 32 | 33 | python -m sglang_router.launch_router \ 34 | --port "$ROUTER_PORT" \ 35 | --host 0.0.0.0 \ 36 | --worker-startup-timeout-secs 300 37 | 38 | # Keep the job running with health checks 39 | while true; do 40 | if !
curl -s -o /dev/null "http://localhost:$ROUTER_PORT/health"; then 41 | echo "Error: Router health check failed" 42 | exit 1 43 | fi 44 | sleep 300 45 | done -------------------------------------------------------------------------------- /training/open-r1/recipes/OpenR1-Qwen-7B/sft/config.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | # You need to download the model and manually change the rope to 300k and max_position_embeddings to 32768 3 | # the config file should match https://huggingface.co/open-r1/OpenR1-Qwen-7B/blob/main/config.json 4 | model_name_or_path: Qwen/Qwen2.5-Math-7B-Instruct 5 | model_revision: main 6 | torch_dtype: bfloat16 7 | attn_implementation: sdpa 8 | 9 | # Data training arguments 10 | dataset_name: open-r1/OpenR1-Math-220k 11 | dataset_num_proc: 48 12 | 13 | #SFT hyperparam 14 | max_length: 32768 15 | weight_decay: 0.0001 16 | optim: adamw_torch 17 | lr_scheduler_type: linear 18 | warmup_ratio: 0.1 19 | learning_rate: 5.0e-05 20 | gradient_accumulation_steps: 2 21 | per_device_eval_batch_size: 1 22 | per_device_train_batch_size: 1 23 | 24 | # SFT trainer config 25 | max_steps: -1 26 | num_train_epochs: 3 27 | bf16: true 28 | do_eval: false 29 | use_liger_kernel: true 30 | eval_strategy: 'no' 31 | gradient_checkpointing: true 32 | gradient_checkpointing_kwargs: 33 | use_reentrant: false 34 | hub_model_id: OpenR1-Qwen-7B-SFT 35 | hub_strategy: every_save 36 | log_level: info 37 | logging_steps: 5 38 | logging_strategy: steps 39 | packing: true 40 | output_dir: data/OpenR1-Qwen-7B-SFT 41 | overwrite_output_dir: true 42 | push_to_hub: true 43 | report_to: 44 | - wandb 45 | save_strategy: "steps" 46 | save_steps: 500 47 | save_total_limit: 1 48 | seed: 42 -------------------------------------------------------------------------------- /modes/speculative_decoding.py: -------------------------------------------------------------------------------- 1 | def run_speculative_decoding_flow( 2 | question: str, 3 | big_model: str, 4 | big_model_port: int, 5 | generate_text_vllm, 6 | max_tokens: int, 7 | temperature: float, 8 | test_logging: bool = False, 9 | ): 10 | # NOTE: calls V0 server, since V1 does not have specdec 11 | resp_json, latency, metric = generate_text_vllm( 12 | question, 13 | port=big_model_port, 14 | temperature=temperature, 15 | max_tokens=max_tokens, 16 | model=big_model, 17 | speculative_decoding=True # NOTE: custom parameter in generate_text_vllm 18 | ) 19 | 20 | usage = resp_json.get("usage", {}) 21 | final_reply = resp_json["choices"][0]["text"] 22 | 23 | usage_data = [{ 24 | "Model": big_model, 25 | "ThinkIter": "spec_decoding", 26 | "DraftVersion": 0, 27 | "PromptTokens": usage.get("prompt_tokens", 0), 28 | "CompletionTokens": usage.get("completion_tokens", 0), 29 | "AcceptedTokens": metric["accepted_tokens"], # NOTE: these values are wrong, as they are fetched directly from /metrics.
30 | "DraftTokens": metric["draft_tokens"], 31 | "EmittedTokens": metric["emitted_tokens"], 32 | "AcceptanceRate": metric["acceptance_rate"], 33 | "Efficiency": metric["efficiency"], 34 | "Latency": latency, 35 | }] 36 | 37 | return final_reply, usage_data -------------------------------------------------------------------------------- /training/open-r1/recipes/SmolLM2-1.7B/sft/config.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | # You can download the model and manually change the rope to 300k/500k and max_position_embeddings to 32768 3 | model_name_or_path: HuggingFaceTB/SmolLM2-1.7B 4 | model_revision: main 5 | torch_dtype: bfloat16 6 | attn_implementation: sdpa 7 | 8 | # Data training arguments 9 | dataset_name: open-r1/OpenR1-Math-220k 10 | dataset_num_proc: 48 11 | 12 | #SFT hyperparam 13 | max_length: 8192 # You can set this to 32768 if you change the rope, but you need to change the config.json file 14 | weight_decay: 0.0001 15 | optim: adamw_torch 16 | lr_scheduler_type: linear 17 | warmup_ratio: 0.1 18 | learning_rate: 5.0e-05 19 | gradient_accumulation_steps: 2 20 | per_device_eval_batch_size: 4 21 | per_device_train_batch_size: 4 # Change this depending on the context length of the model to keep a 500M GBS. 22 | 23 | # SFT trainer config 24 | max_steps: -1 25 | num_train_epochs: 3 26 | bf16: true 27 | do_eval: false 28 | eval_strategy: 'no' 29 | gradient_checkpointing: true 30 | gradient_checkpointing_kwargs: 31 | use_reentrant: false 32 | hub_model_id: OpenR1-Qwen-7B-SFT 33 | hub_strategy: every_save 34 | log_level: info 35 | logging_steps: 5 36 | logging_strategy: steps 37 | packing: true 38 | output_dir: data/OpenR1-Qwen-7B-SFT 39 | overwrite_output_dir: true 40 | push_to_hub: true 41 | report_to: 42 | - wandb 43 | save_strategy: "steps" 44 | save_steps: 500 45 | save_total_limit: 1 46 | seed: 42 47 | -------------------------------------------------------------------------------- /training/open-r1/recipes/SmolLM2-1.7B-Instruct/sft/config.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | # You can download the model and manually change the rope to 300k/500k and max_position_embeddings to 32768 3 | model_name_or_path: HuggingFaceTB/SmolLM2-1.7B-Instruct 4 | model_revision: main 5 | torch_dtype: bfloat16 6 | attn_implementation: sdpa 7 | 8 | # Data training arguments 9 | dataset_name: open-r1/OpenR1-Math-220k 10 | dataset_num_proc: 48 11 | 12 | #SFT hyperparam 13 | max_length: 8192 # You can set this to 32768 if you change the rope, but you need to change the config.json file 14 | weight_decay: 0.0001 15 | optim: adamw_torch 16 | lr_scheduler_type: linear 17 | warmup_ratio: 0.1 18 | learning_rate: 5.0e-05 19 | gradient_accumulation_steps: 2 20 | per_device_eval_batch_size: 4 21 | per_device_train_batch_size: 4 # Change this depending on the context length of the model to keep a 500M GBS. 
22 | 23 | # SFT trainer config 24 | max_steps: -1 25 | num_train_epochs: 3 26 | bf16: true 27 | do_eval: false 28 | eval_strategy: 'no' 29 | gradient_checkpointing: true 30 | gradient_checkpointing_kwargs: 31 | use_reentrant: false 32 | hub_model_id: OpenR1-Qwen-7B-SFT 33 | hub_strategy: every_save 34 | log_level: info 35 | logging_steps: 5 36 | logging_strategy: steps 37 | packing: true 38 | output_dir: data/OpenR1-Qwen-7B-SFT 39 | overwrite_output_dir: true 40 | push_to_hub: true 41 | report_to: 42 | - wandb 43 | save_strategy: "steps" 44 | save_steps: 500 45 | save_total_limit: 1 46 | seed: 42 47 | -------------------------------------------------------------------------------- /training/open-r1/recipes/OlympicCoder-32B/sft/config_v00.00.yaml: -------------------------------------------------------------------------------- 1 | # Config for 16 nodes of 8 H100s with FSDP1 2 | # Model arguments 3 | model_name_or_path: Qwen/Qwen2.5-Coder-32B-Instruct 4 | model_revision: main 5 | torch_dtype: bfloat16 6 | attn_implementation: flash_attention_2 7 | 8 | # Data training arguments 9 | dataset_name: open-r1/codeforces-cots 10 | dataset_config: solutions_decontaminated 11 | dataset_num_proc: 12 12 | 13 | # SFT trainer config 14 | bf16: true 15 | do_eval: false 16 | eval_strategy: 'no' 17 | gradient_accumulation_steps: 1 18 | gradient_checkpointing: true 19 | gradient_checkpointing_kwargs: 20 | use_reentrant: false 21 | hub_always_push: true 22 | hub_model_id: OlympicCoder-32B 23 | hub_strategy: every_save 24 | learning_rate: 4.0e-05 25 | log_level: info 26 | logging_steps: 1 27 | logging_strategy: steps 28 | lr_scheduler_type: cosine_with_min_lr 29 | lr_scheduler_kwargs: 30 | min_lr_rate: 0.1 31 | packing: false 32 | max_grad_norm: 0.2 33 | max_length: 22528 # we were unable to train at 32k due to OOM. See https://github.com/huggingface/transformers/issues/35983 for context parallelism support. 34 | max_steps: -1 35 | num_train_epochs: 10 36 | optim: paged_adamw_8bit 37 | output_dir: data/OlympicCoder-32B 38 | overwrite_output_dir: true 39 | per_device_eval_batch_size: 1 40 | per_device_train_batch_size: 1 41 | push_to_hub: true 42 | report_to: 43 | - wandb 44 | save_only_model: true # needed to bypass FSDP errors with saving paged optimizers 45 | save_strategy: epoch 46 | save_total_limit: 1 47 | seed: 42 48 | use_liger: false # fails on multi-node 49 | warmup_ratio: 0.03 -------------------------------------------------------------------------------- /training/trl/.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | # What does this PR do? 2 | 3 | 12 | 13 | 14 | 15 | Fixes # (issue) 16 | 17 | 18 | ## Before submitting 19 | - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). 20 | - [ ] Did you read the [contributor guideline](https://github.com/huggingface/trl/blob/main/CONTRIBUTING.md#create-a-pull-request), 21 | Pull Request section? 22 | - [ ] Was this discussed/approved via a GitHub issue? Please add a link 23 | to it if that's the case. 24 | - [ ] Did you make sure to update the documentation with your changes? Here are the 25 | [documentation guidelines](https://github.com/huggingface/trl/tree/main/docs). 26 | - [ ] Did you write any new necessary tests? 27 | 28 | 29 | ## Who can review? 30 | 31 | Anyone in the community is free to review the PR once the tests have passed. Feel free to tag 32 | members/contributors who may be interested in your PR. 
-------------------------------------------------------------------------------- /lm_eval_files/aime/aime_2024_agg8.yaml: -------------------------------------------------------------------------------- 1 | tag: 2 | - math_word_problems 3 | task: aime_2024_agg8 4 | dataset_path: Maxwell-Jia/AIME_2024 5 | dataset_name: default 6 | process_docs: !function utils.process_docs_aime_2024 7 | output_type: generate_until 8 | test_split: train 9 | doc_to_text: !function utils.doc_to_text_aime_2024 10 | doc_to_target: answer 11 | process_results: !function utils.process_results 12 | generation_kwargs: 13 | until: [] 14 | do_sample: false 15 | temperature: 0 16 | max_gen_toks: 4096 # https://github.com/openai/simple-evals/blob/c0dba4c7bfbc17f786aec7bd7c3585a36ad81f23/sampler/chat_completion_sampler.py#L27 17 | repeats: 8 18 | filter_list: 19 | - name: "all" # Will do coverage, majority, and take_first_k 20 | filter: 21 | - function: "take_first_k" 22 | k: 8 23 | metric_list: 24 | - metric: exact_match 25 | aggregation: mean 26 | higher_is_better: true 27 | - metric: cov@8 28 | aggregation: mean 29 | higher_is_better: true 30 | - metric: cov@4 31 | aggregation: mean 32 | higher_is_better: true 33 | - metric: cov@2 34 | aggregation: mean 35 | higher_is_better: true 36 | - metric: maj@8 37 | aggregation: mean 38 | higher_is_better: true 39 | - metric: maj@4 40 | aggregation: mean 41 | higher_is_better: true 42 | - metric: maj@2 43 | aggregation: mean 44 | higher_is_better: true 45 | - metric: extracted_answers 46 | aggregation: bypass 47 | higher_is_better: true 48 | - metric: exact_matches 49 | aggregation: bypass 50 | higher_is_better: true 51 | metadata: 52 | version: 1.0 -------------------------------------------------------------------------------- /training/open-r1/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: Qwen/Qwen2.5-1.5B-Instruct 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: open-r1/OpenR1-Math-220k 9 | dataset_prompt_column: problem 10 | system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>" 11 | 12 | # GRPO trainer config 13 | bf16: true 14 | use_vllm: true 15 | do_eval: false 16 | gradient_accumulation_steps: 4 17 | gradient_checkpointing: true 18 | gradient_checkpointing_kwargs: 19 | use_reentrant: false 20 | hub_model_id: Qwen2.5-1.5B-Open-R1-GRPO 21 | hub_strategy: every_save 22 | learning_rate: 2.0e-05 23 | log_completions: true 24 | log_level: info 25 | logging_first_step: true 26 | logging_steps: 1 27 | logging_strategy: steps 28 | lr_scheduler_type: cosine 29 | max_prompt_length: 512 30 | max_completion_length: 1024 31 | max_steps: -1 32 | num_generations: 16 33 | num_train_epochs: 1 34 | output_dir: data/Qwen2.5-1.5B-Open-R1-GRPO 35 | overwrite_output_dir: true 36 | per_device_eval_batch_size: 16 37 | per_device_train_batch_size: 16 38 | push_to_hub: true 39 | report_to: 40 | - wandb 41 | reward_funcs: 42 | - accuracy 43 | - format 44 | - tag_count 45 | reward_weights: 46 | - 1.0 47 | - 1.0 48 | - 1.0 49 | save_strategy: "epoch" 50 | save_total_limit: 1 51 | seed: 42 52 | warmup_ratio: 0.1 53 | -------------------------------------------------------------------------------- /training/open-r1/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: Qwen/Qwen2.5-Math-7B 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: DigitalLearningGmbH/MATH-lighteval 9 | dataset_config: default 10 | dataset_prompt_column: problem 11 | system_prompt: "You are a helpful AI Assistant, designed to provide well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within <think> and </think> tags." 12 | 13 | # GRPO trainer config 14 | bf16: true 15 | use_vllm: true 16 | do_eval: true 17 | eval_strategy: steps 18 | eval_steps: 100 19 | gradient_accumulation_steps: 8 20 | gradient_checkpointing: true 21 | gradient_checkpointing_kwargs: 22 | use_reentrant: false 23 | hub_model_id: Qwen-2.5-7B-Simple-RL 24 | hub_strategy: every_save 25 | learning_rate: 3.0e-06 26 | log_completions: true 27 | log_level: info 28 | logging_first_step: true 29 | logging_steps: 5 30 | logging_strategy: steps 31 | lr_scheduler_type: cosine 32 | max_prompt_length: 512 33 | max_completion_length: 1024 34 | max_steps: -1 35 | num_generations: 7 36 | num_train_epochs: 1 37 | output_dir: data/Qwen-2.5-7B-Simple-RL 38 | overwrite_output_dir: true 39 | per_device_eval_batch_size: 16 40 | per_device_train_batch_size: 16 41 | push_to_hub: true 42 | report_to: 43 | - wandb 44 | reward_funcs: 45 | - accuracy 46 | - format 47 | reward_weights: 48 | - 1.0 49 | - 1.0 50 | save_strategy: "no" 51 | seed: 42 52 | warmup_ratio: 0.1 53 | -------------------------------------------------------------------------------- /training/trl/docs/source/iterative_sft_trainer.md: -------------------------------------------------------------------------------- 1 | # Iterative Trainer 2 | 3 | [![](https://img.shields.io/badge/All_models-Iterative_SFT-blue)](https://huggingface.co/models?other=iterative-sft,trl) 4 | 5 | 6 | Iterative fine-tuning is a training method that enables performing custom actions (generation and filtering, for example) between optimization steps.
In TRL we provide an easy-to-use API to fine-tune your models in an iterative way in just a few lines of code. 7 | 8 | ## Usage 9 | 10 | To get started quickly, instantiate a model and a tokenizer. 11 | 12 | ```python 13 | from transformers import AutoModelForCausalLM, AutoTokenizer 14 | from trl import IterativeSFTTrainer 15 | 16 | model = AutoModelForCausalLM.from_pretrained(model_name) 17 | tokenizer = AutoTokenizer.from_pretrained(model_name) 18 | if tokenizer.pad_token is None: 19 | tokenizer.pad_token = tokenizer.eos_token 20 | 21 | trainer = IterativeSFTTrainer( 22 | model, 23 | tokenizer 24 | ) 25 | 26 | ``` 27 | 28 | You have the choice to either provide a list of strings or a list of tensors to the `step` function. 29 | 30 | #### Using a list of tensors as input: 31 | 32 | ```python 33 | 34 | inputs = { 35 | "input_ids": input_ids, 36 | "attention_mask": attention_mask 37 | } 38 | 39 | trainer.step(**inputs) 40 | 41 | ``` 42 | 43 | #### Using a list of strings as input: 44 | 45 | ```python 46 | 47 | inputs = { 48 | "texts": texts 49 | } 50 | 51 | trainer.step(**inputs) 52 | 53 | ``` 54 | 55 | For causal language models, labels will automatically be created from `input_ids` or from `texts`. When using sequence-to-sequence models you will have to provide your own `labels` or `text_labels`. 56 | 57 | ## IterativeTrainer 58 | 59 | [[autodoc]] IterativeSFTTrainer 60 | -------------------------------------------------------------------------------- /training/open-r1/src/open_r1/utils/model_utils.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer, PreTrainedTokenizer 2 | 3 | from trl import ModelConfig 4 | 5 | from ..configs import GRPOConfig, SFTConfig 6 | 7 | 8 | DEFAULT_CHAT_TEMPLATE = "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}" 9 | 10 | 11 | def get_tokenizer( 12 | model_args: ModelConfig, training_args: SFTConfig | GRPOConfig, auto_set_chat_template: bool = True 13 | ) -> PreTrainedTokenizer: 14 | """Get the tokenizer for the model.""" 15 | tokenizer = AutoTokenizer.from_pretrained( 16 | model_args.model_name_or_path, 17 | revision=model_args.model_revision, 18 | trust_remote_code=model_args.trust_remote_code, 19 | ) 20 | # Disabled in the interest of time.
21 | # special_tokens_dict = {"additional_special_tokens": ["", ""]} 22 | # num_added = tokenizer.add_special_tokens(special_tokens_dict) 23 | # if num_added > 0: 24 | # print(f"Added {num_added} special tokens for orchestration.") 25 | 26 | if training_args.chat_template is not None: 27 | tokenizer.chat_template = training_args.chat_template 28 | elif auto_set_chat_template and tokenizer.get_chat_template() is None: 29 | tokenizer.chat_template = DEFAULT_CHAT_TEMPLATE 30 | 31 | return tokenizer 32 | -------------------------------------------------------------------------------- /training/open-r1/recipes/Qwen2.5-7B-Instruct/grpo/config_demo.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: Qwen/Qwen2.5-7B-Instruct 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: open-r1/OpenR1-Math-cn_k12-86k 9 | dataset_prompt_column: problem 10 | system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>" 11 | 12 | # GRPO trainer config 13 | beta: 0.001 14 | bf16: true 15 | do_eval: false 16 | eval_strategy: "no" 17 | use_vllm: true 18 | gradient_accumulation_steps: 16 19 | gradient_checkpointing: true 20 | gradient_checkpointing_kwargs: 21 | use_reentrant: false 22 | hub_model_id: Qwen2.5-7B-Instruct-GRPO 23 | hub_strategy: every_save 24 | learning_rate: 1.0e-06 25 | log_completions: true 26 | log_level: info 27 | logging_first_step: true 28 | logging_steps: 1 29 | logging_strategy: steps 30 | lr_scheduler_type: constant_with_warmup 31 | max_grad_norm: 0.2 32 | max_prompt_length: 1024 33 | max_completion_length: 4096 34 | max_steps: -1 35 | num_generations: 16 36 | num_train_epochs: 1 37 | output_dir: data/Qwen2.5-7B-Instruct-GRPO 38 | overwrite_output_dir: true 39 | per_device_train_batch_size: 4 40 | push_to_hub: true 41 | report_to: 42 | - wandb 43 | reward_funcs: 44 | - accuracy 45 | - format 46 | reward_weights: 47 | - 1.0 48 | - 0.2 49 | save_strategy: "steps" 50 | save_steps: 0.1 51 | save_total_limit: 1 52 | seed: 42 53 | temperature: 0.7 54 | warmup_ratio: 0.1 -------------------------------------------------------------------------------- /training/trl/.github/workflows/tests_latest.yml: -------------------------------------------------------------------------------- 1 | name: Tests latest TRL release with dev dependencies 2 | 3 | on: 4 | schedule: 5 | - cron: '0 0 * * *' # Runs daily at midnight UTC 6 | 7 | workflow_dispatch: 8 | 9 | env: 10 | TQDM_DISABLE: 1 11 | CI_SLACK_CHANNEL: ${{ secrets.CI_PUSH_MAIN_CHANNEL }} 12 | 13 | jobs: 14 | tests: 15 | name: Tests latest TRL release with dev dependencies 16 | runs-on: 'ubuntu-latest' 17 | steps: 18 | - name: Git checkout 19 | uses: actions/checkout@v4 20 | with: { ref: v0.16-release } 21 | - name: Set up Python 3.12 22 | uses: actions/setup-python@v5 23 | with: 24 | python-version: '3.12' 25 | cache: "pip" 26 | cache-dependency-path: | 27 | setup.py 28 | requirements.txt 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | python -m pip install -U git+https://github.com/huggingface/accelerate.git 33 | python -m pip install -U git+https://github.com/huggingface/datasets.git 34 | python -m pip install -U
git+https://github.com/huggingface/transformers.git 35 | python -m pip install ".[dev]" 36 | - name: Test with pytest 37 | run: | 38 | make test 39 | - name: Post to Slack 40 | uses: huggingface/hf-workflows/.github/actions/post-slack@main 41 | with: 42 | slack_channel: ${{ env.CI_SLACK_CHANNEL }} 43 | title: Results of latest TRL with Python 3.12 on ubuntu-latest with dev dependencies 44 | status: ${{ job.status }} 45 | slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} 46 | -------------------------------------------------------------------------------- /training/trl/docs/source/deepspeed_integration.md: -------------------------------------------------------------------------------- 1 | # DeepSpeed Integration 2 | 3 | <Tip> 4 | 5 | Section under construction. Feel free to contribute! 6 | 7 | </Tip> 8 | 9 | TRL supports training with DeepSpeed, a library that implements advanced training optimization techniques. These include optimizer state partitioning, offloading, gradient partitioning, and more. 10 | 11 | DeepSpeed integrates the [Zero Redundancy Optimizer (ZeRO)](https://huggingface.co/papers/1910.02054), which allows scaling the model size in proportion to the number of devices with sustained high efficiency. 12 | 13 | ![ZeRO Stages](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/zero_stages.png) 14 | 15 | ## Installation 16 | 17 | To use DeepSpeed with TRL, install it using the following command: 18 | 19 | ```bash 20 | pip install deepspeed 21 | ``` 22 | 23 | ## Running Training Scripts with DeepSpeed 24 | 25 | No modifications to your training script are required. Simply run it with the DeepSpeed configuration file: 26 | 27 | ```bash 28 | accelerate launch --config_file <config_file> train.py 29 | ``` 30 | 31 | We provide ready-to-use DeepSpeed configuration files in the [`examples/accelerate_configs`](https://github.com/huggingface/trl/tree/main/examples/accelerate_configs) directory. For example, to run training with ZeRO Stage 2, use the following command: 32 | 33 | ```bash 34 | accelerate launch --config_file examples/accelerate_configs/deepspeed_zero2.yaml train.py 35 | ``` 36 | 37 | ## Additional Resources 38 | 39 | Consult the 🤗 Accelerate [documentation](https://huggingface.co/docs/accelerate/usage_guides/deepspeed) for more information about the DeepSpeed plugin. 40 | -------------------------------------------------------------------------------- /training/open-r1/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: Qwen/Qwen2.5-1.5B-Instruct 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: open-r1/verifiable-coding-problems-python 9 | dataset_prompt_column: problem_statement 10 | system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer.
Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>" 11 | 12 | # GRPO trainer config 13 | beta: 0.01 14 | bf16: true 15 | use_vllm: true 16 | do_eval: false 17 | gradient_accumulation_steps: 4 18 | gradient_checkpointing: true 19 | gradient_checkpointing_kwargs: 20 | use_reentrant: false 21 | hub_model_id: Qwen2.5-1.5B-Open-R1-Code-GRPO 22 | hub_strategy: every_save 23 | learning_rate: 5.0e-06 24 | log_completions: true 25 | log_level: info 26 | logging_first_step: true 27 | logging_steps: 1 28 | logging_strategy: steps 29 | lr_scheduler_type: cosine_with_min_lr 30 | lr_scheduler_kwargs: 31 | min_lr_rate: 0.1 32 | max_prompt_length: 1024 33 | max_completion_length: 2048 34 | max_steps: 500 35 | num_generations: 14 36 | num_train_epochs: 1 37 | output_dir: data/Qwen2.5-1.5B-Open-R1-Code-GRPO 38 | overwrite_output_dir: true 39 | per_device_train_batch_size: 16 40 | push_to_hub: true 41 | report_to: 42 | - wandb 43 | reward_funcs: 44 | - code 45 | - format 46 | reward_weights: 47 | - 1.0 48 | - 0.1 49 | save_strategy: "steps" 50 | save_steps: 50 51 | save_total_limit: 1 52 | seed: 42 53 | temperature: 1.0 54 | warmup_ratio: 0.03 -------------------------------------------------------------------------------- /annotated_dataset/hf_Dataset/proprocess.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset, DatasetDict, Dataset, load_from_disk 2 | from huggingface_hub import login 3 | import os 4 | 5 | our_dataset = load_from_disk("OpenR1_Math_SpeculativeReasoning") 6 | 7 | print("Length pre filtering:", len(our_dataset)) 8 | # First, filter out the examples you want to exclude 9 | filtered_dataset = our_dataset.filter( 10 | lambda example: ( 11 | example["annotated_generations"] and 12 | "Error processing Due To" not in example["annotated_generations"][0] and 13 | len(example["annotated_generations"][0]) > 1024 14 | ) 15 | ) 16 | 17 | min_len = min( 18 | len(example["annotated_generations"][0]) 19 | for example in filtered_dataset 20 | if example["annotated_generations"] 21 | ) 22 | shortest_examples = our_dataset.filter( 23 | lambda example: ( 24 | example["annotated_generations"] and 25 | len(example["annotated_generations"][0]) == min_len 26 | ) 27 | ) 28 | 29 | print("Example of shortest example:", shortest_examples['generations'][0][0]) 30 | print("Example of shortest example:", shortest_examples['annotated_generations'][0][0]) 31 | 32 | print("Length post filtering:", len(filtered_dataset)) 33 | def replace_with_annotated(example): 34 | if example["annotated_generations"]: 35 | example["messages"][1]["content"] = example["annotated_generations"][0].replace("<\\bigmodel>", "\n \n") 36 | # Apply the second replacement to the already-updated content so the first replacement is not discarded. 37 | example["messages"][1]["content"] = example["messages"][1]["content"].replace("", "\n \n") 38 | 39 | return example 40 | 41 | updated_dataset = filtered_dataset.map(replace_with_annotated) 42 | 43 | updated_dataset.push_to_hub("akhauriyash/OpenR1_Math_SpeculativeReasoning") -------------------------------------------------------------------------------- /training/trl/commands/run_sft.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This script runs an SFT example end-to-end on a tiny model using different possible configurations 3 | # but defaults to QLoRA + PEFT 4 | OUTPUT_DIR="test_sft/" 5 | MODEL_NAME="trl-internal-testing/tiny-Qwen2ForCausalLM-2.5" 6 | DATASET_NAME="stanfordnlp/imdb" 7 | MAX_STEPS=5 8 | BATCH_SIZE=2 9 | SEQ_LEN=128 10 | 11 | 12 | # Handle extra arguments in case
one passes accelerate configs. 13 | EXTRA_ACCELERATE_ARGS="" 14 | EXTRA_TRAINING_ARGS="""--use_peft \ 15 | --load_in_4bit 16 | """ 17 | 18 | # Set your number of GPUs here 19 | NUM_GPUS=2 20 | 21 | if [[ "${TRL_ACCELERATE_CONFIG}" == "" ]]; then 22 | EXTRA_ACCELERATE_ARGS="" 23 | else 24 | EXTRA_ACCELERATE_ARGS="--config_file $TRL_ACCELERATE_CONFIG" 25 | # For DeepSpeed configs we need to set the `--fp16` flag to comply with our configs exposed 26 | # on `examples/accelerate_configs`, since our runners do not support bf16 mixed precision training. 27 | if [[ $TRL_ACCELERATE_CONFIG == *"deepspeed"* ]]; then 28 | EXTRA_TRAINING_ARGS="--fp16" 29 | else 30 | echo "Keeping QLoRA + PEFT" 31 | fi 32 | fi 33 | 34 | 35 | CMD=""" 36 | accelerate launch $EXTRA_ACCELERATE_ARGS \ 37 | --num_processes $NUM_GPUS \ 38 | --mixed_precision 'fp16' \ 39 | `pwd`/trl/scripts/sft.py \ 40 | --model_name $MODEL_NAME \ 41 | --dataset_name $DATASET_NAME \ 42 | --output_dir $OUTPUT_DIR \ 43 | --max_steps $MAX_STEPS \ 44 | --per_device_train_batch_size $BATCH_SIZE \ 45 | --max_length $SEQ_LEN \ 46 | $EXTRA_TRAINING_ARGS 47 | """ 48 | 49 | echo "Starting program..." 50 | 51 | { # try 52 | echo $CMD 53 | eval "$CMD" 54 | } || { # catch 55 | # save log for exception 56 | echo "Operation Failed!" 57 | exit 1 58 | } 59 | exit 0 60 | -------------------------------------------------------------------------------- /training/trl/commands/run_dpo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This script runs a DPO
example end-to-end on a tiny model using different possible configurations 3 | # but defaults to QLoRA + PEFT 4 | OUTPUT_DIR="test_dpo/" 5 | MODEL_NAME="trl-internal-testing/tiny-Qwen2ForCausalLM-2.5" 6 | DATASET_NAME="trl-internal-testing/hh-rlhf-helpful-base-trl-style" 7 | MAX_STEPS=5 8 | BATCH_SIZE=2 9 | SEQ_LEN=128 10 | 11 | # Handle extra arguments in case one passes accelerate configs. 12 | EXTRA_ACCELERATE_ARGS="" 13 | EXTRA_TRAINING_ARGS="""--use_peft \ 14 | --load_in_4bit 15 | """ 16 | 17 | # Set your number of GPUs here 18 | NUM_GPUS=2 19 | 20 | if [[ "${TRL_ACCELERATE_CONFIG}" == "" ]]; then 21 | EXTRA_ACCELERATE_ARGS="" 22 | else 23 | EXTRA_ACCELERATE_ARGS="--config_file $TRL_ACCELERATE_CONFIG" 24 | # For DeepSpeed configs we need to set the `--fp16` flag to comply with our configs exposed 25 | # on `examples/accelerate_configs`, since our runners do not support bf16 mixed precision training. 26 | if [[ $TRL_ACCELERATE_CONFIG == *"deepspeed"* ]]; then 27 | EXTRA_TRAINING_ARGS="--fp16" 28 | else 29 | echo "Keeping QLoRA + PEFT" 30 | fi 31 | fi 32 | 33 | 34 | CMD=""" 35 | accelerate launch $EXTRA_ACCELERATE_ARGS \ 36 | --num_processes $NUM_GPUS \ 37 | --mixed_precision 'fp16' \ 38 | `pwd`/trl/scripts/dpo.py \ 39 | --model_name_or_path $MODEL_NAME \ 40 | --dataset_name $DATASET_NAME \ 41 | --output_dir $OUTPUT_DIR \ 42 | --max_steps $MAX_STEPS \ 43 | --per_device_train_batch_size $BATCH_SIZE \ 44 | --max_length $SEQ_LEN \ 45 | $EXTRA_TRAINING_ARGS 46 | """ 47 | 48 | echo "Starting program..." 49 | 50 | { # try 51 | echo $CMD 52 | eval "$CMD" 53 | } || { # catch 54 | # save log for exception 55 | echo "Operation Failed!" 56 | exit 1 57 | } 58 | exit 0 59 | -------------------------------------------------------------------------------- /training/trl/trl/trainer/xpo_config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from dataclasses import dataclass, field 16 | 17 | from trl.trainer.online_dpo_config import OnlineDPOConfig 18 | 19 | 20 | @dataclass 21 | class XPOConfig(OnlineDPOConfig): 22 | r""" 23 | Configuration class for the [`XPOTrainer`]. 24 | 25 | Subclass of [`OnlineDPOConfig`]; we can use all its arguments and add the following: 26 | 27 | Parameters: 28 | alpha (`float` or `list[float]`, *optional*, defaults to `1e-5`): 29 | Weight of the XPO loss term. If a list of floats is provided then the alpha is selected for each new epoch 30 | and the last alpha is used for the rest of the epochs. 31 | """ 32 | 33 | alpha: list[float] = field( 34 | default_factory=lambda: [1e-5], 35 | metadata={ 36 | "help": "Weight of the XPO loss term. If a list of floats is provided then the alpha is selected for each " 37 | "new epoch and the last alpha is used for the rest of the epochs."
38 | }, 39 | ) 40 | 41 | def __post_init__(self): 42 | super().__post_init__() 43 | if hasattr(self.alpha, "__len__") and len(self.alpha) == 1: 44 | self.alpha = self.alpha[0] 45 | -------------------------------------------------------------------------------- /lm_eval_files/aime/aime_2024_rebase.yaml: -------------------------------------------------------------------------------- 1 | tag: 2 | - math_word_problems 3 | task: aime_2024_rebase 4 | dataset_path: Maxwell-Jia/AIME_2024 5 | dataset_name: default 6 | process_docs: !function utils.process_docs_aime_2024 7 | output_type: generate_until 8 | test_split: train 9 | doc_to_text: !function utils.doc_to_text_aime_2024 10 | doc_to_target: answer 11 | process_results: !function utils.process_results 12 | generation_kwargs: 13 | until: [] 14 | do_sample: false 15 | temperature: 0 16 | max_gen_toks: 4096 # https://github.com/openai/simple-evals/blob/c0dba4c7bfbc17f786aec7bd7c3585a36ad81f23/sampler/chat_completion_sampler.py#L27 17 | metric_list: 18 | - metric: exact_match 19 | aggregation: mean 20 | higher_is_better: true 21 | - metric: cov@64 22 | aggregation: mean 23 | higher_is_better: true 24 | - metric: cov@32 25 | aggregation: mean 26 | higher_is_better: true 27 | - metric: cov@16 28 | aggregation: mean 29 | higher_is_better: true 30 | - metric: cov@8 31 | aggregation: mean 32 | higher_is_better: true 33 | - metric: cov@4 34 | aggregation: mean 35 | higher_is_better: true 36 | - metric: cov@2 37 | aggregation: mean 38 | higher_is_better: true 39 | - metric: maj@64 40 | aggregation: mean 41 | higher_is_better: true 42 | - metric: maj@32 43 | aggregation: mean 44 | higher_is_better: true 45 | - metric: maj@16 46 | aggregation: mean 47 | higher_is_better: true 48 | - metric: maj@8 49 | aggregation: mean 50 | higher_is_better: true 51 | - metric: maj@4 52 | aggregation: mean 53 | higher_is_better: true 54 | - metric: maj@2 55 | aggregation: mean 56 | higher_is_better: true 57 | - metric: extracted_answers 58 | aggregation: bypass 59 | higher_is_better: true 60 | - metric: exact_matches 61 | aggregation: bypass 62 | higher_is_better: true 63 | metadata: 64 | version: 1.0 -------------------------------------------------------------------------------- /training/open-r1/src/open_r1/utils/ioi/utils.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from functools import lru_cache 3 | from itertools import islice 4 | 5 | from datasets import load_dataset 6 | 7 | 8 | def add_includes(code: str, problem_id: str) -> str: 9 | """ 10 | Fix common compilation errors for IOI problems. 11 | """ 12 | if not code: 13 | return code 14 | # has most of the useful functions 15 | code_header = "#include \n" 16 | # include the problem header 17 | problem_header_include = f'#include "{problem_id}.h"' 18 | if problem_header_include not in code: 19 | code_header += problem_header_include + "\n" 20 | # use namespace std since models forget std:: often 21 | if "using namespace std;" not in code and "std::" not in code: 22 | code_header += "\nusing namespace std;\n\n" 23 | return code_header + code 24 | 25 | 26 | @lru_cache 27 | def load_ioi_tests_for_year(year: int) -> dict[str, dict[str, tuple[str, str]]]: 28 | """ 29 | Load IOI tests for a given year. 
30 | """ 31 | tests_dataset = load_dataset("open-r1/ioi-test-cases", name=f"{year}", split="train") 32 | test_cases = defaultdict(dict) 33 | for test_case in tests_dataset: 34 | test_cases[test_case["problem_id"]][test_case["test_name"]] = test_case["test_input"], test_case["test_output"] 35 | return test_cases 36 | 37 | 38 | def load_ioi_tests(year: int, problem_id: str) -> dict[str, tuple[str, str]]: 39 | """ 40 | Load IOI tests for a given year and problem id. 41 | """ 42 | return load_ioi_tests_for_year(year)[problem_id] 43 | 44 | 45 | def batched(iterable, n): 46 | "Batch data into lists of length n. The last batch may be shorter." 47 | # batched('ABCDEFG', 3) --> ABC DEF G 48 | if n < 1: 49 | return iterable 50 | it = iter(iterable) 51 | while batch := list(islice(it, n)): 52 | yield batch 53 | -------------------------------------------------------------------------------- /training/trl/tests/test_core.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import unittest 16 | 17 | import torch 18 | 19 | from trl.core import masked_mean, masked_var, masked_whiten 20 | 21 | 22 | class CoreTester(unittest.TestCase): 23 | """ 24 | A wrapper class for testing core utils functions 25 | """ 26 | 27 | def setUp(self): 28 | self.test_input = torch.Tensor([1, 2, 3, 4]) 29 | self.test_mask = torch.Tensor([0, 1, 1, 0]) 30 | self.test_input_unmasked = self.test_input[1:3] 31 | 32 | def test_masked_mean(self): 33 | self.assertEqual(torch.mean(self.test_input_unmasked), masked_mean(self.test_input, self.test_mask)) 34 | 35 | def test_masked_var(self): 36 | self.assertEqual(torch.var(self.test_input_unmasked), masked_var(self.test_input, self.test_mask)) 37 | 38 | def test_masked_whiten(self): 39 | def whiten(values: torch.Tensor) -> torch.Tensor: 40 | mean, var = torch.mean(values), torch.var(values) 41 | return (values - mean) * torch.rsqrt(var + 1e-8) 42 | 43 | whiten_unmasked = whiten(self.test_input_unmasked) 44 | whiten_masked = masked_whiten(self.test_input, self.test_mask)[1:3] 45 | diffs = (whiten_unmasked - whiten_masked).sum() 46 | self.assertLess(abs(diffs.item()), 0.00001) 47 | -------------------------------------------------------------------------------- /training/open-r1/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: style quality 2 | 3 | # make sure to test the local checkout in scripts and not the pre-installed one (don't use quotes!) 4 | export PYTHONPATH = src 5 | 6 | check_dirs := src tests 7 | 8 | 9 | # dev dependencies 10 | install: 11 | uv venv openr1 --python 3.11 && . 
openr1/bin/activate && uv pip install --upgrade pip 12 | uv pip install vllm==0.7.2 13 | uv pip install setuptools 14 | uv pip install flash-attn --no-build-isolation 15 | GIT_LFS_SKIP_SMUDGE=1 uv pip install -e ".[dev]" 16 | 17 | style: 18 | ruff format --line-length 119 --target-version py310 $(check_dirs) setup.py 19 | isort $(check_dirs) setup.py 20 | 21 | quality: 22 | ruff check --line-length 119 --target-version py310 $(check_dirs) setup.py 23 | isort --check-only $(check_dirs) setup.py 24 | flake8 --max-line-length 119 $(check_dirs) setup.py 25 | 26 | test: 27 | pytest -sv --ignore=tests/slow/ tests/ 28 | 29 | slow_test: 30 | pytest -sv -vv tests/slow/ 31 | 32 | # Evaluation 33 | 34 | evaluate: 35 | $(eval PARALLEL_ARGS := $(if $(PARALLEL),$(shell \ 36 | if [ "$(PARALLEL)" = "data" ]; then \ 37 | echo "data_parallel_size=$(NUM_GPUS)"; \ 38 | elif [ "$(PARALLEL)" = "tensor" ]; then \ 39 | echo "tensor_parallel_size=$(NUM_GPUS)"; \ 40 | fi \ 41 | ),)) 42 | $(if $(filter tensor,$(PARALLEL)),export VLLM_WORKER_MULTIPROC_METHOD=spawn &&,) \ 43 | MODEL_ARGS="pretrained=$(MODEL),dtype=bfloat16,$(PARALLEL_ARGS),max_model_length=32768,gpu_memory_utilization=0.8,generation_parameters={max_new_tokens:32768,temperature:0.6,top_p:0.95}" && \ 44 | if [ "$(TASK)" = "lcb" ]; then \ 45 | lighteval vllm $$MODEL_ARGS "extended|lcb:codegeneration|0|0" \ 46 | --use-chat-template \ 47 | --output-dir data/evals/$(MODEL); \ 48 | else \ 49 | lighteval vllm $$MODEL_ARGS "custom|$(TASK)|0|0" \ 50 | --custom-tasks src/open_r1/evaluate.py \ 51 | --use-chat-template \ 52 | --output-dir data/evals/$(MODEL); \ 53 | fi 54 | -------------------------------------------------------------------------------- /lm_eval_files/aime/aime25_nofigures_agg64.yaml: -------------------------------------------------------------------------------- 1 | tag: 2 | - math_word_problems 3 | task: aime25_nofigures_agg64 4 | dataset_path: TIGER-Lab/AIME25 5 | dataset_name: default 6 | process_docs: !function utils.process_docs 7 | output_type: generate_until 8 | test_split: train 9 | doc_to_text: !function utils.doc_to_text 10 | doc_to_target: answer 11 | process_results: !function utils.process_results 12 | generation_kwargs: 13 | until: [] 14 | do_sample: false 15 | temperature: 1 16 | max_gen_toks: 32768 17 | repeats: 64 18 | filter_list: 19 | - name: "all" # Will do coverage, majority, and take_first_k 20 | filter: 21 | - function: "take_first_k" 22 | k: 64 23 | metric_list: 24 | - metric: exact_match 25 | aggregation: mean 26 | higher_is_better: true 27 | - metric: cov@64 28 | aggregation: mean 29 | higher_is_better: true 30 | - metric: cov@32 31 | aggregation: mean 32 | higher_is_better: true 33 | - metric: cov@16 34 | aggregation: mean 35 | higher_is_better: true 36 | - metric: cov@8 37 | aggregation: mean 38 | higher_is_better: true 39 | - metric: cov@4 40 | aggregation: mean 41 | higher_is_better: true 42 | - metric: cov@2 43 | aggregation: mean 44 | higher_is_better: true 45 | - metric: maj@64 46 | aggregation: mean 47 | higher_is_better: true 48 | - metric: maj@32 49 | aggregation: mean 50 | higher_is_better: true 51 | - metric: maj@16 52 | aggregation: mean 53 | higher_is_better: true 54 | - metric: maj@8 55 | aggregation: mean 56 | higher_is_better: true 57 | - metric: maj@4 58 | aggregation: mean 59 | higher_is_better: true 60 | - metric: maj@2 61 | aggregation: mean 62 | higher_is_better: true 63 | - metric: extracted_answers 64 | aggregation: bypass 65 | higher_is_better: true 66 | - metric: exact_matches 67 | aggregation: 
bypass 68 | higher_is_better: true 69 | metadata: 70 | version: 1.0 -------------------------------------------------------------------------------- /lm_eval_files/openai_math/openai_math_maj64_cov64_train.yaml: -------------------------------------------------------------------------------- 1 | group: 2 | - math_word_problems 3 | task: openai_math_maj64_cov64_train 4 | dataset_path: simplescaling/openaimath 5 | process_docs: !function utils.process_docs 6 | output_type: generate_until 7 | test_split: train 8 | doc_to_text: !function utils.doc_to_text 9 | doc_to_target: answer 10 | process_results: !function utils.process_results 11 | generation_kwargs: 12 | until: [] 13 | do_sample: true 14 | temperature: 0.5 15 | max_gen_toks: 2048 # 2x of https://github.com/openai/simple-evals/blob/c0dba4c7bfbc17f786aec7bd7c3585a36ad81f23/sampler/chat_completion_sampler.py#L27 16 | metric_list: 17 | - metric: exact_match 18 | aggregation: mean 19 | higher_is_better: true 20 | repeats: 64 21 | filter_list: 22 | - name: "score-first" # pick only the first response, and report metrics on that 23 | filter: 24 | - function: "take_first" 25 | - name: "maj@64" 26 | filter: 27 | - function: "majority_vote" 28 | - function: "take_first" 29 | - name: "maj@16" # get Maj@16, via selecting the first 16 responses. Using a better estimator would be optimal. 30 | filter: 31 | - function: "take_first_k" 32 | k: 16 33 | - function: "majority_vote" 34 | - function: "take_first" 35 | - name: "maj@8" # get Maj@8, via selecting the first 8 responses. Using a better estimator would be optimal. 36 | filter: 37 | - function: "take_first_k" 38 | k: 8 39 | - function: "majority_vote" 40 | - function: "take_first" 41 | - name: "cov@64" # get coverage@64, via allowing all 64 samples and then picking only the correct one in the evaluator.
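# Note: every maj@k / cov@k slice here simply truncates to the first k of the
# 64 samples (take_first_k); an unbiased estimator would average over subsets.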
42 | filter: 43 | - function: "take_first_k" 44 | k: 64 45 | - name: "cov@16" 46 | filter: 47 | - function: "take_first_k" 48 | k: 16 49 | - name: "cov@8" 50 | filter: 51 | - function: "take_first_k" 52 | k: 8 53 | metadata: 54 | version: 1.0 -------------------------------------------------------------------------------- /lm_eval_files/aime/aime24_figures_agg64.yaml: -------------------------------------------------------------------------------- 1 | tag: 2 | - math_word_problems 3 | task: aime24_figures_agg64 4 | dataset_path: simplescaling/aime24_figures 5 | dataset_name: default 6 | process_docs: !function utils.process_docs 7 | output_type: generate_until 8 | test_split: train 9 | doc_to_text: !function utils.doc_to_text 10 | doc_to_target: answer 11 | process_results: !function utils.process_results 12 | generation_kwargs: 13 | until: [] 14 | do_sample: false 15 | temperature: 1 16 | max_gen_toks: 32768 17 | repeats: 64 18 | filter_list: 19 | - name: "all" # Will do coverage, majority, and take_first_k 20 | filter: 21 | - function: "take_first_k" 22 | k: 64 23 | metric_list: 24 | - metric: exact_match 25 | aggregation: mean 26 | higher_is_better: true 27 | - metric: cov@64 28 | aggregation: mean 29 | higher_is_better: true 30 | - metric: cov@32 31 | aggregation: mean 32 | higher_is_better: true 33 | - metric: cov@16 34 | aggregation: mean 35 | higher_is_better: true 36 | - metric: cov@8 37 | aggregation: mean 38 | higher_is_better: true 39 | - metric: cov@4 40 | aggregation: mean 41 | higher_is_better: true 42 | - metric: cov@2 43 | aggregation: mean 44 | higher_is_better: true 45 | - metric: maj@64 46 | aggregation: mean 47 | higher_is_better: true 48 | - metric: maj@32 49 | aggregation: mean 50 | higher_is_better: true 51 | - metric: maj@16 52 | aggregation: mean 53 | higher_is_better: true 54 | - metric: maj@8 55 | aggregation: mean 56 | higher_is_better: true 57 | - metric: maj@4 58 | aggregation: mean 59 | higher_is_better: true 60 | - metric: maj@2 61 | aggregation: mean 62 | higher_is_better: true 63 | - metric: extracted_answers 64 | aggregation: bypass 65 | higher_is_better: true 66 | - metric: exact_matches 67 | aggregation: bypass 68 | higher_is_better: true 69 | metadata: 70 | version: 1.0 -------------------------------------------------------------------------------- /training/trl/examples/research_projects/stack_llama/scripts/README.md: -------------------------------------------------------------------------------- 1 | # RLHF pipeline for the creation of StackLLaMa: a Stack exchange llama-7b model. 2 | There were three main steps to the training process: 3 | 1. Supervised fine-tuning of the base llama-7b model to create llama-7b-se: 4 | - `torchrun --nnodes 1 --nproc_per_node 8 examples/research_projects/stack_llama/scripts/supervised_finetuning.py --model_path=<LLAMA_MODEL_PATH> --streaming --learning_rate 1e-5 --max_steps 5000 --output_dir ./llama-se` 5 | 2. Reward modeling using dialog pairs from the SE dataset and the llama-7b-se model to create llama-7b-se-rm: 6 | - `torchrun --nnodes 1 --nproc_per_node 8 examples/research_projects/stack_llama/scripts/reward_modeling.py --model_name=<LLAMA_SE_MODEL>` 7 | 3. 
RL fine-tuning of llama-7b-se with the llama-7b-se-rm reward model: 8 | - `accelerate launch --multi_gpu --num_machines 1 --num_processes 8 examples/research_projects/stack_llama/scripts/rl_training.py --log_with=wandb --model_name=<LLAMA_SE_MODEL> --reward_model_name=<LLAMA_SE_RM_MODEL> --adafactor=False --tokenizer_name=<LLAMA_TOKENIZER> --save_freq=100 --output_max_length=128 --batch_size=8 --gradient_accumulation_steps=8 --batched_gen=True --ppo_epochs=4 --seed=0 --learning_rate=1.4e-5 --early_stopping=True --output_dir=llama-se-rl-finetune-128-8-8-1.4e-5_adam` 9 | 10 | 11 | LoRA layers were used at all stages to reduce memory requirements. 12 | At each stage the peft adapter layers were merged with the base model, using: 13 | ```shell 14 | python examples/research_projects/stack_llama/scripts/merge_peft_adapter.py --adapter_model_name=XXX --base_model_name=YYY --output_name=ZZZ 15 | ``` 16 | Note that this script requires `peft>=0.3.0`. 17 | 18 | For access to the base llama-7b model, please see Meta's [release](https://ai.facebook.com/blog/large-language-model-llama-meta-ai/) and [request form](https://docs.google.com/forms/d/e/1FAIpQLSfqNECQnMkycAp2jP4Z9TFX0cGR4uf7b_fBxjY_OjhJILlKGA/viewform). 19 | -------------------------------------------------------------------------------- /lm_eval_files/aime/aime24_nofigures_agg64.yaml: -------------------------------------------------------------------------------- 1 | tag: 2 | - math_word_problems 3 | task: aime24_nofigures_agg64 4 | dataset_path: simplescaling/aime24_nofigures 5 | dataset_name: default 6 | process_docs: !function utils.process_docs 7 | output_type: generate_until 8 | test_split: train 9 | doc_to_text: !function utils.doc_to_text 10 | doc_to_target: answer 11 | process_results: !function utils.process_results 12 | generation_kwargs: 13 | until: [] 14 | do_sample: false 15 | temperature: 1 16 | max_gen_toks: 32768 17 | repeats: 64 18 | filter_list: 19 | - name: "all" # Will do coverage, majority, and take_first_k 20 | filter: 21 | - function: "take_first_k" 22 | k: 64 23 | metric_list: 24 | - metric: exact_match 25 | aggregation: mean 26 | higher_is_better: true 27 | - metric: cov@64 28 | aggregation: mean 29 | higher_is_better: true 30 | - metric: cov@32 31 | aggregation: mean 32 | higher_is_better: true 33 | - metric: cov@16 34 | aggregation: mean 35 | higher_is_better: true 36 | - metric: cov@8 37 | aggregation: mean 38 | higher_is_better: true 39 | - metric: cov@4 40 | aggregation: mean 41 | higher_is_better: true 42 | - metric: cov@2 43 | aggregation: mean 44 | higher_is_better: true 45 | - metric: maj@64 46 | aggregation: mean 47 | higher_is_better: true 48 | - metric: maj@32 49 | aggregation: mean 50 | higher_is_better: true 51 | - metric: maj@16 52 | aggregation: mean 53 | higher_is_better: true 54 | - metric: maj@8 55 | aggregation: mean 56 | higher_is_better: true 57 | - metric: maj@4 58 | aggregation: mean 59 | higher_is_better: true 60 | - metric: maj@2 61 | aggregation: mean 62 | higher_is_better: true 63 | - metric: extracted_answers 64 | aggregation: bypass 65 | higher_is_better: true 66 | - metric: exact_matches 67 | aggregation: bypass 68 | higher_is_better: true 69 | metadata: 70 | version: 1.0 -------------------------------------------------------------------------------- /lm_eval_files/aime/aime25_nofigures_maj8cov8.yaml: -------------------------------------------------------------------------------- 1 | tag: 2 | - math_word_problems 3 | task: aime25_nofigures_maj8cov8 4 | dataset_path: TIGER-Lab/AIME25 5 | dataset_name: default 6 | 7 | process_docs: 
!function utils.process_docs 8 | doc_to_text: !function utils.doc_to_text 9 | doc_to_target: answer 10 | process_results: !function utils.process_results 11 | 12 | output_type: generate_until 13 | test_split: train 14 | 15 | generation_kwargs: 16 | until: [] 17 | do_sample: false # deterministic 18 | temperature: 0.6 19 | max_gen_toks: 32768 20 | 21 | repeats: 8 # 8 samples per problem 22 | 23 | # one catch-all slice — utils.process_results will compute cov/maj on it 24 | filter_list: 25 | - name: "all" 26 | filter: 27 | - function: "take_first_k" 28 | k: 8 29 | 30 | metric_list: 31 | - metric: exact_match # plain accuracy 32 | aggregation: mean 33 | higher_is_better: true 34 | 35 | - metric: cov@2 # oracle coverage over the first 2 samples 36 | aggregation: mean 37 | higher_is_better: true 38 | 39 | - metric: maj@2 # majority-vote accuracy over the first 2 samples 40 | aggregation: mean 41 | higher_is_better: true 42 | 43 | 44 | - metric: cov@4 # oracle coverage over the first 4 samples 45 | aggregation: mean 46 | higher_is_better: true 47 | 48 | - metric: maj@4 # majority-vote accuracy over the first 4 samples 49 | aggregation: mean 50 | higher_is_better: true 51 | 52 | 53 | - metric: cov@8 # oracle coverage over all 8 samples 54 | aggregation: mean 55 | higher_is_better: true 56 | 57 | - metric: maj@8 # majority-vote accuracy over all 8 samples 58 | aggregation: mean 59 | higher_is_better: true 60 | 61 | # two “bypass” metrics emitted by utils.process_results 62 | - metric: extracted_answers 63 | aggregation: bypass 64 | higher_is_better: true 65 | 66 | - metric: exact_matches 67 | aggregation: bypass 68 | higher_is_better: true 69 | 70 | metadata: 71 | version: 1.0 72 | -------------------------------------------------------------------------------- /lm_eval_files/aime/aime24_nofigures_maj8cov8.yaml: -------------------------------------------------------------------------------- 1 | tag: 2 | - math_word_problems 3 | task: aime24_nofigures_maj8cov8 4 | dataset_path: simplescaling/aime24_nofigures 5 | dataset_name: default 6 | 7 | process_docs: !function utils.process_docs 8 | doc_to_text: !function utils.doc_to_text 9 | doc_to_target: answer 10 | process_results: !function utils.process_results 11 | 12 | output_type: generate_until 13 | test_split: train 14 | 15 | generation_kwargs: 16 | until: [] 17 | do_sample: false # deterministic 18 | temperature: 0.6 19 | max_gen_toks: 32768 20 | 21 | repeats: 8 # 8 samples per problem 22 | 23 | # one catch-all slice — utils.process_results will compute cov/maj on it 24 | filter_list: 25 | - name: "all" 26 | filter: 27 | - function: "take_first_k" 28 | k: 8 29 | 30 | metric_list: 31 | - metric: exact_match # plain accuracy 32 | aggregation: mean 33 | higher_is_better: true 34 | 35 | - metric: cov@2 # oracle coverage over the first 2 samples 36 | aggregation: mean 37 | higher_is_better: true 38 | 39 | - metric: maj@2 # majority-vote accuracy over the first 2 samples 40 | aggregation: mean 41 | higher_is_better: true 42 | 43 | 44 | - metric: cov@4 # oracle coverage over the first 4 samples 45 | aggregation: mean 46 | higher_is_better: true 47 | 48 | - metric: maj@4 # majority-vote accuracy over the first 4 samples 49 | aggregation: mean 50 | higher_is_better: true 51 | 52 | 53 | - metric: cov@8 # oracle coverage over all 8 samples 54 | aggregation: mean 55 | higher_is_better: true 56 | 57 | - metric: maj@8 # majority-vote accuracy over all 8 samples 58 | aggregation: mean 59 | higher_is_better: true 60 | 61 | # two “bypass” metrics emitted by utils.process_results 62 | - metric: extracted_answers 63 | aggregation: bypass 64 | higher_is_better: true 65 | 66 | - metric: exact_matches 67 | aggregation: bypass 68 | higher_is_better: true 69 |
70 | metadata: 71 | version: 1.0 -------------------------------------------------------------------------------- /training/open-r1/scripts/upload_details.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2025 The HuggingFace Inc. team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ 16 | Push the details from a LightEval run to the Hub. 17 | 18 | Usage: 19 | 20 | python scripts/upload_details.py \ 21 | --data_files {path_to_parquet_file} \ 22 | --hub_repo_id {hub_repo_id} \ 23 | --config_name {config_name} 24 | """ 25 | 26 | from dataclasses import dataclass, field 27 | from typing import List 28 | 29 | from datasets import load_dataset 30 | from transformers import HfArgumentParser 31 | 32 | 33 | @dataclass 34 | class ScriptArguments: 35 | data_files: List[str] = field(default_factory=list) 36 | hub_repo_id: str = None 37 | config_name: str = None 38 | 39 | 40 | def main(): 41 | parser = HfArgumentParser(ScriptArguments) 42 | args = parser.parse_args_into_dataclasses()[0] 43 | 44 | if all(file.endswith(".json") for file in args.data_files): 45 | ds = load_dataset("json", data_files=args.data_files) 46 | elif all(file.endswith(".jsonl") for file in args.data_files): 47 | ds = load_dataset("json", data_files=args.data_files) 48 | else: 49 | ds = load_dataset("parquet", data_files=args.data_files) 50 | url = ds.push_to_hub(args.hub_repo_id, config_name=args.config_name, private=True) 51 | print(f"Dataset available at: {url}") 52 | 53 | 54 | if __name__ == "__main__": 55 | main() 56 | -------------------------------------------------------------------------------- /training/trl/trl/trainer/nash_md_config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from dataclasses import dataclass, field 16 | 17 | from trl.trainer.online_dpo_config import OnlineDPOConfig 18 | 19 | 20 | @dataclass 21 | class NashMDConfig(OnlineDPOConfig): 22 | r""" 23 | Configuration class for the [`NashMDTrainer`]. 24 | 25 | Subclass of [`OnlineDPOConfig`]; we can use all its arguments and add the following: 26 | 27 | Parameters: 28 | mixture_coef (`float` or `list[float]`, *optional*, defaults to `0.5`): 29 | Logit mixture coefficient for the model and reference model. 
If a list of floats is provided then the 30 | mixture coefficient is selected for each new epoch and the last coefficient is used for the rest of the 31 | epochs. 32 | """ 33 | 34 | mixture_coef: list[float] = field( 35 | default_factory=lambda: [0.5], 36 | metadata={ 37 | "help": "Logit mixture coefficient for the model and reference model. If a list of floats is provided " 38 | "then the mixture coefficient is selected for each new epoch and the last coefficient is used for the " 39 | "rest of the epochs." 40 | }, 41 | ) 42 | 43 | def __post_init__(self): 44 | super().__post_init__() 45 | if hasattr(self.mixture_coef, "__len__") and len(self.mixture_coef) == 1: 46 | self.mixture_coef = self.mixture_coef[0] 47 | -------------------------------------------------------------------------------- /lm_eval_files/openai_math/openai_math_agg64.yaml: -------------------------------------------------------------------------------- 1 | tag: 2 | - math_word_problems 3 | task: openai_math_agg64 4 | dataset_path: simplescaling/openaimath 5 | process_docs: !function utils.process_docs 6 | output_type: generate_until 7 | test_split: test 8 | doc_to_text: !function utils.doc_to_text 9 | doc_to_target: answer 10 | process_results: !function utils.process_results 11 | generation_kwargs: 12 | until: [] 13 | do_sample: false 14 | temperature: 1 15 | max_gen_toks: 2048 # 2x of https://github.com/openai/simple-evals/blob/c0dba4c7bfbc17f786aec7bd7c3585a36ad81f23/sampler/chat_completion_sampler.py#L27 16 | repeats: 64 17 | filter_list: 18 | - name: "all" # Will do coverage, majority, and take_first_k 19 | filter: 20 | - function: "take_first_k" 21 | k: 64 22 | metric_list: 23 | - metric: exact_match 24 | aggregation: mean 25 | higher_is_better: true 26 | - metric: cov@64 27 | aggregation: mean 28 | higher_is_better: true 29 | - metric: cov@32 30 | aggregation: mean 31 | higher_is_better: true 32 | - metric: cov@16 33 | aggregation: mean 34 | higher_is_better: true 35 | - metric: cov@8 36 | aggregation: mean 37 | higher_is_better: true 38 | - metric: cov@4 39 | aggregation: mean 40 | higher_is_better: true 41 | - metric: cov@2 42 | aggregation: mean 43 | higher_is_better: true 44 | - metric: maj@64 45 | aggregation: mean 46 | higher_is_better: true 47 | - metric: maj@32 48 | aggregation: mean 49 | higher_is_better: true 50 | - metric: maj@16 51 | aggregation: mean 52 | higher_is_better: true 53 | - metric: maj@8 54 | aggregation: mean 55 | higher_is_better: true 56 | - metric: maj@4 57 | aggregation: mean 58 | higher_is_better: true 59 | - metric: maj@2 60 | aggregation: mean 61 | higher_is_better: true 62 | - metric: extracted_answers 63 | aggregation: bypass 64 | higher_is_better: true 65 | - metric: exact_matches 66 | aggregation: bypass 67 | higher_is_better: true 68 | metadata: 69 | version: 1.0 -------------------------------------------------------------------------------- /training/open-r1/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code_ioi.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: Qwen/Qwen2.5-1.5B-Instruct 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: open-r1/ioi 9 | dataset_prompt_column: problem 10 | system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" 11 | 12 | # GRPO trainer config 13 | beta: 0.01 14 | bf16: true 15 | use_vllm: true 16 | do_eval: false 17 | gradient_accumulation_steps: 4 18 | gradient_checkpointing: true 19 | gradient_checkpointing_kwargs: 20 | use_reentrant: false 21 | hub_model_id: Qwen2.5-1.5B-Open-R1-Code-GRPO 22 | hub_strategy: every_save 23 | learning_rate: 5.0e-06 24 | log_completions: true 25 | log_level: info 26 | logging_first_step: true 27 | logging_steps: 1 28 | logging_strategy: steps 29 | lr_scheduler_type: cosine_with_min_lr 30 | lr_scheduler_kwargs: 31 | min_lr_rate: 0.1 32 | max_prompt_length: 1024 33 | max_completion_length: 2048 34 | max_steps: 500 35 | num_generations: 14 36 | num_train_epochs: 1 37 | output_dir: data/Qwen2.5-1.5B-Open-R1-Code-GRPO 38 | overwrite_output_dir: true 39 | per_device_train_batch_size: 16 40 | push_to_hub: true 41 | report_to: 42 | - wandb 43 | save_strategy: "steps" 44 | save_steps: 50 45 | save_total_limit: 1 46 | seed: 42 47 | temperature: 1.0 48 | warmup_ratio: 0.03 49 | # ioi specific config 50 | code_language: cpp 51 | reward_funcs: 52 | - ioi_code 53 | - code_format 54 | - format 55 | reward_weights: 56 | - 1.0 57 | - 0.1 58 | - 0.1 59 | # for each generation, evaluate these many test cases in parallel, then check if any of them failed (0 score): if so stop evaluating 60 | # otherwise continue with the next batch of test cases. Useful to avoid overloading the eval server + save time on wrong solutions 61 | code_eval_test_batch_size: 3 -------------------------------------------------------------------------------- /lm_eval_files/openai_math/openai_math_maj64_cov64.yaml: -------------------------------------------------------------------------------- 1 | group: 2 | - math_word_problems 3 | task: openai_math_maj64_cov64 4 | dataset_path: simplescaling/openaimath 5 | process_docs: !function utils.process_docs 6 | output_type: generate_until 7 | test_split: test 8 | doc_to_text: !function utils.doc_to_text 9 | doc_to_target: answer 10 | process_results: !function utils.process_results 11 | generation_kwargs: 12 | until: [] 13 | do_sample: true 14 | temperature: 0.5 15 | max_gen_toks: 2048 # 2x of https://github.com/openai/simple-evals/blob/c0dba4c7bfbc17f786aec7bd7c3585a36ad81f23/sampler/chat_completion_sampler.py#L27 16 | metric_list: 17 | - metric: exact_match 18 | aggregation: mean 19 | higher_is_better: true 20 | repeats: 64 21 | filter_list: 22 | - name: "score-first" # pick only the first response, and report metrics on that 23 | filter: 24 | - function: "take_first" 25 | - name: "maj@64" 26 | filter: 27 | - function: "majority_vote" 28 | - function: "take_first" 29 | - name: "maj@16" # get Maj@16, via selecting the first 8 responses. Using a better estimator would be optimal. 30 | filter: 31 | - function: "take_first_k" 32 | k: 16 33 | - function: "majority_vote" 34 | - function: "take_first" 35 | - name: "maj@32" # get Maj@8 , via selecting the first 8 responses. Using a better estimator would be optimal. 36 | filter: 37 | - function: "take_first_k" 38 | k: 32 39 | - function: "majority_vote" 40 | - function: "take_first" 41 | - name: "maj@8" # get Maj@8 , via selecting the first 8 responses. Using a better estimator would be optimal. 42 | filter: 43 | - function: "take_first_k" 44 | k: 8 45 | - function: "majority_vote" 46 | - function: "take_first" 47 | - name: "cov@64" # get coverage@64 , via allowing all 64 samples and then picking only the correct one in the evaluator. 
48 | filter: 49 | - function: "take_first_k" 50 | k: 64 51 | - name: "cov@16" 52 | filter: 53 | - function: "take_first_k" 54 | k: 16 55 | - name: "cov@8" 56 | filter: 57 | - function: "take_first_k" 58 | k: 8 59 | metadata: 60 | version: 1.0 -------------------------------------------------------------------------------- /training/trl/examples/scripts/sft_gemma3.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """ 16 | Train Gemma-3 on the Codeforces COTS dataset. 17 | 18 | accelerate launch --config_file examples/accelerate_configs/deepspeed_zero3.yaml examples/scripts/sft_gemma3.py 19 | """ 20 | 21 | from datasets import load_dataset 22 | from transformers import AutoModelForImageTextToText 23 | 24 | from trl import SFTConfig, SFTTrainer 25 | 26 | 27 | def main(): 28 | # Load dataset 29 | train_dataset = load_dataset("open-r1/codeforces-cots", split="train") 30 | train_dataset = train_dataset.remove_columns("prompt") 31 | 32 | # Load model 33 | model_id = "google/gemma-3-12b-it" 34 | model = AutoModelForImageTextToText.from_pretrained(model_id, attn_implementation="eager") 35 | 36 | # Train model 37 | training_args = SFTConfig( 38 | output_dir=f"{model_id}-codeforces-SFT", 39 | logging_steps=10, 40 | bf16=True, 41 | use_liger_kernel=True, 42 | gradient_checkpointing=True, 43 | gradient_checkpointing_kwargs={"use_reentrant": False}, 44 | max_length=8192, 45 | per_device_train_batch_size=1, 46 | gradient_accumulation_steps=8, 47 | dataset_num_proc=32, 48 | num_train_epochs=1, 49 | ) 50 | trainer = SFTTrainer( 51 | args=training_args, 52 | model=model, 53 | train_dataset=train_dataset, 54 | ) 55 | trainer.train() 56 | 57 | # Push to hub 58 | trainer.push_to_hub(dataset_name="open-r1/codeforces-cots") 59 | 60 | 61 | if __name__ == "__main__": 62 | main() 63 | -------------------------------------------------------------------------------- /annotated_dataset/annotation_statistics.csv: -------------------------------------------------------------------------------- 1 | qid,model,offload_chars,total_chars,offload_tokens,total_tokens,offload_tokens_per_offload_chars,offload_percentage 2 | 0,GPT4o,1694,7601,649,2797,46.357142857142854,22.29 3 | 1,GPT4o,2151,7948,536,2802,178.66666666666666,27.06 4 | 2,GPT4o,2661,7387,654,2330,654.0,36.02 5 | 3,GPT4o,4508,12079,1086,3362,543.0,37.32 6 | 4,GPT4o,20755,50830,6046,16003,863.7142857142857,40.83 7 | 5,GPT4o,20359,53514,5916,16353,394.4,38.04 8 | 6,GPT4o,2222,6133,659,1955,219.66666666666666,36.23 9 | 7,GPT4o,1796,6067,590,1907,34.705882352941174,29.60 10 | 8,GPT4o,1138,4058,396,1512,79.2,28.04 11 | 9,GPT4o,1328,4823,466,1849,77.66666666666667,27.53 12 | 10,GPT4o,9997,28348,4045,10348,139.48275862068965,35.27 13 | 11,GPT4o,11383,27390,4012,9572,401.2,41.56 14 | 12,GPT4o,2222,9725,877,3647,109.625,22.85 15 | 13,GPT4o,1773,6931,554,2646,55.4,25.58 16 | 
14,GPT4o,2332,8405,948,3210,105.33333333333333,27.75 17 | 15,GPT4o,2546,11853,923,4225,307.6666666666667,21.48 18 | 16,GPT4o,10267,36730,3581,12565,716.2,27.95 19 | 17,GPT4o,5054,23599,2018,8501,672.6666666666666,21.42 20 | 18,GPT4o,3009,9190,974,3165,974.0,32.74 21 | 19,GPT4o,3183,12106,963,3900,321.0,26.29 22 | 20,GPT4o,5757,17304,1825,6325,608.3333333333334,33.27 23 | 21,GPT4o,5362,17201,1870,2740,935.0,31.17 24 | 0,Dpsr1,1755,7601,663,2797,44.2,23.09 25 | 1,Dpsr1,1767,7948,594,2802,66.0,22.23 26 | 2,Dpsr1,2872,7387,706,2330,353.0,38.88 27 | 3,Dpsr1,4866,12079,1166,3362,388.6666666666667,40.28 28 | 4,Dpsr1,9438,50830,2497,16003,624.25,18.57 29 | 5,Dpsr1,9886,53514,2539,16353,846.3333333333334,18.47 30 | 6,Dpsr1,2569,6133,768,1955,128.0,41.89 31 | 7,Dpsr1,2067,6067,590,1907,590.0,34.07 32 | 8,Dpsr1,1149,4058,391,1512,65.16666666666667,28.31 33 | 9,Dpsr1,1208,4823,428,1849,47.55555555555556,25.05 34 | 10,Dpsr1,6412,28348,2363,10348,236.3,22.62 35 | 11,Dpsr1,7152,27390,2545,9572,509.0,26.11 36 | 12,Dpsr1,2698,9725,1024,3647,85.33333333333333,27.74 37 | 13,Dpsr1,2000,6931,630,2646,630.0,28.86 38 | 14,Dpsr1,2254,8405,899,3210,149.83333333333334,26.82 39 | 15,Dpsr1,3002,11853,1125,4225,93.75,25.33 40 | 16,Dpsr1,6112,36730,2340,12565,137.64705882352942,16.64 41 | 17,Dpsr1,5963,23599,2399,8501,266.55555555555554,25.27 42 | 18,Dpsr1,3068,9190,988,3165,988.0,33.38 43 | 19,Dpsr1,3162,12106,951,3900,475.5,26.12 44 | 20,Dpsr1,5727,17304,1841,6325,263.0,33.10 45 | 21,Dpsr1,1982,17201,759,2740,27.107142857142858,11.52 -------------------------------------------------------------------------------- /training/trl/trl/templates/lm_model_card.md: -------------------------------------------------------------------------------- 1 | --- 2 | {{ card_data }} 3 | --- 4 | 5 | # Model Card for {{ model_name }} 6 | 7 | This model is a fine-tuned version of [{{ base_model }}](https://huggingface.co/{{ base_model }}){% if dataset_name %} on the [{{ dataset_name }}](https://huggingface.co/datasets/{{ dataset_name }}) dataset{% endif %}. 8 | It has been trained using [TRL](https://github.com/huggingface/trl). 9 | 10 | ## Quick start 11 | 12 | ```python 13 | from transformers import pipeline 14 | 15 | question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" 16 | generator = pipeline("text-generation", model="{{ hub_model_id }}", device="cuda") 17 | output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] 18 | print(output["generated_text"]) 19 | ``` 20 | 21 | ## Training procedure 22 | 23 | {% if wandb_url %}[Visualize in Weights & Biases]({{ wandb_url }}){% endif %} 24 | {% if comet_url %}[Visualize in Comet]({{ comet_url }}){% endif %} 25 | 26 | This model was trained with {{ trainer_name }}{% if paper_id %}, a method introduced in [{{ paper_title }}](https://huggingface.co/papers/{{ paper_id }}){% endif %}. 
27 | 28 | ### Framework versions 29 | 30 | - TRL: {{ trl_version }} 31 | - Transformers: {{ transformers_version }} 32 | - Pytorch: {{ pytorch_version }} 33 | - Datasets: {{ datasets_version }} 34 | - Tokenizers: {{ tokenizers_version }} 35 | 36 | ## Citations 37 | 38 | {% if trainer_citation %}Cite {{ trainer_name }} as: 39 | 40 | ```bibtex 41 | {{ trainer_citation }} 42 | ```{% endif %} 43 | 44 | Cite TRL as: 45 | 46 | ```bibtex 47 | {% raw %}@misc{vonwerra2022trl, 48 | title = {{TRL: Transformer Reinforcement Learning}}, 49 | author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallouédec}, 50 | year = 2020, 51 | journal = {GitHub repository}, 52 | publisher = {GitHub}, 53 | howpublished = {\url{https://github.com/huggingface/trl}} 54 | }{% endraw %} 55 | ``` 56 | -------------------------------------------------------------------------------- /training/trl/docker/trl-latest-gpu/Dockerfile: -------------------------------------------------------------------------------- 1 | # Builds GPU docker image of PyTorch 2 | # Uses multi-staged approach to reduce size 3 | # Stage 1 4 | # Use base conda image to reduce time 5 | FROM continuumio/miniconda3:latest AS compile-image 6 | # Specify py version 7 | ENV PYTHON_VERSION=3.10 8 | # Install apt libs - copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile 9 | RUN apt-get update && \ 10 | apt-get install -y curl git wget software-properties-common git-lfs && \ 11 | apt-get clean && \ 12 | rm -rf /var/lib/apt/lists* 13 | 14 | # Install audio-related libraries 15 | RUN apt-get update && \ 16 | apt install -y ffmpeg 17 | 18 | RUN apt install -y libsndfile1-dev 19 | RUN git lfs install 20 | 21 | # Create our conda env - copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile 22 | RUN conda create --name trl python=${PYTHON_VERSION} ipython jupyter pip 23 | RUN python3 -m pip install --no-cache-dir --upgrade pip 24 | 25 | # Below is copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile 26 | # We don't install pytorch here yet since CUDA isn't available 27 | # instead we use the direct torch wheel 28 | ENV PATH /opt/conda/envs/trl/bin:$PATH 29 | # Activate our bash shell 30 | RUN chsh -s /bin/bash 31 | SHELL ["/bin/bash", "-c"] 32 | 33 | # Stage 2 34 | FROM nvidia/cuda:12.2.2-devel-ubuntu22.04 AS build-image 35 | COPY --from=compile-image /opt/conda /opt/conda 36 | ENV PATH /opt/conda/bin:$PATH 37 | 38 | RUN chsh -s /bin/bash 39 | SHELL ["/bin/bash", "-c"] 40 | RUN source activate trl && \ 41 | python3 -m pip install --no-cache-dir bitsandbytes optimum auto-gptq 42 | 43 | # Install apt libs 44 | RUN apt-get update && \ 45 | apt-get install -y curl git wget && \ 46 | apt-get clean && \ 47 | rm -rf /var/lib/apt/lists* 48 | 49 | # Activate the conda env and install transformers + accelerate from source 50 | RUN source activate trl && \ 51 | python3 -m pip install -U --no-cache-dir \ 52 | librosa \ 53 | "soundfile>=0.12.1" \ 54 | scipy \ 55 | transformers \ 56 | accelerate \ 57 | peft \ 58 | trl[test]@git+https://github.com/huggingface/trl 59 | 60 | RUN source activate trl && \ 61 | pip freeze | grep trl 62 | 63 | RUN echo "source activate trl" >> ~/.profile 64 | 65 | # Activate the virtualenv 66 | CMD ["/bin/bash"] -------------------------------------------------------------------------------- 
/training/open-r1/tests/slow/test_code_reward.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | import unittest 17 | 18 | from datasets import load_dataset 19 | 20 | from open_r1.rewards import code_reward, ioi_code_reward 21 | 22 | 23 | class TestCodeRewards(unittest.TestCase): 24 | def test_python_code_reward(self): 25 | # requires E2B, see the README.md file 26 | code_dataset = load_dataset("open-r1/verifiable-coding-problems-python_decontaminated-tested") 27 | NUM_SAMPLES = 20 28 | samples = code_dataset["train"].select(range(NUM_SAMPLES)) 29 | test_completions = [[{"content": sample["gold_standard_solution"]}] for sample in samples] 30 | reward_kwargs = {"verification_info": [sample["verification_info"] for sample in samples]} 31 | rewards = code_reward(test_completions, **reward_kwargs) 32 | print(rewards) 33 | assert rewards == [1.0] * NUM_SAMPLES 34 | 35 | def test_ioi_code_reward(self): 36 | # This slow test case requires spinning up a bunch (I tested with ~64) of piston workers, see docs here 37 | # slurm/piston/README.md 38 | code_dataset = load_dataset("open-r1/ioi-reward-test-dataset") 39 | NUM_SAMPLES = 16 40 | samples = code_dataset["train"].select(range(NUM_SAMPLES)) 41 | test_completions = [[{"content": f"```cpp\n{sample['sample_solution']}```"}] for sample in samples] 42 | keys = [key for key in samples[0] if key not in ["prompt", "completion"]] 43 | reward_kwargs = {key: [example[key] for example in samples] for key in keys} 44 | rewards = ioi_code_reward(test_completions, **reward_kwargs) 45 | print(rewards) 46 | assert rewards == [1.0] * NUM_SAMPLES 47 | 48 | 49 | if __name__ == "__main__": 50 | unittest.main() 51 | -------------------------------------------------------------------------------- /training/trl/.gitignore: -------------------------------------------------------------------------------- 1 | *.bak 2 | .gitattributes 3 | .last_checked 4 | .gitconfig 5 | *.bak 6 | *.log 7 | *~ 8 | ~* 9 | _tmp* 10 | tmp* 11 | tags 12 | 13 | # Byte-compiled / optimized / DLL files 14 | __pycache__/ 15 | *.py[cod] 16 | *$py.class 17 | 18 | # C extensions 19 | *.so 20 | 21 | # Distribution / packaging 22 | .Python 23 | env/ 24 | build/ 25 | develop-eggs/ 26 | dist/ 27 | downloads/ 28 | eggs/ 29 | .eggs/ 30 | lib/ 31 | lib64/ 32 | parts/ 33 | sdist/ 34 | var/ 35 | wheels/ 36 | *.egg-info/ 37 | .installed.cfg 38 | *.egg 39 | 40 | # PyInstaller 41 | # Usually these files are written by a python script from a template 42 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
43 | *.manifest 44 | *.spec 45 | 46 | # Installer logs 47 | pip-log.txt 48 | pip-delete-this-directory.txt 49 | 50 | # Unit test / coverage reports 51 | htmlcov/ 52 | .tox/ 53 | .coverage 54 | .coverage.* 55 | .cache 56 | nosetests.xml 57 | coverage.xml 58 | *.cover 59 | .hypothesis/ 60 | 61 | # Translations 62 | *.mo 63 | *.pot 64 | 65 | # Django stuff: 66 | *.log 67 | local_settings.py 68 | 69 | # Flask stuff: 70 | instance/ 71 | .webassets-cache 72 | 73 | # Scrapy stuff: 74 | .scrapy 75 | 76 | # Sphinx documentation 77 | docs/_build/ 78 | 79 | # PyBuilder 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # celery beat schedule file 89 | celerybeat-schedule 90 | 91 | # SageMath parsed files 92 | *.sage.py 93 | 94 | # dotenv 95 | .env 96 | 97 | # virtualenv 98 | .venv 99 | venv/ 100 | ENV/ 101 | 102 | # Spyder project settings 103 | .spyderproject 104 | .spyproject 105 | 106 | # Rope project settings 107 | .ropeproject 108 | 109 | # mkdocs documentation 110 | /site 111 | 112 | # mypy 113 | .mypy_cache/ 114 | 115 | .vscode 116 | *.swp 117 | 118 | # osx generated files 119 | .DS_Store 120 | .DS_Store? 121 | .Trashes 122 | ehthumbs.db 123 | Thumbs.db 124 | .idea 125 | 126 | # pytest 127 | .pytest_cache 128 | 129 | # tools/trust-doc-nbs 130 | docs_src/.last_checked 131 | 132 | # symlinks to fastai 133 | docs_src/fastai 134 | tools/fastai 135 | 136 | # link checker 137 | checklink/cookies.txt 138 | 139 | # .gitconfig is now autogenerated 140 | .gitconfig 141 | 142 | # wandb files 143 | nbs/wandb/ 144 | examples/notebooks/wandb/ 145 | wandb/ -------------------------------------------------------------------------------- /training/open-r1/offload_read_graph.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import matplotlib.pyplot as plt 3 | 4 | 5 | def get_bigmodel_mask(text, open_tag="<bigmodel>", close_tag="</bigmodel>"): 6 | mask = [0] * len(text) 7 | start_index = 0 8 | 9 | while True: 10 | open_pos = text.find(open_tag, start_index) 11 | if open_pos == -1: 12 | break # no more openings 13 | 14 | close_pos = text.find(close_tag, open_pos + len(open_tag)) 15 | if close_pos == -1: 16 | # If we can't find a close tag, mark until the end of the text 17 | for i in range(open_pos, len(text)): 18 | mask[i] = 1 19 | break 20 | else: 21 | # Mark the region from <bigmodel> ... </bigmodel> 22 | region_end = close_pos + len(close_tag) 23 | for i in range(open_pos, region_end): 24 | mask[i] = 1 25 | start_index = region_end 26 | 27 | return mask
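# Quick illustration (hypothetical input, not from the pipeline): for
# text = "a<bigmodel>bc</bigmodel>d", get_bigmodel_mask(text) sets 1 for every
# character of "<bigmodel>bc</bigmodel>" (tags included) and 0 for "a" and "d",
# so sum(mask) / len(mask) is the offloaded fraction that main() below plots.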
28 | 29 | def main(): 30 | # 1) Load the pickle file 31 | with open("bigmodel_span_text.pkl", "rb") as f: 32 | outputs = pickle.load(f) 33 | # 2) Create a 5x2 figure 34 | fig, axs = plt.subplots(5, 2, figsize=(14, 10)) 35 | axs = axs.flatten() 36 | 37 | # 3) For each of the 10 items in 'outputs', compute mask and plot 38 | for i in range(10): 39 | ax = axs[i] 40 | text = outputs[i] 41 | 42 | mask = get_bigmodel_mask(text) 43 | 44 | # If the text is empty or mask is empty, skip 45 | if not text or not mask: 46 | ax.set_title(f"Example {i} (no content)") 47 | ax.set_xticks([]) 48 | ax.set_yticks([]) 49 | continue 50 | 51 | # x from 0 to 1 across the character range 52 | x = [k / len(mask) for k in range(len(mask))] 53 | y = mask # 0/1 values 54 | # calculate coverage (percentage of 1s) 55 | coverage = 100.0 * sum(mask) / len(mask) 56 | 57 | # Plot with step 58 | ax.step(x, y, where='post') 59 | ax.set_ylim(-0.1, 1.1) # 0 or 1 only 60 | ax.set_xlim(0, 1) 61 | ax.set_yticks([]) 62 | ax.set_title(f"Example {i} ({coverage:.1f}% covered)") 63 | 64 | plt.tight_layout() 65 | plt.savefig("switch_behavior.pdf") 66 | plt.clf() # Clear the figure from memory 67 | plt.close() # Close the plotting window 68 | 69 | if __name__ == "__main__": 70 | main() 71 | -------------------------------------------------------------------------------- /modes/placeholder.py: -------------------------------------------------------------------------------- 1 | # modes/placeholder.py 2 | 3 | from pprint import pprint 4 | import os 5 | import datetime 6 | 7 | 8 | def run_placeholder_flow( 9 | question, 10 | big_model, 11 | big_model_port, 12 | small_model, 13 | small_model_port, 14 | generate_text_vllm, 15 | max_tokens=1024, 16 | temperature=0.7, 17 | test_logging: bool = False, 18 | ): 19 | """ 20 | A baseline 'placeholder' flow: we send the same single request to the 21 | *big_model* (and, for comparison, the *small_model*) and return the big 22 | model's reply as the final answer, plus usage data. 23 | """ 24 | usage_data = [] 25 | 26 | # Basic prompt 27 | if "|" not in question: 28 | prompt = f"<|begin▁of▁sentence|><|User|>{question}<|Assistant|>" 29 | else: 30 | prompt = f"{question}" 31 | 32 | print("Sending request to big model") 33 | # Single big model request 34 | resp_json, latency_big = generate_text_vllm( 35 | prompt, 36 | port=big_model_port, 37 | temperature=temperature, 38 | max_tokens=max_tokens, 39 | model=big_model 40 | ) 41 | usage_dict_big = resp_json.get("usage", {}) 42 | final_reply = resp_json["choices"][0]["text"] 43 | 44 | # Small model request with the same prompt 45 | resp_json, latency_small = generate_text_vllm( 46 | prompt, 47 | port=small_model_port, 48 | temperature=temperature, 49 | max_tokens=max_tokens, 50 | model=small_model 51 | ) 52 | 53 | usage_dict_small = resp_json.get("usage", {}) 54 | 55 | usage_data.append({ 56 | "Model": big_model, 57 | "ThinkIter": "placeholder", 58 | "DraftVersion": 0, 59 | "PromptTokens": usage_dict_big.get("prompt_tokens", 0), # Always expect this item. 60 | "CompletionTokens": usage_dict_big.get("completion_tokens", 0), # Always expect this item. 61 | "Latency": latency_big, # Always expect this item.
61 | "ModelSmall": small_model, 62 | "PromptTokensSmall": usage_dict_small.get("prompt_tokens", 0), 63 | "CompletionTokensSmall": usage_dict_small.get("completion_tokens", 0), 64 | "LatencySmall": latency_small 65 | 66 | }) 67 | 68 | pprint(usage_data) 69 | final_reply_small = resp_json["choices"][0]["text"] 70 | print("Final reply from small model:\n\n", final_reply_small) 71 | print("\n\nFinal reply from big model:\n\n", final_reply) 72 | return final_reply, usage_data 73 | -------------------------------------------------------------------------------- /training/trl/docker/trl-source-gpu/Dockerfile: -------------------------------------------------------------------------------- 1 | # Builds GPU docker image of PyTorch 2 | # Uses multi-staged approach to reduce size 3 | # Stage 1 4 | # Use base conda image to reduce time 5 | FROM continuumio/miniconda3:latest AS compile-image 6 | # Specify py version 7 | ENV PYTHON_VERSION=3.10 8 | # Install apt libs - copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile 9 | RUN apt-get update && \ 10 | apt-get install -y curl git wget software-properties-common git-lfs && \ 11 | apt-get clean && \ 12 | rm -rf /var/lib/apt/lists* 13 | 14 | # Install audio-related libraries 15 | RUN apt-get update && \ 16 | apt install -y ffmpeg 17 | 18 | RUN apt install -y libsndfile1-dev 19 | RUN git lfs install 20 | 21 | # Create our conda env - copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile 22 | RUN conda create --name trl python=${PYTHON_VERSION} ipython jupyter pip 23 | RUN python3 -m pip install --no-cache-dir --upgrade pip 24 | 25 | # Below is copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile 26 | # We don't install pytorch here yet since CUDA isn't available 27 | # instead we use the direct torch wheel 28 | ENV PATH /opt/conda/envs/trl/bin:$PATH 29 | # Activate our bash shell 30 | RUN chsh -s /bin/bash 31 | SHELL ["/bin/bash", "-c"] 32 | 33 | # Stage 2 34 | FROM nvidia/cuda:12.2.2-devel-ubuntu22.04 AS build-image 35 | COPY --from=compile-image /opt/conda /opt/conda 36 | ENV PATH /opt/conda/bin:$PATH 37 | 38 | RUN chsh -s /bin/bash 39 | SHELL ["/bin/bash", "-c"] 40 | RUN source activate trl && \ 41 | python3 -m pip install --no-cache-dir bitsandbytes optimum auto-gptq 42 | 43 | # Install apt libs 44 | RUN apt-get update && \ 45 | apt-get install -y curl git wget && \ 46 | apt-get clean && \ 47 | rm -rf /var/lib/apt/lists* 48 | 49 | # Activate the conda env and install transformers + accelerate from source 50 | RUN source activate trl && \ 51 | python3 -m pip install -U --no-cache-dir \ 52 | librosa \ 53 | "soundfile>=0.12.1" \ 54 | scipy \ 55 | git+https://github.com/huggingface/transformers \ 56 | git+https://github.com/huggingface/accelerate \ 57 | git+https://github.com/huggingface/peft \ 58 | trl[test]@git+https://github.com/huggingface/trl 59 | 60 | RUN source activate trl && \ 61 | pip freeze | grep transformers 62 | 63 | RUN echo "source activate trl" >> ~/.profile 64 | 65 | # Activate the virtualenv 66 | CMD ["/bin/bash"] -------------------------------------------------------------------------------- /training/trl/tests/test_rich_progress_callback.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import tempfile 16 | import unittest 17 | 18 | import torch 19 | import torch.nn as nn 20 | from datasets import Dataset 21 | from transformers import Trainer, TrainingArguments 22 | 23 | from trl.trainer.callbacks import RichProgressCallback 24 | 25 | 26 | class DummyModel(nn.Module): 27 | def __init__(self): 28 | super().__init__() 29 | self.a = nn.Parameter(torch.tensor(1.0)) 30 | 31 | def forward(self, x): 32 | return self.a * x 33 | 34 | 35 | class TestRichProgressCallback(unittest.TestCase): 36 | def setUp(self): 37 | self.dummy_model = DummyModel() 38 | self.dummy_train_dataset = Dataset.from_list([{"x": 1.0, "y": 2.0}] * 5) 39 | self.dummy_val_dataset = Dataset.from_list([{"x": 1.0, "y": 2.0}] * 101) 40 | 41 | def test_rich_progress_callback_logging(self): 42 | with tempfile.TemporaryDirectory() as tmp_dir: 43 | training_args = TrainingArguments( 44 | output_dir=tmp_dir, 45 | per_device_eval_batch_size=2, 46 | per_device_train_batch_size=2, 47 | num_train_epochs=4, 48 | eval_strategy="steps", 49 | eval_steps=1, 50 | logging_strategy="steps", 51 | logging_steps=1, 52 | save_strategy="no", 53 | report_to="none", 54 | disable_tqdm=True, 55 | ) 56 | callbacks = [RichProgressCallback()] 57 | trainer = Trainer( 58 | model=self.dummy_model, 59 | train_dataset=self.dummy_train_dataset, 60 | eval_dataset=self.dummy_val_dataset, 61 | args=training_args, 62 | callbacks=callbacks, 63 | ) 64 | 65 | trainer.train() 66 | trainer.train() 67 | -------------------------------------------------------------------------------- /training/trl/examples/research_projects/stack_llama_2/scripts/README.md: -------------------------------------------------------------------------------- 1 | # DPO pipeline for the creation of StackLlaMa 2: a Stack exchange llama-v2-7b model 2 | 3 | ## Prerequisites 4 | 5 | Install all the dependencies in the `requirements.txt`: 6 | 7 | ``` 8 | $ pip install -U -r requirements.txt 9 | ``` 10 | 11 | Since we will use `accelerate` for training, make sure to run: 12 | ``` 13 | $ accelerate config 14 | ``` 15 | 16 | ## Training 17 | 18 | There were two main steps to the DPO training process: 19 | 1. Supervised fine-tuning of the base llama-v2-7b model to create llama-v2-7b-se: 20 | 21 | ``` 22 | accelerate launch examples/research_projects/stack_llama_2/scripts/sft_llama2.py \ 23 | --output_dir="./sft" \ 24 | --max_steps=500 \ 25 | --logging_steps=10 \ 26 | --save_steps=10 \ 27 | --per_device_train_batch_size=4 \ 28 | --per_device_eval_batch_size=1 \ 29 | --gradient_accumulation_steps=2 \ 30 | --gradient_checkpointing=False \ 31 | --group_by_length=False \ 32 | --learning_rate=1e-4 \ 33 | --lr_scheduler_type="cosine" \ 34 | --warmup_steps=100 \ 35 | --weight_decay=0.05 \ 36 | --optim="paged_adamw_32bit" \ 37 | --bf16=True \ 38 | --remove_unused_columns=False \ 39 | --run_name="sft_llama2" \ 40 | --report_to="wandb" 41 | ``` 42 | 2. 
Run the DPO trainer using the model saved by the previous step: 43 | ``` 44 | accelerate launch examples/research_projects/stack_llama_2/scripts/dpo_llama2.py \ 45 | --model_name_or_path="sft/final_checkpoint" \ 46 | --output_dir="dpo" 47 | ``` 48 | 49 | 50 | ## Merging the adaptors 51 | 52 | To merge the adaptors into the base model we can use the `merge_peft_adapter.py` helper script that comes with TRL: 53 | 54 | ``` 55 | python examples/research_projects/stack_llama/scripts/merge_peft_adapter.py --base_model_name="meta-llama/Llama-2-7b-hf" --adapter_model_name="dpo/final_checkpoint/" --output_name="stack-llama-2" 56 | ``` 57 | 58 | which will also push the model to your HuggingFace hub account. 59 | 60 | ## Running the model 61 | 62 | We can load the DPO-trained LoRA adaptors that were saved by the DPO training step via: 63 | 64 | ```py 65 | import torch 66 | from peft import AutoPeftModelForCausalLM 67 | 68 | model = AutoPeftModelForCausalLM.from_pretrained( 69 | "dpo/final_checkpoint", 70 | low_cpu_mem_usage=True, 71 | torch_dtype=torch.float16, 72 | load_in_4bit=True, 73 | ) 74 | 75 | model.generate(...) 76 | ``` 77 | -------------------------------------------------------------------------------- /training/open-r1/scripts/run_benchmarks.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | from dataclasses import dataclass, field 15 | from typing import List, Optional 16 | 17 | from open_r1.utils.evaluation import SUPPORTED_BENCHMARKS, run_benchmark_jobs 18 | from open_r1.configs import SFTConfig 19 | from trl import ModelConfig, TrlParser 20 | 21 | 22 | @dataclass 23 | class ScriptArguments: 24 | model_id: str = field( 25 | default="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", 26 | metadata={"help": "The Hub model id to push the model to."}, 27 | ) 28 | model_revision: str = field(default="main", metadata={"help": "The Hub model branch to push the model to."}) 29 | trust_remote_code: bool = field(default=False, metadata={"help": "Trust the remote code."}) 30 | benchmarks: List[str] = field( 31 | default_factory=lambda: [], metadata={"help": "The benchmarks to run after training."} 32 | ) 33 | list_benchmarks: bool = field(default=False, metadata={"help": "List all supported benchmarks."}) 34 | system_prompt: Optional[str] = field( 35 | default=None, metadata={"help": "The system prompt to use for the benchmark."} 36 | ) 37 | 38 | 39 | def main(): 40 | parser = TrlParser(ScriptArguments) 41 | args = parser.parse_args_and_config()[0] 42 | if args.list_benchmarks: 43 | print("Supported benchmarks:") 44 | for benchmark in SUPPORTED_BENCHMARKS: 45 | print(f" - {benchmark}") 46 | return 47 | benchmark_args = SFTConfig( 48 | output_dir="", 49 | hub_model_id=args.model_id, 50 | hub_model_revision=args.model_revision, 51 | benchmarks=args.benchmarks, 52 | system_prompt=args.system_prompt, 53 | ) 54 | run_benchmark_jobs( 55 | benchmark_args, 56 | ModelConfig(model_name_or_path="", model_revision="", trust_remote_code=args.trust_remote_code), 57 | ) 58 | 59 | 60 | if __name__ == "__main__": 61 | main() 62 | -------------------------------------------------------------------------------- /lm_eval_files/aime/README.md: -------------------------------------------------------------------------------- 1 | # GSM8k 2 | 3 | ## Paper 4 | Training Verifiers to Solve Math Word Problems 5 | https://arxiv.org/abs/2110.14168 6 | 7 | State-of-the-art language models can match human performance on many tasks, but 8 | they still struggle to robustly perform multi-step mathematical reasoning. To 9 | diagnose the failures of current models and support research, we introduce GSM8K, 10 | a dataset of 8.5K high quality linguistically diverse grade school math word problems. 11 | We find that even the largest transformer models fail to achieve high test performance, 12 | despite the conceptual simplicity of this problem distribution. 13 | 14 | NOTE: See the official implementation of the task: 15 | https://github.com/openai/grade-school-math/blob/master/grade_school_math/calculator.py 16 | for how to make use of the dataset's calculator annotations in your language 17 | model's sample/generation function. 
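For orientation, GSM8K solutions embed calculator annotations of the form `<<expression=result>>`. The sketch below is a hypothetical illustration of resolving them after the fact (the official implementation linked above instead pauses sampling after `=` and feeds the computed value back into the generation); `resolve_annotations` is not part of the harness:

```python
import re

# GSM8K-style calculator annotation: "<<48/2=24>>"
ANNOTATION = re.compile(r"<<([^<>=]+)=([^<>]*)>>")

def resolve_annotations(text: str) -> str:
    """Recompute each <<expr=result>> annotation and strip the markers."""
    def _compute(match: re.Match) -> str:
        try:
            # Dataset expressions are simple arithmetic; eval is for illustration only.
            return str(eval(match.group(1)))
        except Exception:
            # Fall back to the result recorded in the annotation itself.
            return match.group(2)
    return ANNOTATION.sub(_compute, text)

print(resolve_annotations("Half of 48 is <<48/2=24>> clips."))  # -> "Half of 48 is 24.0 clips."
```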
18 | 19 | Homepage: https://github.com/openai/grade-school-math 20 | 21 | 22 | ## Citation 23 | ``` 24 | @misc{cobbe2021training, 25 | title={Training Verifiers to Solve Math Word Problems}, 26 | author={Karl Cobbe and Vineet Kosaraju and Mohammad Bavarian and Jacob Hilton and Reiichiro Nakano and Christopher Hesse and John Schulman}, 27 | year={2021}, 28 | eprint={2110.14168}, 29 | archivePrefix={arXiv}, 30 | primaryClass={cs.LG} 31 | } 32 | ``` 33 | 34 | ### Groups and Tasks 35 | 36 | #### Groups 37 | 38 | - `math_word_problems` 39 | - `chain_of_thought` 40 | - `self_consistency` 41 | 42 | #### Tasks 43 | 44 | - `gsm8k_yaml` 45 | - `gsm8k_cot`: GSM8K with Chain-of-Thought 46 | - `gsm8k_cot_self_consistency`: GSM8K with Chain-of-Thought and Self-Consistency 47 | - `gsm8k_cot_llama`: GSM8K with prompt formatting modified to conform to the evaluation settings described by Meta here: https://huggingface.co/datasets/meta-llama/Meta-Llama-3.1-8B-Instruct-evals/viewer/Meta-Llama-3.1-8B-Instruct-evals__gsm8k__details?row=0 48 | - Use this task with --fewshot_as_multiturn and --apply_chat_template to replicate Meta's reported performance. 49 | 50 | 51 | ### Checklist 52 | 53 | - [x] Is in Eval-harness v1.0? 54 | - [ ] Has been checked for regression from v1.0? 55 | - [ ] Has been checked for equivalence with original paper methodology? 56 | - [ ] "Main" checked variant clearly denoted? 57 | 58 | ### Variant Wishlist 59 | 60 | - [ ] Variant with Calculator (see https://github.com/openai/grade-school-math/blob/master/grade_school_math/calculator.py for example implementation) 61 | - [ ] Using Verifiers 62 | - [ ] Majority voting "without CoT" 63 | -------------------------------------------------------------------------------- /lm_eval_files/openai/gpqa_diamond_openai.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: Idavidrein/gpqa 2 | tag: gpqa 3 | dataset_name: gpqa_diamond 4 | task: gpqa_diamond_openai 5 | output_type: generate_until 6 | process_docs: !function utils.process_docs 7 | process_results: !function utils.process_results 8 | training_split: train 9 | # Because huggingface dataset only has train split 10 | validation_split: train 11 | test_split: null 12 | # https://github.com/openai/simple-evals/blob/c0dba4c7bfbc17f786aec7bd7c3585a36ad81f23/common.py#L12 13 | # doc_to_text: "Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering but always provide a final answer within the word limit.\n\n{{Question}}\n\nA) {{choice1}}\nB) {{choice2}}\nC) {{choice3}}\nD) {{choice4}}" 14 | # doc_to_text: "Answer the following multiple-choice question. Your response must adhere to these rules: 15 | # 1. Think step by step to arrive at the correct answer. 16 | # 2. Avoid repeating reasoning or steps already stated. 17 | # 3. Ensure your response is within the word limit. 18 | # 4. Conclude with the final answer in the format: 'Answer: $LETTER' (without quotes), where LETTER is one of ABCD.
19 | 20 | # {{Question}} 21 | 22 | # A) {{choice1}} 23 | # B) {{choice2}} 24 | # C) {{choice3}} 25 | # D) {{choice4}}" 26 | # doc_to_text: "{{Question}}\nAnswer Choices: (A) {{choice1}}\n(B) {{choice2}}\n(C) {{choice3}}\n(D) {{choice4}}\nOutput your final answer in boxes, such as \\boxed{A}" 27 | # Original prompt from the QwQ team: 28 | # doc_to_text: "{{Question}}\nAnswer Choices: (A) {{choice1}} (B) {{choice2}} (C) {{choice3}} (D) {{choice4}}\nOutput your final answer in boxes, such as \\boxed{A}." 29 | # doc_to_text: "{{Question}}\n\nA) {{choice1}}\nB) {{choice2}}\nC) {{choice3}}\nD) {{choice4}}" 30 | doc_to_text: !function utils.doc_to_text_gpqa 31 | doc_to_target: answer 32 | num_fewshot: 0 33 | generation_kwargs: 34 | until: [] 35 | do_sample: false 36 | temperature: 0 # Do 0.5? https://github.com/openai/simple-evals/blob/c0dba4c7bfbc17f786aec7bd7c3585a36ad81f23/sampler/chat_completion_sampler.py#L26 37 | max_gen_toks: 1024 # https://github.com/openai/simple-evals/blob/c0dba4c7bfbc17f786aec7bd7c3585a36ad81f23/sampler/chat_completion_sampler.py#L27 38 | metric_list: 39 | - metric: exact_match 40 | aggregation: mean 41 | higher_is_better: true 42 | - metric: extracted_answers 43 | aggregation: bypass 44 | higher_is_better: true 45 | metadata: 46 | version: 1.0 47 | -------------------------------------------------------------------------------- /lm_eval_files/openai/gpqa_diamond_openai_maj64_cov64.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: Idavidrein/gpqa 2 | tag: gpqa 3 | dataset_name: gpqa_diamond 4 | task: gpqa_diamond_openai_maj64_cov64_train 5 | output_type: generate_until 6 | process_docs: !function utils.process_docs 7 | process_results: !function utils.process_results 8 | training_split: train 9 | # Because huggingface dataset only has train split 10 | validation_split: train 11 | test_split: null 12 | # https://github.com/openai/simple-evals/blob/c0dba4c7bfbc17f786aec7bd7c3585a36ad81f23/common.py#L12 13 | doc_to_text: "Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering but always provide a final answer within the word limit.\n\n{{Question}}\n\nA) {{choice1}}\nB) {{choice2}}\nC) {{choice3}}\nD) {{choice4}}" 14 | doc_to_target: answer 15 | num_fewshot: 0 16 | generation_kwargs: 17 | until: [] 18 | do_sample: false 19 | temperature: 0.5 # Do 0.5? https://github.com/openai/simple-evals/blob/c0dba4c7bfbc17f786aec7bd7c3585a36ad81f23/sampler/chat_completion_sampler.py#L26 20 | max_gen_toks: 1024 # https://github.com/openai/simple-evals/blob/c0dba4c7bfbc17f786aec7bd7c3585a36ad81f23/sampler/chat_completion_sampler.py#L27 21 | metric_list: 22 | - metric: exact_match 23 | aggregation: mean 24 | higher_is_better: true 25 | repeats: 64 26 | filter_list: 27 | - name: "score-first" # pick only the first response, and report metrics on that 28 | filter: 29 | - function: "take_first" 30 | - name: "maj@64" 31 | filter: 32 | - function: "majority_vote" 33 | - function: "take_first" 34 | - name: "maj@16" # get Maj@16, via selecting the first 16 responses. Using a better estimator would be optimal. 35 | filter: 36 | - function: "take_first_k" 37 | k: 16 38 | - function: "majority_vote" 39 | - function: "take_first" 40 | - name: "maj@8" # get Maj@8, via selecting the first 8 responses. Using a better estimator would be optimal.
41 | filter: 42 | - function: "take_first_k" 43 | k: 8 44 | - function: "majority_vote" 45 | - function: "take_first" 46 | - name: "cov@64" # get coverage@64 , via allowing all 64 samples and then picking only the correct one in the evaluator. 47 | filter: 48 | - function: "take_first_k" 49 | k: 64 50 | - name: "cov@16" 51 | filter: 52 | - function: "take_first_k" 53 | k: 16 54 | - name: "cov@8" 55 | filter: 56 | - function: "take_first_k" 57 | k: 8 58 | metadata: 59 | version: 1.0 60 | -------------------------------------------------------------------------------- /training/trl/trl/models/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from typing import TYPE_CHECKING 16 | 17 | from ..import_utils import OptionalDependencyNotAvailable, _LazyModule, is_diffusers_available 18 | 19 | 20 | _import_structure = { 21 | "modeling_base": ["GeometricMixtureWrapper", "PreTrainedModelWrapper", "create_reference_model"], 22 | "modeling_value_head": ["AutoModelForCausalLMWithValueHead", "AutoModelForSeq2SeqLMWithValueHead"], 23 | "utils": [ 24 | "SUPPORTED_ARCHITECTURES", 25 | "prepare_deepspeed", 26 | "prepare_fsdp", 27 | "setup_chat_format", 28 | "unwrap_model_for_generation", 29 | ], 30 | } 31 | 32 | try: 33 | if not is_diffusers_available(): 34 | raise OptionalDependencyNotAvailable() 35 | except OptionalDependencyNotAvailable: 36 | pass 37 | else: 38 | _import_structure["modeling_sd_base"] = [ 39 | "DDPOPipelineOutput", 40 | "DDPOSchedulerOutput", 41 | "DDPOStableDiffusionPipeline", 42 | "DefaultDDPOStableDiffusionPipeline", 43 | ] 44 | 45 | if TYPE_CHECKING: 46 | from .modeling_base import GeometricMixtureWrapper, PreTrainedModelWrapper, create_reference_model 47 | from .modeling_value_head import AutoModelForCausalLMWithValueHead, AutoModelForSeq2SeqLMWithValueHead 48 | from .utils import ( 49 | SUPPORTED_ARCHITECTURES, 50 | prepare_deepspeed, 51 | prepare_fsdp, 52 | setup_chat_format, 53 | unwrap_model_for_generation, 54 | ) 55 | 56 | try: 57 | if not is_diffusers_available(): 58 | raise OptionalDependencyNotAvailable() 59 | except OptionalDependencyNotAvailable: 60 | pass 61 | else: 62 | from .modeling_sd_base import ( 63 | DDPOPipelineOutput, 64 | DDPOSchedulerOutput, 65 | DDPOStableDiffusionPipeline, 66 | DefaultDDPOStableDiffusionPipeline, 67 | ) 68 | else: 69 | import sys 70 | 71 | sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) 72 | -------------------------------------------------------------------------------- /training/trl/docs/source/judges.md: -------------------------------------------------------------------------------- 1 | # Judges 2 | 3 | 4 | 5 | TRL Judges is an experimental API which is subject to change at any time. 6 | 7 | 8 | 9 | TRL provides judges to easily compare two completions. 
10 | 11 | Make sure to have installed the required dependencies by running: 12 | 13 | ```bash 14 | pip install trl[judges] 15 | ``` 16 | 17 | ## Using the provided judges 18 | 19 | TRL provides several judges out of the box. For example, you can use the `HfPairwiseJudge` to compare two completions using a pre-trained model from the Hugging Face model hub: 20 | 21 | ```python 22 | from trl import HfPairwiseJudge 23 | 24 | judge = HfPairwiseJudge() 25 | judge.judge( 26 | prompts=["What is the capital of France?", "What is the biggest planet in the solar system?"], 27 | completions=[["Paris", "Lyon"], ["Saturn", "Jupiter"]], 28 | ) # Outputs: [0, 1] 29 | ``` 30 | 31 | ## Define your own judge 32 | 33 | To define your own judge, we provide several base classes that you can subclass. For rank-based judges, you need to subclass [`BaseRankJudge`] and implement the [`BaseRankJudge.judge`] method. For pairwise judges, you need to subclass [`BasePairwiseJudge`] and implement the [`BasePairwiseJudge.judge`] method. If you want to define a judge that doesn't fit into these categories, you need to subclass [`BaseJudge`] and implement the [`BaseJudge.judge`] method. 34 | 35 | As an example, let's define a pairwise judge that prefers shorter completions: 36 | 37 | ```python 38 | from trl import BasePairwiseJudge 39 | 40 | class PrefersShorterJudge(BasePairwiseJudge): 41 | def judge(self, prompts, completions, shuffle_order=False): 42 | return [0 if len(completion[0]) < len(completion[1]) else 1 for completion in completions] 43 | ``` 44 | 45 | You can then use this judge as follows: 46 | 47 | ```python 48 | judge = PrefersShorterJudge() 49 | judge.judge( 50 | prompts=["What is the capital of France?", "What is the biggest planet in the solar system?"], 51 | completions=[["Paris", "The capital of France is Paris."], ["Jupiter is the biggest planet in the solar system.", "Jupiter"]], 52 | ) # Outputs: [0, 1] 53 | ``` 54 | 55 | ## Provided judges 56 | 57 | ### PairRMJudge 58 | 59 | [[autodoc]] PairRMJudge 60 | 61 | ### HfPairwiseJudge 62 | 63 | [[autodoc]] HfPairwiseJudge 64 | 65 | ### OpenAIPairwiseJudge 66 | 67 | [[autodoc]] OpenAIPairwiseJudge 68 | 69 | ### AllTrueJudge 70 | 71 | [[autodoc]] AllTrueJudge 72 | 73 | ## Base classes 74 | 75 | ### BaseJudge 76 | 77 | [[autodoc]] BaseJudge 78 | 79 | ### BaseBinaryJudge 80 | 81 | [[autodoc]] BaseBinaryJudge 82 | 83 | ### BaseRankJudge 84 | 85 | [[autodoc]] BaseRankJudge 86 | 87 | ### BasePairwiseJudge 88 | 89 | [[autodoc]] BasePairwiseJudge 90 | -------------------------------------------------------------------------------- /training/trl/docs/source/use_model.md: -------------------------------------------------------------------------------- 1 | # Use model after training 2 | 3 | Once you have trained a model using either the SFTTrainer, PPOTrainer, or DPOTrainer, you will have a fine-tuned model that can be used for text generation. In this section, we'll walk through the process of loading the fine-tuned model and generating text. If you need to run an inference server with the trained model, you can explore libraries such as [`text-generation-inference`](https://github.com/huggingface/text-generation-inference). 4 | 5 | ## Load and Generate 6 | 7 | If you have fine-tuned a model fully, meaning without the use of PEFT, you can simply load it like any other language model in transformers. For example,
the value head that was trained during PPO training is no longer needed, and if you load the model with the original transformer class it will be ignored: 8 | 9 | ```python 10 | from transformers import AutoTokenizer, AutoModelForCausalLM 11 | 12 | model_name_or_path = "kashif/stack-llama-2" # path/to/your/model/or/name/on/hub 13 | device = "cpu" # or "cuda" if you have a GPU 14 | 15 | model = AutoModelForCausalLM.from_pretrained(model_name_or_path).to(device) 16 | tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) 17 | 18 | inputs = tokenizer.encode("This movie was really", return_tensors="pt").to(device) 19 | outputs = model.generate(inputs) 20 | print(tokenizer.decode(outputs[0])) 21 | ``` 22 | 23 | Alternatively, you can also use the pipeline: 24 | 25 | ```python 26 | from transformers import pipeline 27 | 28 | model_name_or_path = "kashif/stack-llama-2" # path/to/your/model/or/name/on/hub 29 | pipe = pipeline("text-generation", model=model_name_or_path) 30 | print(pipe("This movie was really")[0]["generated_text"]) 31 | ``` 32 | 33 | ## Use PEFT Adapters 34 | 35 | ```python 36 | from peft import PeftConfig, PeftModel 37 | from transformers import AutoModelForCausalLM, AutoTokenizer 38 | 39 | base_model_name = "kashif/stack-llama-2" # path/to/your/model/or/name/on/hub 40 | adapter_model_name = "path/to/my/adapter" 41 | 42 | model = AutoModelForCausalLM.from_pretrained(base_model_name) 43 | model = PeftModel.from_pretrained(model, adapter_model_name) 44 | 45 | tokenizer = AutoTokenizer.from_pretrained(base_model_name) 46 | ``` 47 | 48 | You can also merge the adapters into the base model so you can use the model like a normal transformers model; however, the checkpoint will be significantly bigger: 49 | 50 | ```python 51 | model = AutoModelForCausalLM.from_pretrained(base_model_name) 52 | model = PeftModel.from_pretrained(model, adapter_model_name) 53 | 54 | model = model.merge_and_unload() 55 | model.save_pretrained("merged_adapters") 56 | ``` 57 | 58 | Once you have loaded the model and either merged the adapters or kept them separate on top, you can run generation as with a normal model, as outlined above. 59 | -------------------------------------------------------------------------------- /modes/big_model_only.py: -------------------------------------------------------------------------------- 1 | # modes/big_model_only.py 2 | 3 | from pprint import pprint 4 | import os 5 | import datetime 6 | 7 | import time 8 | import uuid 9 | 10 | benchfile = "specR_big.csv" 11 | 12 | def run_bigmodel_flow( 13 | question, 14 | big_model, 15 | big_model_port, 16 | generate_text_vllm, 17 | terminating_string: str, 18 | max_tokens=1024, 19 | temperature=0.6, 20 | sequential_scale=0, 21 | test_logging: bool = False, 22 | token_counter=None, 23 | ): 24 | """ 25 | A baseline 'placeholder' flow: we just send a single request to the 26 | *big_model* and return it as a final answer, plus usage data. 27 | """ 28 | usage_data = [] 29 | 30 | model_think_prefix = "<think>\n" 31 | model_think_suffix = "</think>" 32 | 33 | start_time = time.time() 34 | def _clean(t): # strip special markers 35 | for s in ("<|User|>", "<|Assistant|>", "<|begin▁of▁sentence|>", 36 | "<|end▁of▁sentence|>", "<think>"): 37 | t = t.replace(s, "") 38 | return t 39 | 40 | sequential_iter = 0 # sequential-iter support removed; always run the single-pass flow 41 | if sequential_iter == 0: 42 | big_hint = "" 43 | term_str = "\n Put your final answer within \\boxed{}."
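# Annotation for clarity: the prompt assembled below follows the DeepSeek-R1-Distill
# chat format (<|begin▁of▁sentence|><|User|>{question}<|Assistant|>); the served
# model is then expected to open its own <think> block before emitting the final
# \boxed{} answer.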
44 | cur = (f"<|begin▁of▁sentence|><|User|>{_clean(question)}\n" 45 | f"{big_hint}{term_str}<|Assistant|>\n\n") 46 | prompt = cur 47 | 48 | resp_json, latency = generate_text_vllm( 49 | prompt, 50 | port=big_model_port, 51 | temperature=temperature, 52 | max_tokens=8192, 53 | model=big_model 54 | ) 55 | final_reply = resp_json["choices"][0]["text"] 56 | # final_reply = f"{prompt}{final_reply}" 57 | final_reply = f"{final_reply}" 58 | total_time = time.time() - start_time 59 | total_tokens = token_counter(final_reply) if token_counter else len(final_reply.split()) 60 | time_per_tok = total_time / total_tokens if total_tokens > 0 else 0 61 | uuid_ = str(uuid.uuid4()) 62 | current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") 63 | try: 64 | if not os.path.exists(benchfile): 65 | with open(benchfile, "w") as f: 66 | f.write( 67 | "uuid,big_model,sequential_scale,total_tokens," 68 | "total_time,time_per_tok,datetime\n" 69 | ) 70 | with open(benchfile, "a") as f: 71 | f.write( 72 | f"{uuid_},{big_model},{sequential_scale}," 73 | f"{total_tokens},{total_time},{time_per_tok},{current_time}\n" 74 | ) 75 | except Exception as e: 76 | print(f"Error writing to file: {e}") 77 | print("Please check if the file path is correct and if you have write permissions.") 78 | pass 79 | return final_reply, usage_data 80 | -------------------------------------------------------------------------------- /training/trl/docs/source/distributing_training.md: -------------------------------------------------------------------------------- 1 | # Distributing Training 2 | 3 | 4 | Section under construction. Feel free to contribute! 5 | 6 | 7 | ## Multi-GPU Training with TRL 8 | 9 | The trainers in TRL use [🤗 Accelerate](https://github.com/huggingface/accelerate) to enable distributed training across multiple GPUs or nodes. To do so, first create an [🤗 Accelerate](https://github.com/huggingface/accelerate) config file by running 10 | 11 | ```bash 12 | accelerate config 13 | ``` 14 | 15 | and answering the questions according to your multi-GPU / multi-node setup. You can then launch distributed training by running: 16 | 17 | ```bash 18 | accelerate launch train.py 19 | ``` 20 | 21 | We also provide config files in the [examples folder](https://github.com/huggingface/trl/tree/main/examples/accelerate_configs) that can be used as templates. To use these templates, simply pass the path to the config file when launching a job, e.g.: 22 | 23 | ```shell 24 | accelerate launch --config_file examples/accelerate_configs/multi_gpu.yaml train.py 25 | ``` 26 | 27 | This automatically distributes the workload across all available GPUs. 28 | 29 | Under the hood, [🤗 Accelerate](https://github.com/huggingface/accelerate) creates one model per GPU. Each process: 30 | - Processes its own batch of data 31 | - Computes the loss and gradients for that batch 32 | - Shares gradient updates across all GPUs 33 | 34 | ![](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/multi_gpu.png) 35 | 36 | The effective batch size is calculated as: 37 | 38 | $$ 39 | \text{Batch Size} = \text{per\_device\_train\_batch\_size} \times \text{num\_devices} \times \text{gradient\_accumulation\_steps} 40 | $$ 41 | 42 | To maintain a consistent batch size when scaling to multiple GPUs, make sure to update `per_device_train_batch_size` and `gradient_accumulation_steps` accordingly. 
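As a quick sanity check of this formula, the relationship can be evaluated directly (a throwaway illustration, not a TRL API):

```python
def effective_batch_size(per_device_train_batch_size: int,
                         num_devices: int,
                         gradient_accumulation_steps: int) -> int:
    # Effective batch size = per-device batch size x number of devices x accumulation steps
    return per_device_train_batch_size * num_devices * gradient_accumulation_steps

# The three configurations in the table below all reach an effective batch size of 32:
assert effective_batch_size(32, 1, 1) == 32
assert effective_batch_size(4, 1, 8) == 32
assert effective_batch_size(4, 8, 1) == 32
```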
43 | 44 | For example, these configurations are equivalent and should yield the same results: 45 | 46 | | Number of GPUs | Per device batch size | Gradient accumulation steps | Comments | 47 | | --- | --- | --- | --- | 48 | | 1 | 32 | 1 | Possibly high memory usage, but faster training | 49 | | 1 | 4 | 8 | Lower memory usage, slower training | 50 | | 8 | 4 | 1 | Multi-GPU to get the best of both worlds | 51 | 52 | 53 | 54 | Having one model per GPU can lead to high memory usage, which may not be feasible for large models or low-memory GPUs. In such cases, you can leverage [DeepSpeed](https://github.com/deepspeedai/DeepSpeed), which provides optimizations like model sharding, Zero Redundancy Optimizer, mixed precision training, and offloading to CPU or NVMe. Check out our [DeepSpeed Integration](deepspeed_integration.md) guide for more details. 55 | 56 | 57 | 58 | ## Multi-Node Training 59 | 60 | We're working on a guide for multi-node training. Stay tuned! 🚀 -------------------------------------------------------------------------------- /training/trl/examples/research_projects/stack_llama/scripts/merge_peft_adapter.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from dataclasses import dataclass, field 16 | from typing import Optional 17 | 18 | import torch 19 | from peft import PeftConfig, PeftModel 20 | from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer, HfArgumentParser 21 | 22 | 23 | @dataclass 24 | class ScriptArguments: 25 | """ 26 | The input names representing the Adapter and Base model fine-tuned with PEFT, and the output name representing the 27 | merged model.
28 | """ 29 | 30 | adapter_model_name: Optional[str] = field(default=None, metadata={"help": "the adapter name"}) 31 | base_model_name: Optional[str] = field(default=None, metadata={"help": "the base model name"}) 32 | output_name: Optional[str] = field(default=None, metadata={"help": "the merged model name"}) 33 | 34 | 35 | parser = HfArgumentParser(ScriptArguments) 36 | script_args = parser.parse_args_into_dataclasses()[0] 37 | assert script_args.adapter_model_name is not None, "please provide the name of the Adapter you would like to merge" 38 | assert script_args.base_model_name is not None, "please provide the name of the Base model" 39 | assert script_args.output_name is not None, "please provide the output name of the merged model" 40 | 41 | peft_config = PeftConfig.from_pretrained(script_args.adapter_model_name) 42 | if peft_config.task_type == "SEQ_CLS": 43 | # The sequence classification task is used for the reward model in PPO 44 | model = AutoModelForSequenceClassification.from_pretrained( 45 | script_args.base_model_name, num_labels=1, torch_dtype=torch.bfloat16 46 | ) 47 | else: 48 | model = AutoModelForCausalLM.from_pretrained( 49 | script_args.base_model_name, return_dict=True, torch_dtype=torch.bfloat16 50 | ) 51 | 52 | tokenizer = AutoTokenizer.from_pretrained(script_args.base_model_name) 53 | 54 | # Load the PEFT model 55 | model = PeftModel.from_pretrained(model, script_args.adapter_model_name) 56 | model.eval() 57 | 58 | model = model.merge_and_unload() 59 | 60 | model.save_pretrained(f"{script_args.output_name}") 61 | tokenizer.save_pretrained(f"{script_args.output_name}") 62 | model.push_to_hub(f"{script_args.output_name}", use_temp_dir=False) 63 | -------------------------------------------------------------------------------- /modes/small_model_only.py: -------------------------------------------------------------------------------- 1 | # modes/small_model_only.py 2 | 3 | from pprint import pprint 4 | import os 5 | import datetime 6 | import time 7 | import uuid 8 | 9 | benchfile = "specR_small.csv" 10 | 11 | def run_smallmodel_flow( 12 | question, 13 | small_model, 14 | small_model_port, 15 | generate_text_vllm, 16 | terminating_string: str, 17 | max_tokens=1024, 18 | temperature=0.6, 19 | test_logging: bool = False, 20 | sequential_scale=0, 21 | token_counter=None 22 | ): 23 | """ 24 | A baseline 'placeholder' flow: we just send a single request to the 25 | *small_model* and return it as a final answer, plus usage data. 26 | """ 27 | usage_data = [] 28 | 29 | model_think_prefix = "<think>\n" 30 | model_think_suffix = "</think>" 31 | 32 | bigmodel_str = "You always use <bigmodel> ... </bigmodel> to mark parts of the reasoning process that are important." 33 | start_time = time.time() 34 | def _clean(t): 35 | for s in ("<|User|>", "<|Assistant|>", "<|begin▁of▁sentence|>", 36 | "<|end▁of▁sentence|>", "<think>"): 37 | t = t.replace(s, "") 38 | return t 39 | 40 | sequential_iter = 0 # sequential-iter support removed; always run the single-pass flow 41 | if sequential_iter == 0: 42 | big_hint = "" 43 | term_str = "\n Put your final answer within \\boxed{}."
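# Annotation for clarity: this mirrors run_bigmodel_flow in modes/big_model_only.py;
# the only differences are that the request is routed to the small model's vLLM port
# and that usage is logged to specR_small.csv instead of specR_big.csv.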
44 | cur = (f"<|begin▁of▁sentence|><|User|>{_clean(question)}\n" 45 | f"{big_hint}{term_str}<|Assistant|>\n\n") 46 | prompt = cur 47 | 48 | # prompt[:len(prompt)//2], 49 | resp_json, latency = generate_text_vllm( 50 | prompt, 51 | port=small_model_port, 52 | temperature=temperature, 53 | max_tokens=8192, 54 | model=small_model 55 | ) 56 | final_reply = resp_json["choices"][0]["text"] 57 | final_reply_small = f"{final_reply}" 58 | total_time = time.time() - start_time 59 | total_tokens = token_counter(final_reply_small) if token_counter else len(final_reply_small.split()) 60 | time_per_tok = total_time / total_tokens if total_tokens > 0 else 0 61 | uuid_ = str(uuid.uuid4()) 62 | current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") 63 | 64 | try: 65 | if not os.path.exists(benchfile): 66 | with open(benchfile, "w") as f: 67 | f.write( 68 | "uuid,small_model,sequential_scale,total_tokens," 69 | "total_time,time_per_tok,datetime\n" 70 | ) 71 | with open(benchfile, "a") as f: 72 | f.write( 73 | f"{uuid_},{small_model},{sequential_scale}," 74 | f"{total_tokens},{total_time},{time_per_tok},{current_time}\n" 75 | ) 76 | except Exception as e: 77 | print(f"Error writing to file: {e}") 78 | print("Please check if the file path is correct and if you have write permissions.") 79 | pass 80 | return final_reply_small, usage_data 81 | -------------------------------------------------------------------------------- /training/trl/.github/ISSUE_TEMPLATE/bug-report.yml: -------------------------------------------------------------------------------- 1 | name: "\U0001F41B Bug Report" 2 | description: Submit a bug report to help us improve TRL 3 | labels: [ "bug" ] 4 | body: 5 | - type: markdown 6 | attributes: 7 | value: | 8 | Thanks for taking the time to fill out this bug report! 🤗 9 | 10 | 🚩 If it is your first time submitting, be sure to check our [bug report guidelines](https://github.com/huggingface/trl/blob/main/CONTRIBUTING.md#did-you-find-a-bug) 11 | 12 | - type: textarea 13 | id: reproduction 14 | validations: 15 | required: true 16 | attributes: 17 | label: Reproduction 18 | description: | 19 | Please provide a code sample that reproduces the problem you ran into. It can be a Colab link or just a code snippet. 20 | If you have code snippets, error messages, stack traces please provide them here as well. 21 | Important! Use code tags to correctly format your code. See https://help.github.com/en/github/writing-on-github/creating-and-highlighting-code-blocks#syntax-highlighting 22 | Do not use screenshots, as they are hard to read and (more importantly) don't allow others to copy-and-paste your code. 23 | 24 | value: | 25 | ```python 26 | from trl import ... 27 | 28 | ``` 29 | 30 | outputs: 31 | 32 | ``` 33 | Traceback (most recent call last): 34 | File "example.py", line 42, in 35 | ... 36 | ``` 37 | 38 | - type: textarea 39 | id: system-info 40 | attributes: 41 | label: System Info 42 | description: | 43 | Please provide information about your system: platform, Python version, PyTorch version, Transformers version, devices, TRL version, ... 44 | You can get this information by running `trl env` in your terminal. 45 | 46 | placeholder: Copy-paste the output of `trl env` 47 | validations: 48 | required: true 49 | 50 | - type: checkboxes 51 | id: terms 52 | attributes: 53 | label: Checklist 54 | description: | 55 | Before submitting, please confirm that you've completed each of the following. 56 | If an item doesn't apply to your issue, check it anyway to show you've reviewed it. 
57 | options: 58 | - label: "I have checked that my issue isn't already filed (see [open issues](https://github.com/huggingface/trl/issues?q=is%3Aissue))" 59 | required: true 60 | - label: "I have included my system information" 61 | required: true 62 | - label: "Any code provided is minimal, complete, and reproducible ([more on MREs](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/creating-and-highlighting-code-blocks))" 63 | required: true 64 | - label: "Any code provided is properly formatted in code blocks, (no screenshot, [more on code blocks](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/creating-and-highlighting-code-blocks))" 65 | required: true 66 | - label: "Any traceback provided is complete" 67 | required: true 68 | -------------------------------------------------------------------------------- /training/trl/tests/test_modeling_geometric_mixture_wrapper.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import unittest 16 | 17 | import torch 18 | from transformers import AutoModelForCausalLM, GenerationConfig 19 | 20 | from trl.models.modeling_base import GeometricMixtureWrapper, create_reference_model 21 | 22 | 23 | class TestGeometricMixtureWrapper(unittest.TestCase): 24 | def setUp(self): 25 | model_id = "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5" 26 | self.model = AutoModelForCausalLM.from_pretrained(model_id) 27 | self.ref_model = create_reference_model(self.model) 28 | self.generation_config = GenerationConfig.from_pretrained(model_id) 29 | self.mixture_coef = 0.5 30 | self.wrapper = GeometricMixtureWrapper( 31 | self.model, self.ref_model, self.generation_config, mixture_coef=self.mixture_coef 32 | ) 33 | 34 | def test_forward(self): 35 | input_ids = torch.tensor([[1, 2, 3, 4, 5]]) 36 | attention_mask = torch.ones_like(input_ids) 37 | 38 | output = self.wrapper(input_ids=input_ids, attention_mask=attention_mask) 39 | 40 | self.assertIsNotNone(output) 41 | self.assertTrue(hasattr(output, "logits")) 42 | self.assertEqual(output.logits.shape, (1, 5, self.model.config.vocab_size)) 43 | 44 | def test_mixture_coefficient(self): 45 | input_ids = torch.tensor([[1, 2, 3, 4, 5]]) 46 | attention_mask = torch.ones_like(input_ids) 47 | 48 | with torch.no_grad(): 49 | model_output = self.model(input_ids=input_ids, attention_mask=attention_mask) 50 | ref_model_output = self.ref_model(input_ids=input_ids, attention_mask=attention_mask) 51 | wrapper_output = self.wrapper(input_ids=input_ids, attention_mask=attention_mask) 52 | 53 | expected_logits = torch.nn.functional.log_softmax( 54 | self.mixture_coef * ref_model_output.logits + (1 - self.mixture_coef) * model_output.logits, dim=-1 55 | ) 56 | 57 | self.assertTrue(torch.allclose(wrapper_output.logits, expected_logits, atol=1e-5)) 58 | 59 | def 
test_prepare_inputs_for_generation(self): 60 | input_ids = torch.tensor([[1, 2, 3, 4, 5]]) 61 | attention_mask = torch.ones_like(input_ids) 62 | 63 | inputs = self.wrapper.prepare_inputs_for_generation(input_ids, attention_mask=attention_mask, use_cache=True) 64 | 65 | self.assertIn("input_ids", inputs) 66 | self.assertIn("attention_mask", inputs) 67 | self.assertFalse(inputs.get("use_cache", False)) 68 | -------------------------------------------------------------------------------- /training/trl/docs/source/sentiment_tuning.md: -------------------------------------------------------------------------------- 1 | # Sentiment Tuning Examples 2 | 3 | The notebooks and scripts in these examples show how to fine-tune a model with a sentiment classifier (such as `lvwerra/distilbert-imdb`). 4 | 5 | Here's an overview of the notebooks and scripts in the [trl repository](https://github.com/huggingface/trl/tree/main/examples): 6 | 7 | 8 | 9 | | File | Description | 10 | |------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------| 11 | | [`examples/scripts/ppo.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/ppo.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/trl/blob/main/examples/sentiment/notebooks/gpt2-sentiment.ipynb) | This script shows how to use the `PPOTrainer` to fine-tune a sentiment analysis model using the IMDB dataset | 12 | | [`examples/notebooks/gpt2-sentiment.ipynb`](https://github.com/huggingface/trl/tree/main/examples/notebooks/gpt2-sentiment.ipynb) | This notebook demonstrates how to reproduce the GPT2 IMDB sentiment tuning example in a Jupyter notebook. | 13 | | [`examples/notebooks/gpt2-control.ipynb`](https://github.com/huggingface/trl/tree/main/examples/notebooks/gpt2-control.ipynb) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/trl/blob/main/examples/sentiment/notebooks/gpt2-sentiment-control.ipynb) | This notebook demonstrates how to reproduce the GPT2 sentiment control example in a Jupyter notebook. | 14 | 15 | 16 | 17 | ## Usage 18 | 19 | ```bash 20 | # 1. run directly 21 | python examples/scripts/ppo.py 22 | # 2. run via `accelerate` (recommended), enabling more features (e.g., multiple GPUs, deepspeed) 23 | accelerate config # will prompt you to define the training configuration 24 | accelerate launch examples/scripts/ppo.py # launches training 25 | # 3. get help text and documentation 26 | python examples/scripts/ppo.py --help 27 | # 4. configure logging with wandb and, say, mini_batch_size=1 and gradient_accumulation_steps=16 28 | python examples/scripts/ppo.py --log_with wandb --mini_batch_size 1 --gradient_accumulation_steps 16 29 | ``` 30 | 31 | Note: if you don't want to log with `wandb`, remove `log_with="wandb"` in the scripts/notebooks. You can also replace it with your favourite experiment tracker that's [supported by `accelerate`](https://huggingface.co/docs/accelerate/usage_guides/tracking). 32 | 33 | 34 | ## A few notes on multi-GPU 35 | 36 | To run in a multi-GPU setup with DDP (Distributed Data Parallel), change the `device_map` value to `device_map={"": Accelerator().process_index}` and make sure to run your script with `accelerate launch yourscript.py`.
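A minimal sketch of that change (the model id below is a placeholder; any causal LM from the Hub works the same way):

```python
from accelerate import Accelerator
from transformers import AutoModelForCausalLM

# With DDP, each process holds a full model replica pinned to its own GPU, so the
# device map assigns the entire model (the "" prefix) to this process's device.
model = AutoModelForCausalLM.from_pretrained(
    "lvwerra/gpt2-imdb",  # placeholder model id
    device_map={"": Accelerator().process_index},
)
```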
If you want to apply naive pipeline parallelism, you can use `device_map="auto"`. -------------------------------------------------------------------------------- /training/trl/docs/source/best_of_n.md: -------------------------------------------------------------------------------- 1 | # Best of N sampling: Alternative ways to get better model output without RL-based fine-tuning 2 | 3 | Within the extras module is the `best-of-n` sampler class that serves as an alternative method of generating better model output. 4 | For a comparison with RL-based fine-tuning, see the comparison example in the `examples` directory. 5 | 6 | ## Usage 7 | 8 | To get started quickly, create an instance of the class with a model, a length sampler, a tokenizer, and a callable that serves as a proxy reward pipeline and outputs reward scores for input queries: 9 | 10 | ```python 11 | 12 | from transformers import pipeline, AutoTokenizer 13 | from trl import AutoModelForCausalLMWithValueHead 14 | from trl.core import LengthSampler 15 | from trl.extras import BestOfNSampler 16 | 17 | model = AutoModelForCausalLMWithValueHead.from_pretrained(model_name) 18 | reward_pipe = pipeline("sentiment-analysis", model=reward_model, device=device) 19 | tokenizer = AutoTokenizer.from_pretrained(model_name) 20 | tokenizer.pad_token = tokenizer.eos_token 21 | 22 | 23 | # callable that takes a list of raw text and returns a list of corresponding reward scores 24 | def queries_to_scores(list_of_strings): 25 | return [output["score"] for output in reward_pipe(list_of_strings)] 26 | 27 | best_of_n = BestOfNSampler(model, tokenizer, queries_to_scores, length_sampler=output_length_sampler) 28 | 29 | 30 | ``` 31 | 32 | And assuming you have a list/tensor of tokenized queries, you can generate better output by calling the `generate` method: 33 | 34 | ```python 35 | 36 | best_of_n.generate(query_tensors, device=device, **gen_kwargs) 37 | 38 | ``` 39 | The default sample size is 4, but you can change it at the time of instance initialization like so: 40 | 41 | ```python 42 | 43 | best_of_n = BestOfNSampler(model, tokenizer, queries_to_scores, length_sampler=output_length_sampler, sample_size=8) 44 | 45 | ``` 46 | 47 | The default output is the result of taking the top-scored output for each query, but you can change it to top 2 and so on by passing the `n_candidates` argument at the time of instance initialization: 48 | 49 | ```python 50 | 51 | best_of_n = BestOfNSampler(model, tokenizer, queries_to_scores, length_sampler=output_length_sampler, n_candidates=2) 52 | 53 | ``` 54 | 55 | There is the option of setting the generation settings (like `temperature`, `pad_token_id`) at the time of instance creation, as opposed to when calling the `generate` method.
56 | This is done by passing a `GenerationConfig` from the `transformers` library at the time of initialization: 57 | 58 | ```python 59 | 60 | from transformers import GenerationConfig 61 | 62 | generation_config = GenerationConfig(min_length=-1, top_k=0, top_p=1.0, do_sample=True, pad_token_id=tokenizer.eos_token_id) 63 | 64 | best_of_n = BestOfNSampler(model, tokenizer, queries_to_scores, length_sampler=output_length_sampler, generation_config=generation_config) 65 | 66 | best_of_n.generate(query_tensors, device=device) 67 | 68 | ``` 69 | 70 | Furthermore, at the time of initialization you can set the seed to control the repeatability of the generation process, as well as the number of samples to generate for each query. 71 | 72 | 73 | -------------------------------------------------------------------------------- /training/trl/.github/workflows/slow-tests.yml: -------------------------------------------------------------------------------- 1 | name: Slow tests (on push) 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | paths: 7 | # Run only when python files are modified 8 | - "trl/**.py" 9 | - "examples/**.py" 10 | env: 11 | RUN_SLOW: "yes" 12 | IS_GITHUB_CI: "1" 13 | SLACK_API_TOKEN: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} 14 | 15 | 16 | jobs: 17 | run_all_tests_single_gpu: 18 | strategy: 19 | fail-fast: false 20 | matrix: 21 | docker-image-name: ["huggingface/trl-latest-gpu:latest", "huggingface/trl-source-gpu:latest"] 22 | runs-on: 23 | group: aws-g4dn-2xlarge 24 | env: 25 | CUDA_VISIBLE_DEVICES: "0" 26 | TEST_TYPE: "single_gpu_${{ matrix.docker-image-name }}" 27 | container: 28 | image: ${{ matrix.docker-image-name }} 29 | options: --gpus all --shm-size "16gb" -e NVIDIA_DISABLE_REQUIRE=true 30 | defaults: 31 | run: 32 | shell: bash 33 | steps: 34 | - uses: actions/checkout@v4 35 | - name: Pip install 36 | run: | 37 | source activate trl 38 | pip install -e ".[test]" --no-deps 39 | pip install pytest-reportlog parameterized 40 | 41 | - name: Run slow SFT tests on single GPU 42 | if: always() 43 | run: | 44 | source activate trl 45 | make slow_tests 46 | 47 | - name: Generate Report 48 | if: always() 49 | run: | 50 | pip install slack_sdk tabulate 51 | python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY 52 | 53 | 54 | run_all_tests_multi_gpu: 55 | strategy: 56 | fail-fast: false 57 | matrix: 58 | docker-image-name: ["huggingface/trl-latest-gpu:latest", "huggingface/trl-source-gpu:latest"] 59 | runs-on: 60 | group: aws-g4dn-2xlarge 61 | env: 62 | CUDA_VISIBLE_DEVICES: "0,1" 63 | TEST_TYPE: "multi_gpu_${{ matrix.docker-image-name }}" 64 | container: 65 | image: ${{ matrix.docker-image-name }} 66 | options: --gpus all --shm-size "16gb" -e NVIDIA_DISABLE_REQUIRE=true 67 | defaults: 68 | run: 69 | shell: bash 70 | steps: 71 | - uses: actions/checkout@v4 72 | - name: Pip install 73 | run: | 74 | source activate trl 75 | pip install -e ".[test]" --no-deps 76 | pip install pytest-reportlog parameterized 77 | 78 | - name: Run slow SFT tests on Multi GPU 79 | if: always() 80 | run: | 81 | source activate trl 82 | make slow_tests 83 | 84 | - name: Run end-to-end examples tests on multi GPU 85 | if: always() 86 | run: | 87 | source activate trl 88 | pip install deepspeed 89 | make test_examples 90 | 91 | - name: Generate Reports 92 | if: always() 93 | run: | 94 | pip install slack_sdk tabulate 95 | python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY 96 | python scripts/log_example_reports.py --text_file_name temp_results_sft_tests.txt >> $GITHUB_STEP_SUMMARY 97 | python
scripts/log_example_reports.py --text_file_name temp_results_dpo_tests.txt >> $GITHUB_STEP_SUMMARY 98 | rm *.txt 99 | -------------------------------------------------------------------------------- /training/trl/trl/extras/profiling.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import contextlib 16 | import functools 17 | import time 18 | from typing import Generator 19 | 20 | from transformers import Trainer, is_wandb_available 21 | 22 | 23 | if is_wandb_available(): 24 | import wandb 25 | 26 | 27 | @contextlib.contextmanager 28 | def profiling_context(trainer: Trainer, name: str) -> Generator[None, None, None]: 29 | """ 30 | A context manager function for profiling a block of code. Results are logged to Weights & Biases if enabled. 31 | 32 | Args: 33 | trainer (`~transformers.Trainer`): 34 | Trainer object. 35 | name (`str`): 36 | Name of the block to be profiled. Used as a key in the logged dictionary. 37 | 38 | Example: 39 | ```python 40 | from transformers import Trainer 41 | from trl.extras.profiling import profiling_context 42 | 43 | class MyTrainer(Trainer): 44 | def some_method(self): 45 | A = np.random.rand(1000, 1000) 46 | B = np.random.rand(1000, 1000) 47 | with profiling_context(self, "matrix_multiplication"): 48 | # Code to profile: simulate a computationally expensive operation 49 | result = A @ B # Matrix multiplication 50 | ``` 51 | """ 52 | start_time = time.perf_counter() 53 | yield 54 | end_time = time.perf_counter() 55 | duration = end_time - start_time 56 | 57 | if "wandb" in trainer.args.report_to and wandb.run is not None and trainer.accelerator.is_main_process: 58 | wandb.log({f"profiling/Time taken: {trainer.__class__.__name__}.{name}": duration}) 59 | 60 | 61 | def profiling_decorator(func: callable) -> callable: 62 | """ 63 | Decorator to profile a function and log execution time using [`extras.profiling.profiling_context`]. 64 | 65 | Args: 66 | func (`callable`): 67 | Function to be profiled. 
68 | 69 | Example: 70 | ```python 71 | from transformers import Trainer 72 | from trl.extras.profiling import profiling_decorator 73 | 74 | class MyTrainer(Trainer): 75 | @profiling_decorator 76 | def some_method(self): 77 | A = np.random.rand(1000, 1000) 78 | B = np.random.rand(1000, 1000) 79 | # Code to profile: simulate a computationally expensive operation 80 | result = A @ B 81 | ``` 82 | """ 83 | 84 | @functools.wraps(func) 85 | def wrapper(self, *args, **kwargs): 86 | with profiling_context(self, func.__name__): 87 | return func(self, *args, **kwargs) 88 | 89 | return wrapper 90 | -------------------------------------------------------------------------------- /training/open-r1/recipes/DeepSeek-R1-Distill-Qwen-1.5B/sft/config_demo.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | chat_template: "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>'}}{% endif %}" 9 | dataset_name: akhauriyash/OpenR1_Math_SpeculativeReasoning 10 | dataset_num_proc: 48 11 | 12 | # SFT trainer config 13 | bf16: true 14 | do_eval: false 15 | eval_strategy: 'no' 16 | gradient_accumulation_steps: 1 17 | gradient_checkpointing: true 18 | gradient_checkpointing_kwargs: 19 | use_reentrant: false 20 | hub_model_id: DeepSeek-R1-Distill-Qwen-1.5B-SpeculativeReasoner 21 | hub_strategy: every_save 22 | learning_rate: 5.0e-05 23 | log_level: info 24 | logging_steps: 1 
25 | logging_strategy: steps 26 | lr_scheduler_type: cosine_with_min_lr 27 | lr_scheduler_kwargs: 28 | min_lr_rate: 0.1 29 | packing: true 30 | max_length: 16384 31 | max_steps: -1 32 | num_train_epochs: 3 33 | output_dir: data/DeepSeek-R1-Distill-Qwen-1.5B-SpeculativeReasoner 34 | overwrite_output_dir: true 35 | per_device_eval_batch_size: 8 36 | per_device_train_batch_size: 8 37 | push_to_hub: true 38 | report_to: 39 | - wandb 40 | save_strategy: "steps" 41 | save_steps: 50 42 | save_total_limit: 1 43 | seed: 42 44 | use_liger: true 45 | use_liger_kernel: true 46 | warmup_ratio: 0.05 47 | -------------------------------------------------------------------------------- /training/trl/.github/workflows/docker-build.yml: -------------------------------------------------------------------------------- 1 | name: Build Docker images (scheduled) 2 | 3 | on: 4 | workflow_dispatch: 5 | workflow_call: 6 | schedule: 7 | - cron: "0 1 * * *" 8 | 9 | concurrency: 10 | group: docker-image-builds 11 | cancel-in-progress: false 12 | 13 | env: 14 | CI_SLACK_CHANNEL: ${{ secrets.CI_DOCKER_CHANNEL }} 15 | 16 | jobs: 17 | trl-latest: 18 | name: "Latest TRL GPU" 19 | runs-on: ubuntu-latest 20 | steps: 21 | - name: Cleanup disk 22 | run: | 23 | sudo ls -l /usr/local/lib/ 24 | sudo ls -l /usr/share/ 25 | sudo du -sh /usr/local/lib/ 26 | sudo du -sh /usr/share/ 27 | sudo rm -rf /usr/local/lib/android 28 | sudo rm -rf /usr/share/dotnet 29 | sudo du -sh /usr/local/lib/ 30 | sudo du -sh /usr/share/ 31 | - name: Set up Docker Buildx 32 | uses: docker/setup-buildx-action@v1 33 | - name: Check out code 34 | uses: actions/checkout@v4 35 | - name: Login to DockerHub 36 | uses: docker/login-action@v1 37 | with: 38 | username: ${{ secrets.DOCKERHUB_USERNAME }} 39 | password: ${{ secrets.DOCKERHUB_PASSWORD }} 40 | 41 | - name: Build and Push GPU 42 | uses: docker/build-push-action@v4 43 | with: 44 | context: ./docker/trl-latest-gpu 45 | push: true 46 | tags: huggingface/trl-latest-gpu 47 | 48 | - name: Post to Slack 49 | if: always() 50 | uses: huggingface/hf-workflows/.github/actions/post-slack@main 51 | with: 52 | slack_channel: ${{ env.CI_SLACK_CHANNEL }} 53 | title: 🤗 Results of the trl-latest-gpu Docker Image build 54 | status: ${{ job.status }} 55 | slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} 56 | 57 | trl-source: 58 | name: "Latest TRL + HF ecosystem from source" 59 | runs-on: ubuntu-latest 60 | steps: 61 | - name: Cleanup disk 62 | run: | 63 | sudo ls -l /usr/local/lib/ 64 | sudo ls -l /usr/share/ 65 | sudo du -sh /usr/local/lib/ 66 | sudo du -sh /usr/share/ 67 | sudo rm -rf /usr/local/lib/android 68 | sudo rm -rf /usr/share/dotnet 69 | sudo du -sh /usr/local/lib/ 70 | sudo du -sh /usr/share/ 71 | - name: Set up Docker Buildx 72 | uses: docker/setup-buildx-action@v1 73 | - name: Check out code 74 | uses: actions/checkout@v4 75 | - name: Login to DockerHub 76 | uses: docker/login-action@v1 77 | with: 78 | username: ${{ secrets.DOCKERHUB_USERNAME }} 79 | password: ${{ secrets.DOCKERHUB_PASSWORD }} 80 | 81 | - name: Build and Push GPU 82 | uses: docker/build-push-action@v4 83 | with: 84 | context: ./docker/trl-source-gpu 85 | push: true 86 | tags: huggingface/trl-source-gpu 87 | 88 | - name: Post to Slack 89 | if: always() 90 | uses: huggingface/hf-workflows/.github/actions/post-slack@main 91 | with: 92 | slack_channel: ${{ env.CI_SLACK_CHANNEL }} 93 | title: 🤗 Results of the trl-source-gpu Docker Image build 94 | status: ${{ job.status }} 95 | slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} 96 | 
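Since this workflow also declares `workflow_dispatch` and `workflow_call` triggers, the image builds can be started manually as well, for example with an authenticated GitHub CLI (invocation shown for illustration):

```bash
# Trigger the scheduled Docker image build on demand
gh workflow run "Build Docker images (scheduled)"
```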
-------------------------------------------------------------------------------- /training/trl/docs/source/_toctree.yml: -------------------------------------------------------------------------------- 1 | - sections: 2 | - local: index 3 | title: TRL 4 | - local: installation 5 | title: Installation 6 | - local: quickstart 7 | title: Quickstart 8 | title: Getting started 9 | - sections: 10 | - local: dataset_formats 11 | title: Dataset Formats 12 | - local: how_to_train 13 | title: Training FAQ 14 | - local: logging 15 | title: Understanding Logs 16 | title: Conceptual Guides 17 | - sections: 18 | - local: clis 19 | title: Command Line Interface (CLI) 20 | - local: customization 21 | title: Customizing the Training 22 | - local: reducing_memory_usage 23 | title: Reducing Memory Usage 24 | - local: speeding_up_training 25 | title: Speeding Up Training 26 | - local: distributing_training 27 | title: Distributing Training 28 | - local: use_model 29 | title: Using Trained Models 30 | title: How-to guides 31 | - sections: 32 | - local: deepspeed_integration 33 | title: DeepSpeed 34 | - local: liger_kernel_integration 35 | title: Liger Kernel 36 | - local: peft_integration 37 | title: PEFT 38 | - local: unsloth_integration 39 | title: Unsloth 40 | title: Integrations 41 | - sections: 42 | - local: example_overview 43 | title: Example Overview 44 | - local: community_tutorials 45 | title: Community Tutorials 46 | - local: sentiment_tuning 47 | title: Sentiment Tuning 48 | - local: using_llama_models 49 | title: Training StackLlama 50 | - local: detoxifying_a_lm 51 | title: Detoxifying a Language Model 52 | - local: learning_tools 53 | title: Learning to Use Tools 54 | - local: multi_adapter_rl 55 | title: Multi Adapter RLHF 56 | title: Examples 57 | - sections: 58 | - sections: # Sorted alphabetically 59 | - local: alignprop_trainer 60 | title: AlignProp 61 | - local: bco_trainer 62 | title: BCO 63 | - local: cpo_trainer 64 | title: CPO 65 | - local: ddpo_trainer 66 | title: DDPO 67 | - local: dpo_trainer 68 | title: DPO 69 | - local: online_dpo_trainer 70 | title: Online DPO 71 | - local: gkd_trainer 72 | title: GKD 73 | - local: grpo_trainer 74 | title: GRPO 75 | - local: kto_trainer 76 | title: KTO 77 | - local: nash_md_trainer 78 | title: Nash-MD 79 | - local: orpo_trainer 80 | title: ORPO 81 | - local: ppo_trainer 82 | title: PPO 83 | - local: prm_trainer 84 | title: PRM 85 | - local: reward_trainer 86 | title: Reward 87 | - local: rloo_trainer 88 | title: RLOO 89 | - local: sft_trainer 90 | title: SFT 91 | - local: iterative_sft_trainer 92 | title: Iterative SFT 93 | - local: xpo_trainer 94 | title: XPO 95 | title: Trainers 96 | - local: models 97 | title: Model Classes 98 | - local: best_of_n 99 | title: Best of N Sampling 100 | - local: judges 101 | title: Judges 102 | - local: callbacks 103 | title: Callbacks 104 | - local: data_utils 105 | title: Data Utilities 106 | - local: text_environments 107 | title: Text Environments 108 | - local: script_utils 109 | title: Script Utilities 110 | - local: others 111 | title: Others 112 | title: API 113 | -------------------------------------------------------------------------------- /training/open-r1/slurm/train.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=open-r1-sft 3 | #SBATCH --ntasks-per-node=1 4 | #SBATCH --exclusive 5 | #SBATCH --gres=gpu:8 6 | #SBATCH --partition=hopper-prod # Adjust this for your cluster 7 | #SBATCH --output=./logs/%x-%j.out 8 | #SBATCH 
--err=./logs/%x-%j.err 9 | #SBATCH --requeue 10 | 11 | # Specific configuration optimized for the Hugging Face Compute Cluster 12 | module load cuda/12.4 13 | set -x -e 14 | 15 | source ~/.bashrc 16 | source openr1/bin/activate 17 | echo "START TIME: $(date)" 18 | 19 | MODEL=$1 20 | TASK=$2 21 | CONFIG_SUFFIX=$3 22 | ACCELERATOR=$4 23 | OPTIONAL_ARGS=$5 24 | CONFIG_FILE=recipes/$MODEL/$TASK/config_$CONFIG_SUFFIX.yaml 25 | GRAD_ACC_STEPS=$(grep 'gradient_accumulation_steps' $CONFIG_FILE | awk '{print $2}') 26 | MODEL=$(grep 'model_name_or_path:' $CONFIG_FILE | awk '{print $2}') 27 | REVISION=$(grep 'model_revision:' $CONFIG_FILE | head -n 1 | awk '{print $2}') 28 | 29 | # Distributed configuration 30 | NUM_NODES=$SLURM_NNODES 31 | GPUS_PER_NODE=8 32 | WORLD_SIZE=$(($NUM_NODES*$GPUS_PER_NODE)) 33 | NODELIST=($(scontrol show hostnames $SLURM_JOB_NODELIST)) 34 | MASTER_ADDR=${NODELIST[0]} # First node for main process 35 | MASTER_PORT=6000 36 | TRAIN_NODES=("${NODELIST[@]}") 37 | 38 | USE_VLLM="false" 39 | if [[ -f "$CONFIG_FILE" ]] && grep -qE '^\s*use_vllm:\s*true' "$CONFIG_FILE"; then 40 | USE_VLLM="true" 41 | fi 42 | # if using vllm 43 | if [[ "$USE_VLLM" == "true" ]]; then 44 | TRAIN_NODES=("${NODELIST[@]:0:$((NUM_NODES - 1))}") 45 | VLLM_NODE=${NODELIST[-1]} # Last node 46 | TP=$(python scripts/get_tensor_parallel_size.py --model_name $MODEL --revision $REVISION --default_tp $GPUS_PER_NODE) 47 | WORLD_SIZE=$((WORLD_SIZE - GPUS_PER_NODE)) 48 | NUM_NODES=$((NUM_NODES - 1)) 49 | srun --nodes=1 --ntasks=1 --nodelist=$VLLM_NODE trl vllm-serve --model $MODEL --revision $REVISION --tensor_parallel_size $TP & 50 | 51 | OPTIONAL_ARGS="$OPTIONAL_ARGS --vllm_server_host=$VLLM_NODE" 52 | fi 53 | 54 | # force crashing on nccl issues like hanging broadcast 55 | export NCCL_ASYNC_ERROR_HANDLING=1 56 | # export NCCL_DEBUG=INFO 57 | # export NCCL_DEBUG_SUBSYS=COLL 58 | # export NCCL_SOCKET_NTHREADS=1 59 | # export NCCL_NSOCKS_PERTHREAD=1 60 | # export CUDA_LAUNCH_BLOCKING=1 61 | 62 | export CMD=" \ 63 | src/open_r1/$TASK.py --config $CONFIG_FILE $OPTIONAL_ARGS 64 | " 65 | 66 | export LAUNCHER="HF_HUB_ENABLE_HF_TRANSFER=1 ACCELERATE_LOG_LEVEL=info TRANSFORMERS_VERBOSITY=info accelerate launch \ 67 | --config_file recipes/accelerate_configs/$ACCELERATOR.yaml \ 68 | --gradient_accumulation_steps $GRAD_ACC_STEPS \ 69 | --num_machines $NUM_NODES \ 70 | --num_processes $WORLD_SIZE \ 71 | --main_process_ip $MASTER_ADDR \ 72 | --main_process_port $MASTER_PORT \ 73 | --machine_rank $SLURM_PROCID \ 74 | --rdzv_backend=c10d \ 75 | --max_restarts 1 \ 76 | --role \$(hostname -s): \ 77 | --tee 3 \ 78 | " 79 | # srun error handling: 80 | # --wait=60: wait 60 sec after the first task terminates before terminating all remaining tasks 81 | # --kill-on-bad-exit=1: terminate a step if any task exits with a non-zero exit code 82 | SRUN_ARGS=" \ 83 | --wait=60 \ 84 | --kill-on-bad-exit=1 \ 85 | --nodes=$NUM_NODES \ 86 | --ntasks=$NUM_NODES \ 87 | --nodelist=$TRAIN_NODES 88 | " 89 | clear; srun $SRUN_ARGS --jobid $SLURM_JOB_ID bash -c "$LAUNCHER --role \$SLURMD_NODENAME: $CMD" 2>&1 90 | 91 | echo "END TIME: $(date)" --------------------------------------------------------------------------------
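For reference, `train.slurm` resolves its positional arguments to a recipe at `recipes/$MODEL/$TASK/config_$CONFIG_SUFFIX.yaml`, so a typical submission looks like the following (node count and recipe name are illustrative):

```bash
# MODEL TASK CONFIG_SUFFIX ACCELERATOR [OPTIONAL_ARGS]
sbatch --nodes=1 slurm/train.slurm Qwen2.5-1.5B-Instruct sft demo zero3 ""
```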