├── modes
├── __init__.py
├── speculative_decoding.py
├── placeholder.py
├── big_model_only.py
└── small_model_only.py
├── training
├── open-r1
│ ├── tests
│ │ ├── __init__.py
│ │ └── slow
│ │ │ └── test_code_reward.py
│ ├── assets
│ │ └── plan-of-attack.png
│ ├── src
│ │ └── open_r1
│ │ │ ├── utils
│ │ │ ├── __init__.py
│ │ │ ├── ioi
│ │ │ │ ├── __init__.py
│ │ │ │ └── utils.py
│ │ │ ├── wandb_logging.py
│ │ │ ├── import_utils.py
│ │ │ └── model_utils.py
│ │ │ └── __init__.py
│ ├── .github
│ │ ├── dependabot.yml
│ │ └── workflows
│ │ │ └── tests.yml
│ ├── recipes
│ │ ├── accelerate_configs
│ │ │ ├── ddp.yaml
│ │ │ ├── zero2.yaml
│ │ │ ├── zero3.yaml
│ │ │ └── fsdp.yaml
│ │ ├── README.md
│ │ ├── Qwen2.5-1.5B-Instruct
│ │ │ ├── sft
│ │ │ │ └── config_demo.yaml
│ │ │ └── grpo
│ │ │ │ ├── config_demo.yaml
│ │ │ │ ├── config_demo_code.yaml
│ │ │ │ └── config_demo_code_ioi.yaml
│ │ ├── OlympicCoder-7B
│ │ │ └── sft
│ │ │ │ └── config_v00.00.yaml
│ │ ├── Mistral-Small-24B-Instruct-2501
│ │ │ └── sft
│ │ │ │ └── config_openr1_math.yaml
│ │ ├── OpenR1-Qwen-7B
│ │ │ └── sft
│ │ │ │ └── config.yaml
│ │ ├── SmolLM2-1.7B
│ │ │ └── sft
│ │ │ │ └── config.yaml
│ │ ├── SmolLM2-1.7B-Instruct
│ │ │ └── sft
│ │ │ │ └── config.yaml
│ │ ├── OlympicCoder-32B
│ │ │ └── sft
│ │ │ │ └── config_v00.00.yaml
│ │ ├── Qwen2.5-Math-7B
│ │ │ └── grpo
│ │ │ │ └── config_simple_rl.yaml
│ │ ├── Qwen2.5-7B-Instruct
│ │ │ └── grpo
│ │ │ │ └── config_demo.yaml
│ │ └── DeepSeek-R1-Distill-Qwen-1.5B
│ │ │ └── sft
│ │ │ └── config_demo.yaml
│ ├── slurm
│ │ ├── piston
│ │ │ ├── launch_piston_workers.sh
│ │ │ └── launch_single_piston.sh
│ │ ├── README.md
│ │ ├── serve_router.slurm
│ │ └── train.slurm
│ ├── setup.cfg
│ ├── base_training.sh
│ ├── scripts
│ │ ├── get_tensor_parallel_size.py
│ │ ├── upload_details.py
│ │ └── run_benchmarks.py
│ ├── Makefile
│ └── offload_read_graph.py
└── trl
│ ├── setup.cfg
│ ├── requirements.txt
│ ├── examples
│ ├── README.md
│ ├── research_projects
│ │ ├── stack_llama_2
│ │ │ └── scripts
│ │ │ │ ├── requirements.txt
│ │ │ │ └── README.md
│ │ ├── toxicity
│ │ │ └── README.md
│ │ ├── README.md
│ │ └── stack_llama
│ │ │ └── scripts
│ │ │ ├── README.md
│ │ │ └── merge_peft_adapter.py
│ ├── accelerate_configs
│ │ ├── single_gpu.yaml
│ │ ├── multi_gpu.yaml
│ │ ├── deepspeed_zero1.yaml
│ │ ├── deepspeed_zero2.yaml
│ │ ├── deepspeed_zero3.yaml
│ │ └── fsdp_qlora.yaml
│ ├── cli_configs
│ │ └── example_config.yaml
│ ├── notebooks
│ │ └── README.md
│ └── scripts
│ │ ├── dpo.py
│ │ ├── sft.py
│ │ └── sft_gemma3.py
│ ├── docs
│ └── source
│ │ ├── unsloth_integration.md
│ │ ├── liger_kernel_integration.md
│ │ ├── others.md
│ │ ├── script_utils.md
│ │ ├── callbacks.md
│ │ ├── data_utils.md
│ │ ├── installation.md
│ │ ├── models.md
│ │ ├── iterative_sft_trainer.md
│ │ ├── deepspeed_integration.md
│ │ ├── judges.md
│ │ ├── use_model.md
│ │ ├── distributing_training.md
│ │ ├── sentiment_tuning.md
│ │ ├── best_of_n.md
│ │ └── _toctree.yml
│ ├── MANIFEST.in
│ ├── .github
│ ├── workflows
│ │ ├── trufflehog.yml
│ │ ├── issue_auto_labeller.yml
│ │ ├── upload_pr_documentation.yml
│ │ ├── build_documentation.yml
│ │ ├── build_pr_documentation.yml
│ │ ├── codeQL.yml
│ │ ├── clear_cache.yml
│ │ ├── tests_latest.yml
│ │ ├── slow-tests.yml
│ │ └── docker-build.yml
│ ├── codeql
│ │ └── custom-queries.qls
│ ├── ISSUE_TEMPLATE
│ │ ├── feature-request.yml
│ │ ├── new-trainer-addition.yml
│ │ └── bug-report.yml
│ └── PULL_REQUEST_TEMPLATE.md
│ ├── .pre-commit-config.yaml
│ ├── pyproject.toml
│ ├── tests
│ ├── __init__.py
│ ├── slow
│ │ ├── __init__.py
│ │ └── testing_constants.py
│ ├── testing_constants.py
│ ├── test_core.py
│ ├── test_rich_progress_callback.py
│ └── test_modeling_geometric_mixture_wrapper.py
│ ├── trl
│ ├── extras
│ │ ├── __init__.py
│ │ └── profiling.py
│ ├── environment
│ │ └── __init__.py
│ ├── scripts
│ │ └── __init__.py
│ ├── trainer
│ │ ├── xpo_config.py
│ │ └── nash_md_config.py
│ ├── templates
│ │ └── lm_model_card.md
│ └── models
│ │ └── __init__.py
│ ├── Makefile
│ ├── CITATION.cff
│ ├── commands
│ ├── run_sft.sh
│ └── run_dpo.sh
│ ├── docker
│ ├── trl-latest-gpu
│ │ └── Dockerfile
│ └── trl-source-gpu
│ │ └── Dockerfile
│ └── .gitignore
├── figs
├── image.png
├── image_seq_parl.png
└── accuracy_to_latency_teaser_main.png
├── annotated_dataset
├── switch_behavior.pdf
├── dataset_analysis.pdf
├── normalized_offloading_deepseek.png
├── offload_percentage_by_qid_deepseek.png
├── strlen_analysis.py
├── hf_Dataset
│ └── proprocess.py
└── annotation_statistics.csv
├── requirements.txt
├── lm_eval_files
├── aime
│ ├── aime25_nofigures.yaml
│ ├── aime_figures.yaml
│ ├── aime_nofigures.yaml
│ ├── aime24_nofigures.yaml
│ ├── aime24_figures.yaml
│ ├── aime_2024_agg8.yaml
│ ├── aime_2024_rebase.yaml
│ ├── aime25_nofigures_agg64.yaml
│ ├── aime24_figures_agg64.yaml
│ ├── aime24_nofigures_agg64.yaml
│ ├── aime25_nofigures_maj8cov8.yaml
│ ├── aime24_nofigures_maj8cov8.yaml
│ └── README.md
├── vllm_speculative_init.py
├── openai_math
│ ├── openai_math.yaml
│ ├── openai_math_train.yaml
│ ├── openai_math_cov64.yaml
│ ├── openai_math_cov64_train.yaml
│ ├── openai_math_maj64_cov64_train.yaml
│ ├── openai_math_agg64.yaml
│ └── openai_math_maj64_cov64.yaml
└── openai
│ ├── gpqa_diamond_openai.yaml
│ └── gpqa_diamond_openai_maj64_cov64.yaml
└── logging_config.json
/modes/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /training/open-r1/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /training/trl/setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | license_file = LICENSE -------------------------------------------------------------------------------- /training/trl/requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate 2 | datasets 3 | rich 4 | transformers>=4.46.0 -------------------------------------------------------------------------------- /figs/image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abdelfattah-lab/SplitReason/HEAD/figs/image.png -------------------------------------------------------------------------------- /figs/image_seq_parl.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abdelfattah-lab/SplitReason/HEAD/figs/image_seq_parl.png -------------------------------------------------------------------------------- /annotated_dataset/switch_behavior.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abdelfattah-lab/SplitReason/HEAD/annotated_dataset/switch_behavior.pdf -------------------------------------------------------------------------------- /annotated_dataset/dataset_analysis.pdf:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/abdelfattah-lab/SplitReason/HEAD/annotated_dataset/dataset_analysis.pdf -------------------------------------------------------------------------------- /figs/accuracy_to_latency_teaser_main.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abdelfattah-lab/SplitReason/HEAD/figs/accuracy_to_latency_teaser_main.png -------------------------------------------------------------------------------- /training/open-r1/assets/plan-of-attack.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abdelfattah-lab/SplitReason/HEAD/training/open-r1/assets/plan-of-attack.png -------------------------------------------------------------------------------- /training/trl/examples/README.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | 3 | Please check out https://huggingface.co/docs/trl/example_overview for documentation on our examples. -------------------------------------------------------------------------------- /annotated_dataset/normalized_offloading_deepseek.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abdelfattah-lab/SplitReason/HEAD/annotated_dataset/normalized_offloading_deepseek.png -------------------------------------------------------------------------------- /training/trl/examples/research_projects/stack_llama_2/scripts/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers 2 | trl 3 | peft 4 | accelerate 5 | datasets 6 | bitsandbytes 7 | wandb 8 | -------------------------------------------------------------------------------- /annotated_dataset/offload_percentage_by_qid_deepseek.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abdelfattah-lab/SplitReason/HEAD/annotated_dataset/offload_percentage_by_qid_deepseek.png -------------------------------------------------------------------------------- /training/trl/docs/source/unsloth_integration.md: -------------------------------------------------------------------------------- 1 | # Unsloth Integration 2 | 3 | 4 | 5 | Section under construction. Feel free to contribute! 6 | 7 | -------------------------------------------------------------------------------- /training/trl/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include settings.ini 2 | include LICENSE 3 | include CONTRIBUTING.md 4 | include README.md 5 | recursive-exclude * __pycache__ 6 | include trl/templates/*.md -------------------------------------------------------------------------------- /training/trl/docs/source/liger_kernel_integration.md: -------------------------------------------------------------------------------- 1 | # Liger Kernel Integration 2 | 3 | 4 | 5 | Section under construction. Feel free to contribute! 
6 | 7 | -------------------------------------------------------------------------------- /training/open-r1/src/open_r1/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .import_utils import is_e2b_available 2 | from .model_utils import get_tokenizer 3 | 4 | 5 | __all__ = ["get_tokenizer", "is_e2b_available"] 6 | -------------------------------------------------------------------------------- /training/trl/docs/source/others.md: -------------------------------------------------------------------------------- 1 | # Other 2 | 3 | ## profiling_decorator 4 | 5 | [[autodoc]] extras.profiling.profiling_decorator 6 | 7 | ## profiling_context 8 | 9 | [[autodoc]] extras.profiling.profiling_context 10 | -------------------------------------------------------------------------------- /training/open-r1/.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "pip" 4 | directory: "/" 5 | schedule: 6 | interval: "weekly" 7 | - package-ecosystem: "github-actions" 8 | directory: "/" 9 | schedule: 10 | interval: "weekly" 11 | -------------------------------------------------------------------------------- /training/trl/docs/source/script_utils.md: -------------------------------------------------------------------------------- 1 | # Scripts Utilities 2 | 3 | ## ScriptArguments 4 | 5 | [[autodoc]] ScriptArguments 6 | 7 | ## TrlParser 8 | 9 | [[autodoc]] TrlParser 10 | - parse_args_and_config 11 | - parse_args_into_dataclasses 12 | - set_defaults_with_config 13 | -------------------------------------------------------------------------------- /training/trl/examples/research_projects/toxicity/README.md: -------------------------------------------------------------------------------- 1 | # Detoxifying language models 2 | 3 | To run this code, do the following: 4 | 5 | ```shell 6 | ACCELERATE_LOG_LEVEL=info accelerate launch --config_file {CONFIG} examples/research_projects/toxicity/scripts/gpt-j-6b-toxicity.py --log_with wandb 7 | ``` 8 | -------------------------------------------------------------------------------- /training/trl/.github/workflows/trufflehog.yml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | 4 | name: Secret Leaks 5 | 6 | jobs: 7 | trufflehog: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - name: Checkout code 11 | uses: actions/checkout@v4 12 | with: 13 | fetch-depth: 0 14 | - name: Secret Scanning 15 | uses: trufflesecurity/trufflehog@main 16 | -------------------------------------------------------------------------------- /training/open-r1/src/open_r1/utils/ioi/__init__.py: -------------------------------------------------------------------------------- 1 | from .piston_client import get_piston_client_from_env, get_slurm_piston_endpoints 2 | from .scoring import SubtaskResult, score_subtask 3 | from .utils import add_includes 4 | 5 | 6 | __all__ = [ 7 | "get_piston_client_from_env", 8 | "get_slurm_piston_endpoints", 9 | "score_subtask", 10 | "add_includes", 11 | "SubtaskResult", 12 | ] 13 | -------------------------------------------------------------------------------- /training/trl/.github/workflows/issue_auto_labeller.yml: -------------------------------------------------------------------------------- 1 | name: "Hugging Face Issue Labeler" 2 | on: 3 | issues: 4 | types: opened 5 | 6 | jobs: 7 | triage: 8 | runs-on: ubuntu-latest 9 | permissions: 10 | issues: write 11 |
steps: 12 | - uses: actions/checkout@v3 13 | - uses: August-murr/auto-labeler@main 14 | with: 15 | hf-api-key: ${{ secrets.CI_HF_API_TOKEN }} 16 | -------------------------------------------------------------------------------- /training/trl/docs/source/callbacks.md: -------------------------------------------------------------------------------- 1 | # Callbacks 2 | 3 | ## SyncRefModelCallback 4 | 5 | [[autodoc]] SyncRefModelCallback 6 | 7 | ## RichProgressCallback 8 | 9 | [[autodoc]] RichProgressCallback 10 | 11 | ## WinRateCallback 12 | 13 | [[autodoc]] WinRateCallback 14 | 15 | ## LogCompletionsCallback 16 | 17 | [[autodoc]] LogCompletionsCallback 18 | 19 | ## MergeModelCallback 20 | 21 | [[autodoc]] MergeModelCallback -------------------------------------------------------------------------------- /training/open-r1/recipes/accelerate_configs/ddp.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | distributed_type: MULTI_GPU 4 | downcast_bf16: 'no' 5 | gpu_ids: all 6 | machine_rank: 0 7 | main_training_function: main 8 | mixed_precision: bf16 9 | num_machines: 1 10 | num_processes: 8 11 | rdzv_backend: static 12 | same_network: true 13 | tpu_env: [] 14 | tpu_use_cluster: false 15 | tpu_use_sudo: false 16 | use_cpu: false 17 | -------------------------------------------------------------------------------- /training/trl/examples/accelerate_configs/single_gpu.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | distributed_type: "NO" 4 | downcast_bf16: 'no' 5 | gpu_ids: all 6 | machine_rank: 0 7 | main_training_function: main 8 | mixed_precision: 'bf16' 9 | num_machines: 1 10 | num_processes: 8 11 | rdzv_backend: static 12 | same_network: true 13 | tpu_env: [] 14 | tpu_use_cluster: false 15 | tpu_use_sudo: false 16 | use_cpu: false 17 | -------------------------------------------------------------------------------- /training/trl/examples/accelerate_configs/multi_gpu.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | distributed_type: MULTI_GPU 4 | downcast_bf16: 'no' 5 | gpu_ids: all 6 | machine_rank: 0 7 | main_training_function: main 8 | mixed_precision: 'bf16' 9 | num_machines: 1 10 | num_processes: 8 11 | rdzv_backend: static 12 | same_network: true 13 | tpu_env: [] 14 | tpu_use_cluster: false 15 | tpu_use_sudo: false 16 | use_cpu: false 17 | -------------------------------------------------------------------------------- /training/open-r1/recipes/README.md: -------------------------------------------------------------------------------- 1 | # Post-training recipes 2 | 3 | ## OlympicCoder 4 | 5 | To train the OlympicCoder models, run: 6 | 7 | ``` 8 | # 7B 9 | sbatch --nodes=1 slurm/train.slurm OlympicCoder-7B sft v00.00 zero3 10 | 11 | # 32B 12 | sbatch --nodes=16 slurm/train.slurm OlympicCoder-32B sft v00.00 fsdp 13 | ``` 14 | 15 | Note that we found it necessary to switch to FSDP1 and paged AdamW 8-bit for the 32B model in order to fit the largest possible context size. 
-------------------------------------------------------------------------------- /training/trl/.github/workflows/upload_pr_documentation.yml: -------------------------------------------------------------------------------- 1 | name: Upload PR Documentation 2 | 3 | on: 4 | workflow_run: 5 | workflows: ["Build PR Documentation"] 6 | types: 7 | - completed 8 | 9 | jobs: 10 | build: 11 | uses: huggingface/doc-builder/.github/workflows/upload_pr_documentation.yml@main 12 | with: 13 | package_name: trl 14 | secrets: 15 | hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }} 16 | comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }} -------------------------------------------------------------------------------- /training/trl/examples/cli_configs/example_config.yaml: -------------------------------------------------------------------------------- 1 | # This is an example configuration file of TRL CLI, you can use it for 2 | # SFT like that: `trl sft --config config.yaml --output_dir test-sft` 3 | # The YAML file supports environment variables by adding an `env` field 4 | # as below 5 | 6 | # env: 7 | # CUDA_VISIBLE_DEVICES: 0 8 | 9 | model_name_or_path: 10 | Qwen/Qwen2.5-0.5B 11 | dataset_name: 12 | stanfordnlp/imdb 13 | report_to: 14 | none 15 | learning_rate: 16 | 0.0001 17 | lr_scheduler_type: 18 | cosine 19 | -------------------------------------------------------------------------------- /training/trl/.github/workflows/build_documentation.yml: -------------------------------------------------------------------------------- 1 | name: Build documentation 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - doc-builder* 8 | - v*-release 9 | 10 | jobs: 11 | build: 12 | uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@main 13 | with: 14 | commit_sha: ${{ github.sha }} 15 | package: trl 16 | version_tag_suffix: "" 17 | custom_container: huggingface/transformers-doc-builder 18 | secrets: 19 | hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }} 20 | -------------------------------------------------------------------------------- /training/open-r1/src/open_r1/utils/wandb_logging.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | def init_wandb_training(training_args): 5 | """ 6 | Helper function for setting up Weights & Biases logging tools. 
7 | """ 8 | if training_args.wandb_entity is not None: 9 | os.environ["WANDB_ENTITY"] = training_args.wandb_entity 10 | if training_args.wandb_project is not None: 11 | os.environ["WANDB_PROJECT"] = training_args.wandb_project 12 | if training_args.wandb_run_group is not None: 13 | os.environ["WANDB_RUN_GROUP"] = training_args.wandb_run_group 14 | -------------------------------------------------------------------------------- /training/trl/.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/astral-sh/ruff-pre-commit 3 | rev: v0.9.7 4 | hooks: 5 | - id: ruff 6 | types_or: [ python, pyi ] 7 | args: [ --fix ] 8 | - id: ruff-format 9 | types_or: [ python, pyi ] 10 | 11 | # - repo: https://github.com/codespell-project/codespell 12 | # rev: v2.1.0 13 | # hooks: 14 | # - id: codespell 15 | # args: 16 | # - --ignore-words-list=nd,reacher,thist,ths,magent,ba 17 | # - --skip=docs/css/termynal.css,docs/js/termynal.js 18 | -------------------------------------------------------------------------------- /training/trl/examples/accelerate_configs/deepspeed_zero1.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | deepspeed_multinode_launcher: standard 5 | gradient_accumulation_steps: 1 6 | zero3_init_flag: false 7 | zero_stage: 1 8 | distributed_type: DEEPSPEED 9 | downcast_bf16: 'no' 10 | machine_rank: 0 11 | main_training_function: main 12 | mixed_precision: 'bf16' 13 | num_machines: 1 14 | num_processes: 8 15 | rdzv_backend: static 16 | same_network: true 17 | tpu_env: [] 18 | tpu_use_cluster: false 19 | tpu_use_sudo: false 20 | use_cpu: false 21 | -------------------------------------------------------------------------------- /training/open-r1/recipes/accelerate_configs/zero2.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | deepspeed_multinode_launcher: standard 5 | offload_optimizer_device: none 6 | offload_param_device: none 7 | zero3_init_flag: false 8 | zero_stage: 2 9 | distributed_type: DEEPSPEED 10 | downcast_bf16: 'no' 11 | machine_rank: 0 12 | main_training_function: main 13 | mixed_precision: bf16 14 | num_machines: 1 15 | num_processes: 8 16 | rdzv_backend: static 17 | same_network: true 18 | tpu_env: [] 19 | tpu_use_cluster: false 20 | tpu_use_sudo: false 21 | use_cpu: false -------------------------------------------------------------------------------- /training/trl/.github/workflows/build_pr_documentation.yml: -------------------------------------------------------------------------------- 1 | name: Build PR Documentation 2 | 3 | on: 4 | pull_request: 5 | 6 | concurrency: 7 | group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} 8 | cancel-in-progress: true 9 | 10 | jobs: 11 | build: 12 | uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main 13 | with: 14 | commit_sha: ${{ github.event.pull_request.head.sha }} 15 | pr_number: ${{ github.event.number }} 16 | package: trl 17 | version_tag_suffix: "" 18 | custom_container: huggingface/transformers-doc-builder 19 | -------------------------------------------------------------------------------- /training/trl/examples/accelerate_configs/deepspeed_zero2.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: 
LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | deepspeed_multinode_launcher: standard 5 | offload_optimizer_device: none 6 | offload_param_device: none 7 | zero3_init_flag: false 8 | zero_stage: 2 9 | distributed_type: DEEPSPEED 10 | downcast_bf16: 'no' 11 | machine_rank: 0 12 | main_training_function: main 13 | mixed_precision: 'bf16' 14 | num_machines: 1 15 | num_processes: 8 16 | rdzv_backend: static 17 | same_network: true 18 | tpu_env: [] 19 | tpu_use_cluster: false 20 | tpu_use_sudo: false 21 | use_cpu: false 22 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aiofiles==24.1.0 2 | aiohttp==3.11.13 3 | datasets==3.5.0 4 | distilabel==1.5.3 5 | e2b_code_interpreter==1.2.0 6 | Flask==3.1.0 7 | hf_transfer==0.1.9 8 | httpx==0.28.1 9 | huggingface_hub==0.30.2 10 | latex2sympy2_extended==1.10.1 11 | lighteval==0.8.1 12 | lm_eval==0.4.8 13 | math_verify==0.7.0 14 | matplotlib==3.10.1 15 | more_itertools==10.6.0 16 | numpy 17 | pytablewriter==1.2.1 18 | python-dotenv==1.1.0 19 | Requests==2.32.3 20 | scipy==1.15.2 21 | setuptools==75.8.0 22 | spacy==3.8.5 23 | tabulate==0.9.0 24 | torch==2.6.0 25 | tqdm==4.67.1 26 | transformers==4.51.2 27 | trl==0.16.1 28 | uvloop==0.21.0 29 | vllm==0.8.3 -------------------------------------------------------------------------------- /training/open-r1/recipes/accelerate_configs/zero3.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | deepspeed_multinode_launcher: standard 5 | offload_optimizer_device: none 6 | offload_param_device: none 7 | zero3_init_flag: true 8 | zero3_save_16bit_model: true 9 | zero_stage: 3 10 | distributed_type: DEEPSPEED 11 | downcast_bf16: 'no' 12 | machine_rank: 0 13 | main_training_function: main 14 | mixed_precision: bf16 15 | num_machines: 1 16 | num_processes: 8 17 | rdzv_backend: static 18 | same_network: true 19 | tpu_env: [] 20 | tpu_use_cluster: false 21 | tpu_use_sudo: false 22 | use_cpu: false 23 | -------------------------------------------------------------------------------- /training/trl/examples/accelerate_configs/deepspeed_zero3.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | deepspeed_multinode_launcher: standard 5 | offload_optimizer_device: none 6 | offload_param_device: none 7 | zero3_init_flag: true 8 | zero3_save_16bit_model: true 9 | zero_stage: 3 10 | distributed_type: DEEPSPEED 11 | downcast_bf16: 'no' 12 | machine_rank: 0 13 | main_training_function: main 14 | mixed_precision: bf16 15 | num_machines: 1 16 | num_processes: 8 17 | rdzv_backend: static 18 | same_network: true 19 | tpu_env: [] 20 | tpu_use_cluster: false 21 | tpu_use_sudo: false 22 | use_cpu: false 23 | -------------------------------------------------------------------------------- /training/trl/examples/research_projects/README.md: -------------------------------------------------------------------------------- 1 | # Research projects that use TRL 2 | 3 | Welcome to the research projects folder! Here you can find the scripts used for some research projects that use TRL and are maintained by the developers and the community (LM de-toxification, Stack-Llama, etc.). Check out the READMEs in the subfolders for more information!
4 | 5 | - [Detoxifying language models](https://github.com/huggingface/trl/tree/main/examples/research_projects/toxicity) 6 | - [Stack-Llama](https://github.com/huggingface/trl/tree/main/examples/research_projects/stack_llama) 7 | - [Stack-Llama-2](https://github.com/huggingface/trl/tree/main/examples/research_projects/stack_llama_2) -------------------------------------------------------------------------------- /training/trl/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.ruff] 2 | target-version = "py37" 3 | line-length = 119 4 | 5 | [tool.ruff.lint] 6 | ignore = [ 7 | "B028", # warning without explicit stacklevel 8 | "C408", # dict() calls (stylistic) 9 | "C901", # function complexity 10 | "E501", 11 | ] 12 | extend-select = ["E", "F", "I", "W", "UP", "B", "T", "C"] 13 | 14 | [tool.ruff.lint.per-file-ignores] 15 | # Allow prints in auxiliary scripts 16 | "examples/**.py" = ["T201"] 17 | "scripts/**.py" = ["T201"] 18 | # Ignore import violations in all `__init__.py` files. 19 | "__init__.py" = ["F401"] 20 | 21 | [tool.ruff.lint.isort] 22 | lines-after-imports = 2 23 | known-first-party = ["trl"] 24 | -------------------------------------------------------------------------------- /training/trl/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /training/trl/tests/slow/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /training/open-r1/src/open_r1/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /training/open-r1/slurm/piston/launch_piston_workers.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # this simple script will launch a bunch of piston workers on the HF science cluster 4 | 5 | N_INSTANCES=${1:-5} # Default to 5 instances 6 | 7 | for i in $(seq 1 $N_INSTANCES); do 8 | # Find random (hopefully) available port 9 | PORT=$(comm -23 <(seq 2000 10000 | sort) <(ss -tan | awk '{print $4}' | cut -d':' -f2 | sort -u) | shuf | head -n1) 10 | 11 | # the job name format is important for the code to then be able to get a list of workers. `piston-worker-` 12 | sbatch \ 13 | --job-name="piston-worker-$PORT" \ 14 | --export=ALL,PORT=$PORT \ 15 | slurm/piston/launch_single_piston.sh 16 | done -------------------------------------------------------------------------------- /lm_eval_files/aime/aime25_nofigures.yaml: -------------------------------------------------------------------------------- 1 | tag: 2 | - math_word_problems 3 | task: aime25_nofigures 4 | dataset_path: TIGER-Lab/AIME25 5 | dataset_name: default 6 | process_docs: !function utils.process_docs 7 | output_type: generate_until 8 | test_split: train 9 | doc_to_text: !function utils.doc_to_text 10 | doc_to_target: answer 11 | process_results: !function utils.process_results 12 | generation_kwargs: 13 | until: [] 14 | do_sample: false 15 | temperature: 0 16 | max_gen_toks: 32768 17 | metric_list: 18 | - metric: exact_match 19 | aggregation: mean 20 | higher_is_better: true 21 | - metric: extracted_answers 22 | aggregation: bypass 23 | higher_is_better: true 24 | metadata: 25 | version: 1.0 -------------------------------------------------------------------------------- /lm_eval_files/aime/aime_figures.yaml: -------------------------------------------------------------------------------- 1 | tag: 2 | - math_word_problems 3 | task: aime_figures 4 | dataset_path: simplescaling/aime_figures 5 | dataset_name: default 6 | process_docs: !function utils.process_docs 7 | output_type: generate_until 8 | test_split: train 9 | doc_to_text: !function utils.doc_to_text 10 | doc_to_target: answer 11 | process_results: !function utils.process_results 12 | generation_kwargs: 13 | until: [] 14 | do_sample: false 15 | temperature: 0 16 | max_gen_toks: 32768 17 | metric_list: 18 | - metric: exact_match 19 | aggregation: mean 20 | higher_is_better: true 21 | - metric: extracted_answers 22 | aggregation: bypass 23 | higher_is_better: true 24 | metadata: 25 | version: 1.0 -------------------------------------------------------------------------------- /lm_eval_files/vllm_speculative_init.py: -------------------------------------------------------------------------------- 1 | from . 
import ( 2 | anthropic_llms, 3 | api_models, 4 | dummy, 5 | gguf, 6 | hf_vlms, 7 | huggingface, 8 | mamba_lm, 9 | nemo_lm, 10 | neuralmagic, 11 | neuron_optimum, 12 | openai_completions, 13 | optimum_lm, 14 | textsynth, 15 | vllm_causallms, 16 | vllm_speculative, 17 | vllm_vlms, 18 | ) 19 | 20 | 21 | # TODO: implement __all__ 22 | 23 | 24 | try: 25 | # enable hf hub transfer if available 26 | import hf_transfer # type: ignore # noqa 27 | import huggingface_hub.constants # type: ignore 28 | 29 | huggingface_hub.constants.HF_HUB_ENABLE_HF_TRANSFER = True 30 | except ImportError: 31 | pass 32 | -------------------------------------------------------------------------------- /lm_eval_files/aime/aime_nofigures.yaml: -------------------------------------------------------------------------------- 1 | tag: 2 | - math_word_problems 3 | task: aime_nofigures 4 | dataset_path: simplescaling/aime_nofigures 5 | dataset_name: default 6 | process_docs: !function utils.process_docs 7 | output_type: generate_until 8 | test_split: train 9 | doc_to_text: !function utils.doc_to_text 10 | doc_to_target: answer 11 | process_results: !function utils.process_results 12 | generation_kwargs: 13 | until: [] 14 | do_sample: false 15 | temperature: 0 16 | max_gen_toks: 32768 17 | metric_list: 18 | - metric: exact_match 19 | aggregation: mean 20 | higher_is_better: true 21 | - metric: extracted_answers 22 | aggregation: bypass 23 | higher_is_better: true 24 | metadata: 25 | version: 1.0 -------------------------------------------------------------------------------- /lm_eval_files/aime/aime24_nofigures.yaml: -------------------------------------------------------------------------------- 1 | tag: 2 | - math_word_problems 3 | task: aime24_nofigures 4 | dataset_path: simplescaling/aime24_nofigures 5 | dataset_name: default 6 | process_docs: !function utils.process_docs 7 | output_type: generate_until 8 | test_split: train 9 | doc_to_text: !function utils.doc_to_text 10 | doc_to_target: answer 11 | process_results: !function utils.process_results 12 | generation_kwargs: 13 | until: [] 14 | do_sample: false 15 | temperature: 0 16 | max_gen_toks: 32768 17 | metric_list: 18 | - metric: exact_match 19 | aggregation: mean 20 | higher_is_better: true 21 | - metric: extracted_answers 22 | aggregation: bypass 23 | higher_is_better: true 24 | metadata: 25 | version: 1.0 -------------------------------------------------------------------------------- /training/trl/.github/workflows/codeQL.yml: -------------------------------------------------------------------------------- 1 | name: "CodeQL Analysis - Workflows" 2 | 3 | on: 4 | workflow_dispatch: 5 | 6 | jobs: 7 | analyze: 8 | name: "Analyze GitHub Workflows" 9 | runs-on: ubuntu-latest 10 | permissions: 11 | security-events: write 12 | actions: read 13 | contents: read 14 | 15 | steps: 16 | - name: "Checkout repository" 17 | uses: actions/checkout@v4 18 | 19 | - name: "Initialize CodeQL" 20 | uses: github/codeql-action/init@v2 21 | with: 22 | languages: "yaml" 23 | queries: +security-and-quality, ./.github/codeql/custom-queries.qls 24 | 25 | - name: "Perform CodeQL Analysis" 26 | uses: github/codeql-action/analyze@v2 27 | -------------------------------------------------------------------------------- /lm_eval_files/aime/aime24_figures.yaml: -------------------------------------------------------------------------------- 1 | tag: 2 | - math_word_problems 3 | task: aime24_figures 4 | dataset_path: simplescaling/aime24_figures 5 | dataset_name: default 6 | process_docs: !function 
utils.process_docs 7 | output_type: generate_until 8 | test_split: train 9 | doc_to_text: !function utils.doc_to_text 10 | doc_to_target: answer 11 | process_results: !function utils.process_results 12 | generation_kwargs: 13 | until: [] 14 | do_sample: false 15 | temperature: 0 16 | max_gen_toks: 32768 17 | metric_list: 18 | - metric: exact_match 19 | aggregation: mean 20 | higher_is_better: true 21 | - metric: exact_match_aime24 22 | aggregation: mean_last30 23 | higher_is_better: true 24 | - metric: extracted_answers 25 | aggregation: bypass 26 | higher_is_better: true 27 | metadata: 28 | version: 1.0 -------------------------------------------------------------------------------- /lm_eval_files/openai_math/openai_math.yaml: -------------------------------------------------------------------------------- 1 | tag: 2 | - math_word_problems 3 | task: openai_math 4 | dataset_path: simplescaling/openaimath 5 | process_docs: !function utils.process_docs 6 | output_type: generate_until 7 | test_split: test 8 | doc_to_text: !function utils.doc_to_text 9 | doc_to_target: answer 10 | process_results: !function utils.process_results 11 | generation_kwargs: 12 | until: [] 13 | do_sample: false 14 | temperature: 0 15 | max_gen_toks: 2048 # 2x of https://github.com/openai/simple-evals/blob/c0dba4c7bfbc17f786aec7bd7c3585a36ad81f23/sampler/chat_completion_sampler.py#L27 16 | metric_list: 17 | - metric: exact_match 18 | aggregation: mean 19 | higher_is_better: true 20 | - metric: extracted_answers 21 | aggregation: bypass 22 | higher_is_better: true 23 | metadata: 24 | version: 1.0 -------------------------------------------------------------------------------- /training/trl/tests/testing_constants.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | CI_HUB_USER = "__DUMMY_TRANSFORMERS_USER__" 16 | CI_HUB_USER_FULL_NAME = "Dummy User" 17 | 18 | CI_HUB_ENDPOINT = "https://hub-ci.huggingface.co" 19 | -------------------------------------------------------------------------------- /training/trl/examples/notebooks/README.md: -------------------------------------------------------------------------------- 1 | # Notebooks 2 | 3 | This directory contains a collection of Jupyter notebooks that demonstrate how to use the TRL library in different applications. 4 | 5 | - [`best_of_n.ipynb`](https://github.com/huggingface/trl/tree/main/examples/notebooks/best_of_n.ipynb): This notebook demonstrates how to use the "Best of N" sampling strategy using TRL when fine-tuning your model with PPO. 6 | - [`gpt2-sentiment.ipynb`](https://github.com/huggingface/trl/tree/main/examples/notebooks/gpt2-sentiment.ipynb): This notebook demonstrates how to reproduce the GPT2 imdb sentiment tuning example on a jupyter notebook. 
7 | - [`gpt2-control.ipynb`](https://github.com/huggingface/trl/tree/main/examples/notebooks/gpt2-sentiment-control.ipynb): This notebook demonstrates how to reproduce the GPT2 sentiment control example on a jupyter notebook. 8 | -------------------------------------------------------------------------------- /training/open-r1/.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - v*-release 8 | pull_request: 9 | branches: 10 | - main 11 | 12 | jobs: 13 | 14 | tests: 15 | name: Run tests and quality checks 16 | runs-on: ubuntu-latest 17 | steps: 18 | - name: Checkout code 19 | uses: actions/checkout@v4 20 | - name: Setup Python environment 21 | uses: actions/setup-python@v5 22 | with: 23 | python-version: 3.10.10 24 | - name: Install dependencies 25 | run: | 26 | python -m pip install --upgrade pip 27 | python -m pip install ".[quality,tests]" 28 | - name: Code quality 29 | run: | 30 | make quality 31 | - name: Run tests 32 | run: | 33 | make test 34 | 35 | -------------------------------------------------------------------------------- /training/open-r1/setup.cfg: -------------------------------------------------------------------------------- 1 | [isort] 2 | default_section = FIRSTPARTY 3 | ensure_newline_before_comments = True 4 | force_grid_wrap = 0 5 | include_trailing_comma = True 6 | known_first_party = open_r1 7 | known_third_party = 8 | transformers 9 | datasets 10 | fugashi 11 | git 12 | h5py 13 | matplotlib 14 | nltk 15 | numpy 16 | packaging 17 | pandas 18 | psutil 19 | pytest 20 | rouge_score 21 | sacrebleu 22 | seqeval 23 | sklearn 24 | streamlit 25 | torch 26 | tqdm 27 | 28 | line_length = 119 29 | lines_after_imports = 2 30 | multi_line_output = 3 31 | use_parentheses = True 32 | 33 | [flake8] 34 | ignore = E203, E501, E741, W503, W605 35 | max-line-length = 119 36 | per-file-ignores = 37 | # imported but unused 38 | __init__.py: F401 39 | 40 | [tool:pytest] 41 | doctest_optionflags=NUMBER NORMALIZE_WHITESPACE ELLIPSIS -------------------------------------------------------------------------------- /lm_eval_files/openai_math/openai_math_train.yaml: -------------------------------------------------------------------------------- 1 | group: 2 | - math_word_problems 3 | task: openai_math_train 4 | dataset_path: simplescaling/openaimath 5 | process_docs: !function utils.process_docs 6 | output_type: generate_until 7 | test_split: train 8 | doc_to_text: !function utils.doc_to_text 9 | doc_to_target: answer 10 | process_results: !function utils.process_results 11 | generation_kwargs: 12 | until: 13 | - "Problem:" 14 | skip_special_tokens: false 15 | do_sample: false 16 | temperature: 0.0 17 | max_gen_toks: 2048 # 2x of https://github.com/openai/simple-evals/blob/c0dba4c7bfbc17f786aec7bd7c3585a36ad81f23/sampler/chat_completion_sampler.py#L27 18 | metric_list: 19 | - metric: exact_match 20 | aggregation: mean 21 | higher_is_better: true 22 | - metric: extracted_answers 23 | aggregation: bypass 24 | higher_is_better: true 25 | metadata: 26 | version: 1.0 -------------------------------------------------------------------------------- /training/trl/docs/source/data_utils.md: -------------------------------------------------------------------------------- 1 | # Data Utilities 2 | 3 | ## is_conversational 4 | 5 | [[autodoc]] is_conversational 6 | 7 | ## apply_chat_template 8 | 9 | [[autodoc]] apply_chat_template 10 | 11 | ## maybe_apply_chat_template 12 | 13 | 
[[autodoc]] maybe_apply_chat_template 14 | 15 | ## maybe_convert_to_chatml 16 | 17 | [[autodoc]] maybe_convert_to_chatml 18 | 19 | ## extract_prompt 20 | 21 | [[autodoc]] extract_prompt 22 | 23 | ## maybe_extract_prompt 24 | 25 | [[autodoc]] maybe_extract_prompt 26 | 27 | ## unpair_preference_dataset 28 | 29 | [[autodoc]] unpair_preference_dataset 30 | 31 | ## maybe_unpair_preference_dataset 32 | 33 | [[autodoc]] maybe_unpair_preference_dataset 34 | 35 | ## pack_examples 36 | 37 | [[autodoc]] pack_examples 38 | 39 | ## pack_dataset 40 | 41 | [[autodoc]] pack_dataset 42 | 43 | ## truncate_dataset 44 | 45 | [[autodoc]] truncate_dataset 46 | -------------------------------------------------------------------------------- /training/open-r1/recipes/accelerate_configs/fsdp.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | distributed_type: FSDP 4 | downcast_bf16: 'no' 5 | enable_cpu_affinity: false 6 | fsdp_config: 7 | fsdp_activation_checkpointing: false # Need fix from: https://github.com/huggingface/transformers/pull/36610 8 | fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP 9 | fsdp_backward_prefetch: BACKWARD_PRE 10 | fsdp_cpu_ram_efficient_loading: true 11 | fsdp_forward_prefetch: true 12 | fsdp_offload_params: false 13 | fsdp_sharding_strategy: FULL_SHARD 14 | fsdp_state_dict_type: FULL_STATE_DICT 15 | fsdp_sync_module_states: true 16 | fsdp_use_orig_params: true 17 | machine_rank: 0 18 | main_training_function: main 19 | mixed_precision: bf16 20 | num_machines: 1 21 | num_processes: 8 22 | rdzv_backend: static 23 | same_network: true 24 | tpu_env: [] 25 | tpu_use_cluster: false 26 | tpu_use_sudo: false 27 | use_cpu: false -------------------------------------------------------------------------------- /training/open-r1/base_training.sh: -------------------------------------------------------------------------------- 1 | 2 | 3 | ##### TRAINING SFT SCRIPT ##### 4 | 5 | ACCELERATE_LOG_LEVEL=info MASTER_PORT=29501 accelerate launch --main_process_port 29502 --config_file recipes/accelerate_configs/zero3.yaml \ 6 | src/open_r1/sft.py \ 7 | --config recipes/DeepSeek-R1-Distill-Qwen-1.5B/sft/config_demo.yaml --wandb_project SpeculativeReasoning --run_name DeepSeek-R1-Distill-Qwen-1.5B-SpeculativeReasoning 8 | 9 | ##### GRPO SCRIPT ##### 10 | 11 | CUDA_VISIBLE_DEVICES=0 trl vllm-serve --model akhauriyash/DeepSeek-R1-Distill-Qwen-1.5B-SpeculativeReasoner 12 | 13 | CUDA_VISIBLE_DEVICES=1,2,3,4,5,6,7 ACCELERATE_LOG_LEVEL=info \ 14 | accelerate launch --config_file recipes/accelerate_configs/zero2.yaml --num_processes 7 \ 15 | src/open_r1/grpo.py --config recipes/DeepSeek-R1-Distill-Qwen-1.5B/grpo/config_demo.yaml --wandb_project SpeculativeReasoning --run_name DeepSeek-R1-Distill-Qwen-1.5B-GRPO-SpeculativeReasoner 16 | 17 | 18 | -------------------------------------------------------------------------------- /training/open-r1/src/open_r1/utils/import_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from transformers.utils.import_utils import _is_package_available 16 | 17 | 18 | # Use same as transformers.utils.import_utils 19 | _e2b_available = _is_package_available("e2b") 20 | 21 | 22 | def is_e2b_available() -> bool: 23 | return _e2b_available 24 | -------------------------------------------------------------------------------- /lm_eval_files/openai_math/openai_math_cov64.yaml: -------------------------------------------------------------------------------- 1 | group: 2 | - math_word_problems 3 | task: openai_math_cov64 4 | dataset_path: simplescaling/openaimath 5 | process_docs: !function utils.process_docs 6 | output_type: generate_until 7 | test_split: test 8 | doc_to_text: !function utils.doc_to_text 9 | doc_to_target: answer 10 | process_results: !function utils.process_results 11 | generation_kwargs: 12 | until: [] 13 | do_sample: true 14 | temperature: 0.5 15 | max_gen_toks: 2048 # 2x of https://github.com/openai/simple-evals/blob/c0dba4c7bfbc17f786aec7bd7c3585a36ad81f23/sampler/chat_completion_sampler.py#L27 16 | metric_list: 17 | - metric: exact_match 18 | aggregation: mean 19 | higher_is_better: true 20 | repeats: 64 21 | filter_list: 22 | - name: "cov@64" # get coverage@64 , via allowing all 64 samples and then picking only the correct one in the evaluator. 23 | filter: 24 | - function: "take_first_k" 25 | k: 64 26 | metadata: 27 | version: 1.0 -------------------------------------------------------------------------------- /logging_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 1, 3 | "formatters": { 4 | "default": { 5 | "class": "vllm.logging_utils.NewLineFormatter", 6 | "datefmt": "%m-%d %H:%M:%S", 7 | "format": "%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s" 8 | } 9 | }, 10 | "handlers": { 11 | "console_error": { 12 | "class": "logging.StreamHandler", 13 | "formatter": "default", 14 | "level": "ERROR", 15 | "stream": "ext://sys.stdout" 16 | }, 17 | "console_info": { 18 | "class": "logging.StreamHandler", 19 | "formatter": "default", 20 | "level": "INFO", 21 | "stream": "ext://sys.stdout" 22 | } 23 | }, 24 | "loggers": { 25 | "vllm": { 26 | "handlers": ["console_error"], 27 | "level": "ERROR", 28 | "propagate": false 29 | }, 30 | "vllm.metrics": { 31 | "handlers": ["console_info"], 32 | "level": "INFO", 33 | "propagate": false 34 | } 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /training/trl/examples/scripts/dpo.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | ############################################################################################### 16 | # This file has been moved to https://github.com/huggingface/trl/blob/main/trl/scripts/dpo.py # 17 | ############################################################################################### 18 | -------------------------------------------------------------------------------- /training/trl/examples/scripts/sft.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | ############################################################################################### 16 | # This file has been moved to https://github.com/huggingface/trl/blob/main/trl/scripts/sft.py # 17 | ############################################################################################### 18 | -------------------------------------------------------------------------------- /lm_eval_files/openai_math/openai_math_cov64_train.yaml: -------------------------------------------------------------------------------- 1 | group: 2 | - math_word_problems 3 | task: openai_math_cov64_train 4 | dataset_path: simplescaling/openaimath 5 | process_docs: !function utils.process_docs 6 | output_type: generate_until 7 | test_split: train 8 | doc_to_text: !function utils.doc_to_text 9 | doc_to_target: answer 10 | process_results: !function utils.process_results 11 | generation_kwargs: 12 | until: 13 | - "Problem:" 14 | do_sample: true 15 | temperature: 0.5 16 | max_gen_toks: 2048 # 2x of https://github.com/openai/simple-evals/blob/c0dba4c7bfbc17f786aec7bd7c3585a36ad81f23/sampler/chat_completion_sampler.py#L27 17 | metric_list: 18 | - metric: exact_match 19 | aggregation: mean 20 | higher_is_better: true 21 | repeats: 64 22 | filter_list: 23 | - name: "cov@64" # get coverage@64 , via allowing all 64 samples and then picking only the correct one in the evaluator. 
24 | filter: 25 | - function: "take_first_k" 26 | k: 64 27 | metadata: 28 | version: 1.0 -------------------------------------------------------------------------------- /training/trl/.github/workflows/clear_cache.yml: -------------------------------------------------------------------------------- 1 | name: "Cleanup Cache" 2 | 3 | on: 4 | workflow_dispatch: 5 | schedule: 6 | - cron: "0 0 * * *" 7 | 8 | jobs: 9 | cleanup: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Check out code 13 | uses: actions/checkout@v4 14 | 15 | - name: Cleanup 16 | run: | 17 | gh extension install actions/gh-actions-cache 18 | 19 | REPO=${{ github.repository }} 20 | 21 | echo "Fetching list of cache key" 22 | cacheKeysForPR=$(gh actions-cache list -R $REPO | cut -f 1 ) 23 | 24 | ## Setting this to not fail the workflow while deleting cache keys. 25 | set +e 26 | echo "Deleting caches..." 27 | for cacheKey in $cacheKeysForPR 28 | do 29 | gh actions-cache delete $cacheKey -R $REPO --confirm 30 | done 31 | echo "Done" 32 | env: 33 | GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} 34 | -------------------------------------------------------------------------------- /training/trl/docs/source/installation.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | You can install TRL either from PyPI or from source: 3 | 4 | ## PyPI 5 | Install the library with pip or [uv](https://docs.astral.sh/uv/): 6 | 7 | 8 | 9 | 10 | uv is a fast Rust-based Python package and project manager. Refer to [Installation](https://docs.astral.sh/uv/getting-started/installation/) for installation instructions. 11 | 12 | ```bash 13 | uv pip install trl 14 | ``` 15 | 16 | 17 | 18 | 19 | ```bash 20 | pip install trl 21 | ``` 22 | 23 | 24 | 25 | 26 | ## Source 27 | You can also install the latest version from source. First clone the repo and then run the installation with `pip`: 28 | 29 | ```bash 30 | git clone https://github.com/huggingface/trl.git 31 | cd trl/ 32 | pip install -e . 33 | ``` 34 | 35 | If you want the development install you can replace the pip install with the following: 36 | 37 | ```bash 38 | pip install -e ".[dev]" 39 | ``` 40 | -------------------------------------------------------------------------------- /training/trl/docs/source/models.md: -------------------------------------------------------------------------------- 1 | # Models 2 | 3 | With the `AutoModelForCausalLMWithValueHead` class TRL supports all decoder model architectures in transformers such as GPT-2, OPT, and GPT-Neo. In addition, with `AutoModelForSeq2SeqLMWithValueHead` you can use encoder-decoder architectures such as T5. TRL also requires reference models which are frozen copies of the model that is trained. With `create_reference_model` you can easily create a frozen copy and also share layers between the two models to save memory.
4 | 5 | ## PreTrainedModelWrapper 6 | 7 | [[autodoc]] PreTrainedModelWrapper 8 | 9 | ## AutoModelForCausalLMWithValueHead 10 | 11 | 12 | [[autodoc]] AutoModelForCausalLMWithValueHead 13 | - __init__ 14 | - forward 15 | - generate 16 | - _init_weights 17 | 18 | ## AutoModelForSeq2SeqLMWithValueHead 19 | 20 | [[autodoc]] AutoModelForSeq2SeqLMWithValueHead 21 | - __init__ 22 | - forward 23 | - generate 24 | - _init_weights 25 | 26 | ## create_reference_model 27 | 28 | [[autodoc]] create_reference_model -------------------------------------------------------------------------------- /training/open-r1/slurm/README.md: -------------------------------------------------------------------------------- 1 | ## Serving DeepSeek-R1 on 2x8 H100 SLURM nodes with SGLang 2 | 3 | 1. Set up the environment (adjust for your cuda version): 4 | ```bash 5 | conda create -n sglang124 python=3.11 6 | conda activate sglang124 7 | 8 | pip install torch==2.5.1 --index-url https://download.pytorch.org/whl/cu124 9 | 10 | pip install sgl-kernel --force-reinstall --no-deps 11 | pip install "sglang[all]>=0.4.2.post4" --find-links https://flashinfer.ai/whl/cu124/torch2.5/flashinfer/ 12 | ``` 13 | 14 | 2. Run the server and wait for the model to load: 15 | ```bash 16 | sbatch slurm/serve_r1.slurm -m "/fsx/deepseek-r1-checkpoint" -e "sglang124" 17 | ``` 18 | 19 | 3. Run the data generation script: 20 | ```bash 21 | python scripts/generate_reasoning.py \ 22 | --dataset-name "AI-MO/NuminaMath-1.5" \ 23 | --output-file "numinamath_r1_generations.jsonl" \ 24 | --prompt-column "problem" \ 25 | --uuid-column "problem" \ 26 | --api-addr ":39877" \ 27 | --num-generations 2 \ 28 | --max-tokens 16384 \ 29 | --max-concurrent 200 30 | ``` -------------------------------------------------------------------------------- /training/trl/examples/accelerate_configs/fsdp_qlora.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | distributed_type: FSDP 4 | downcast_bf16: 'no' 5 | fsdp_config: 6 | fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP 7 | fsdp_backward_prefetch: BACKWARD_PRE 8 | fsdp_cpu_ram_efficient_loading: true 9 | fsdp_forward_prefetch: false 10 | fsdp_offload_params: true 11 | fsdp_sharding_strategy: FULL_SHARD 12 | fsdp_state_dict_type: SHARDED_STATE_DICT 13 | fsdp_sync_module_states: true 14 | fsdp_use_orig_params: false 15 | machine_rank: 0 16 | main_training_function: main 17 | mixed_precision: 'bf16' 18 | num_machines: 1 19 | num_processes: 8 20 | rdzv_backend: static 21 | same_network: true 22 | tpu_env: [] 23 | tpu_use_cluster: false 24 | tpu_use_sudo: false 25 | use_cpu: false -------------------------------------------------------------------------------- /training/trl/trl/extras/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from typing import TYPE_CHECKING 16 | 17 | from ..import_utils import _LazyModule 18 | 19 | 20 | _import_structure = { 21 | "best_of_n_sampler": ["BestOfNSampler"], 22 | } 23 | 24 | if TYPE_CHECKING: 25 | from .best_of_n_sampler import BestOfNSampler 26 | else: 27 | import sys 28 | 29 | sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) 30 | -------------------------------------------------------------------------------- /training/trl/.github/codeql/custom-queries.qls: -------------------------------------------------------------------------------- 1 | import codeql 2 | 3 | from WorkflowString interpolation, Workflow workflow, Step step 4 | where 5 | ( 6 | interpolation.getStringValue().matches("${{ github.event.issue.title }}") or 7 | interpolation.getStringValue().matches("${{ github.event.issue.body }}") or 8 | interpolation.getStringValue().matches("${{ github.event.pull_request.title }}") or 9 | interpolation.getStringValue().matches("${{ github.event.pull_request.body }}") or 10 | interpolation.getStringValue().matches("${{ github.event.review.body }}") or 11 | interpolation.getStringValue().matches("${{ github.event.comment.body }}") or 12 | interpolation.getStringValue().matches("${{ github.event.inputs.* }}") or 13 | interpolation.getStringValue().matches("${{ github.event.head_commit.message }}") or 14 | interpolation.getStringValue().matches("${{ github.event.* }}") 15 | ) and 16 | ( 17 | step.getKey() = "run" or // Injection in run 18 | step.getKey() = "env" or // Injection via env 19 | step.getKey() = "with" // Injection via with 20 | ) 21 | select workflow, "🚨 Do not use event data directly as input to an action" 22 | -------------------------------------------------------------------------------- /training/trl/trl/environment/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from typing import TYPE_CHECKING 16 | 17 | from ..import_utils import _LazyModule 18 | 19 | 20 | _import_structure = { 21 | "base_environment": ["TextEnvironment", "TextHistory"], 22 | } 23 | 24 | if TYPE_CHECKING: 25 | from .base_environment import TextEnvironment, TextHistory 26 | else: 27 | import sys 28 | 29 | sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) 30 | -------------------------------------------------------------------------------- /training/trl/trl/scripts/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from typing import TYPE_CHECKING 16 | 17 | from ..import_utils import _LazyModule 18 | 19 | 20 | _import_structure = { 21 | "utils": ["init_zero_verbose", "ScriptArguments", "TrlParser"], 22 | } 23 | 24 | if TYPE_CHECKING: 25 | from .utils import ScriptArguments, TrlParser, init_zero_verbose 26 | else: 27 | import sys 28 | 29 | sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) 30 | -------------------------------------------------------------------------------- /training/trl/tests/slow/testing_constants.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | MODELS_TO_TEST = [ 16 | "trl-internal-testing/tiny-LlamaForCausalLM-3.2", 17 | "trl-internal-testing/tiny-MistralForCausalLM-0.2", 18 | ] 19 | 20 | # We could have also not declared these variables but let's be verbose 21 | PACKING_OPTIONS = [True, False] 22 | GRADIENT_CHECKPOINTING_KWARGS = [None, {"use_reentrant": False}, {"use_reentrant": True}] 23 | DEVICE_MAP_OPTIONS = [{"": 0}, "auto"] 24 | 25 | DPO_LOSS_TYPES = ["sigmoid", "ipo"] 26 | DPO_PRECOMPUTE_LOGITS = [True, False] 27 | -------------------------------------------------------------------------------- /training/open-r1/recipes/Qwen2.5-1.5B-Instruct/sft/config_demo.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: Qwen/Qwen2.5-1.5B-Instruct 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: open-r1/OpenR1-Math-220k 9 | dataset_num_proc: 48 10 | 11 | # SFT trainer config 12 | bf16: true 13 | do_eval: false 14 | eval_strategy: 'no' 15 | gradient_accumulation_steps: 1 16 | gradient_checkpointing: true 17 | gradient_checkpointing_kwargs: 18 | use_reentrant: false 19 | hub_model_id: Qwen2.5-1.5B-Open-R1-Distill 20 | hub_strategy: every_save 21 | learning_rate: 5.0e-05 22 | log_level: info 23 | logging_steps: 5 24 | logging_strategy: steps 25 | lr_scheduler_type: cosine_with_min_lr 26 | lr_scheduler_kwargs: 27 | min_lr_rate: 0.1 28 | packing: true 29 | max_length: 16384 30 | max_steps: -1 31 | num_train_epochs: 1 32 | output_dir: data/Qwen2.5-1.5B-Open-R1-Distill 33 | overwrite_output_dir: true 34 | per_device_eval_batch_size: 16 35 | per_device_train_batch_size: 16 36 | push_to_hub: true 37 | report_to: 38 | - 
wandb 39 | save_strategy: "steps" 40 | save_steps: 100 41 | save_total_limit: 1 42 | seed: 42 43 | use_liger: true 44 | warmup_ratio: 0.05 -------------------------------------------------------------------------------- /training/trl/.github/ISSUE_TEMPLATE/feature-request.yml: -------------------------------------------------------------------------------- 1 | name: "\U0001F680 Feature request" 2 | description: Submit a proposal/request for a new TRL feature 3 | labels: [ "Feature request" ] 4 | body: 5 | - type: textarea 6 | id: feature-request 7 | validations: 8 | required: true 9 | attributes: 10 | label: Feature request 11 | description: | 12 | A clear and concise description of the feature proposal. Please provide a link to the paper and code in case they exist. 13 | 14 | - type: textarea 15 | id: motivation 16 | validations: 17 | required: true 18 | attributes: 19 | label: Motivation 20 | description: | 21 | Please outline the motivation for the proposal. Is your feature request related to a problem? e.g., I'm always frustrated when [...]. If this is related to another GitHub issue, please link it here too. 22 | 23 | 24 | - type: textarea 25 | id: contribution 26 | validations: 27 | required: true 28 | attributes: 29 | label: Your contribution 30 | description: | 31 | Is there any way that you could help, e.g. by submitting a PR? Make sure to read the CONTRIBUTING.md [readme](https://github.com/huggingface/trl/blob/main/CONTRIBUTING.md) 32 | -------------------------------------------------------------------------------- /training/trl/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: test precommit common_tests slow_tests test_examples tests_gpu 2 | 3 | check_dirs := examples tests trl 4 | 5 | ACCELERATE_CONFIG_PATH = `pwd`/examples/accelerate_configs 6 | COMMAND_FILES_PATH = `pwd`/commands 7 | 8 | test: 9 | python -m pytest -n auto --dist=loadfile -s -v --reruns 5 --reruns-delay 1 --only-rerun '(OSError|Timeout|HTTPError.*502|HTTPError.*504|not less than or equal to 0.01)' ./tests/ 10 | 11 | precommit: 12 | python scripts/add_copyrights.py 13 | pre-commit run --all-files 14 | 15 | tests_gpu: 16 | python -m pytest tests/test_* $(if $(IS_GITHUB_CI),--report-log "common_tests.log",) 17 | 18 | slow_tests: 19 | python -m pytest tests/slow/test_* $(if $(IS_GITHUB_CI),--report-log "slow_tests.log",) 20 | 21 | test_examples: 22 | touch temp_results_sft_tests.txt 23 | for file in $(ACCELERATE_CONFIG_PATH)/*.yaml; do \ 24 | TRL_ACCELERATE_CONFIG=$${file} bash $(COMMAND_FILES_PATH)/run_sft.sh; \ 25 | echo $$?','$${file} >> temp_results_sft_tests.txt; \ 26 | done 27 | 28 | touch temp_results_dpo_tests.txt 29 | for file in $(ACCELERATE_CONFIG_PATH)/*.yaml; do \ 30 | TRL_ACCELERATE_CONFIG=$${file} bash $(COMMAND_FILES_PATH)/run_dpo.sh; \ 31 | echo $$?','$${file} >> temp_results_dpo_tests.txt; \ 32 | done 33 | -------------------------------------------------------------------------------- /training/trl/CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | title: 'TRL: Transformer Reinforcement Learning' 3 | message: >- 4 | If you use this software, please cite it using the 5 | metadata from this file.
6 | type: software 7 | authors: 8 | - given-names: Leandro 9 | family-names: von Werra 10 | - given-names: Younes 11 | family-names: Belkada 12 | - given-names: Lewis 13 | family-names: Tunstall 14 | - given-names: Edward 15 | family-names: Beeching 16 | - given-names: Tristan 17 | family-names: Thrush 18 | - given-names: Nathan 19 | family-names: Lambert 20 | - given-names: Shengyi 21 | family-names: Huang 22 | - given-names: Kashif 23 | family-names: Rasul 24 | - given-names: Quentin 25 | family-names: Gallouédec 26 | repository-code: 'https://github.com/huggingface/trl' 27 | abstract: "With trl you can train transformer language models with Proximal Policy Optimization (PPO). The library is built on top of the transformers library by \U0001F917 Hugging Face. Therefore, pre-trained language models can be directly loaded via transformers. At this point, most decoder and encoder-decoder architectures are supported." 28 | keywords: 29 | - rlhf 30 | - deep-learning 31 | - pytorch 32 | - transformers 33 | license: Apache-2.0 34 | version: 0.16 35 | -------------------------------------------------------------------------------- /training/open-r1/recipes/OlympicCoder-7B/sft/config_v00.00.yaml: -------------------------------------------------------------------------------- 1 | # Config for 1 node of 8 H100s with DeepSpeed ZeRO-3 2 | # Model arguments 3 | model_name_or_path: Qwen/Qwen2.5-Coder-7B-Instruct 4 | model_revision: main 5 | torch_dtype: bfloat16 6 | attn_implementation: flash_attention_2 7 | 8 | # Data training arguments 9 | dataset_name: open-r1/codeforces-cots 10 | dataset_config: solutions_decontaminated 11 | dataset_num_proc: 48 12 | 13 | # SFT trainer config 14 | bf16: true 15 | do_eval: false 16 | eval_strategy: 'no' 17 | gradient_accumulation_steps: 8 18 | gradient_checkpointing: true 19 | gradient_checkpointing_kwargs: 20 | use_reentrant: false 21 | hub_model_id: open-r1/OlympicCoder-7B 22 | hub_strategy: every_save 23 | learning_rate: 1.0e-05 24 | log_level: info 25 | logging_steps: 1 26 | logging_strategy: steps 27 | lr_scheduler_type: cosine_with_min_lr 28 | lr_scheduler_kwargs: 29 | min_lr_rate: 0.1 30 | packing: false 31 | max_grad_norm: 0.2 32 | max_length: 32768 33 | max_steps: -1 34 | num_train_epochs: 10 35 | output_dir: data/OlympicCoder-7B 36 | overwrite_output_dir: true 37 | per_device_eval_batch_size: 1 38 | per_device_train_batch_size: 2 39 | push_to_hub: true 40 | report_to: 41 | - wandb 42 | save_strategy: epoch 43 | save_total_limit: 1 44 | seed: 42 45 | use_liger: true 46 | warmup_ratio: 0.03 -------------------------------------------------------------------------------- /training/open-r1/scripts/get_tensor_parallel_size.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from transformers import AutoConfig 3 | from math import gcd 4 | 5 | def get_tensor_parallel_size(model_name: str, revision: str = None, default_tp: int = 8) -> int: 6 | try: 7 | config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=True) 8 | num_heads = getattr(config, 'num_attention_heads', None) 9 | 10 | if num_heads is not None and num_heads % default_tp != 0: 11 | tp = gcd(num_heads, default_tp) 12 | return max(tp, 1) 13 | else: 14 | return default_tp 15 | except Exception as e: 16 | print(f"Warning: Failed to fetch config for {model_name}@{revision}: {e}") 17 | return default_tp 18 | 19 | if __name__ == "__main__": 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument("--model_name", 
type=str, required=True, help="Hugging Face model name or path") 22 | parser.add_argument("--revision", type=str, default=None, help="Model revision if applicable") 23 | parser.add_argument("--default_tp", type=int, default=8, help="Default TP size (usually GPUs per node)") 24 | 25 | args = parser.parse_args() 26 | 27 | tp = get_tensor_parallel_size(args.model_name, args.revision, args.default_tp) 28 | print(tp) 29 | -------------------------------------------------------------------------------- /annotated_dataset/strlen_analysis.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | from datasets import load_dataset 3 | from tqdm import tqdm 4 | 5 | # Load the dataset 6 | ds = load_dataset("open-r1/OpenR1-Math-220k", "default") 7 | train_ds = ds['train'] 8 | 9 | # Collect data 10 | generation_lengths = [] 11 | generation_ids = [] 12 | gen_id = 0 13 | for item in tqdm(train_ds): 14 | for gen in item['generations']: 15 | generation_lengths.append(len(gen)) 16 | generation_ids.append(gen_id) 17 | gen_id += 1 18 | 19 | # Plot: Generation ID vs. String Length 20 | plt.figure(figsize=(10, 6), dpi=600) 21 | plt.plot(generation_ids, generation_lengths, linewidth=0.5) 22 | plt.xlabel("Generation ID") 23 | plt.ylabel("String Length") 24 | plt.title("Generation ID vs. String Length") 25 | plt.tight_layout() 26 | plt.savefig("genid_vs_strlen.png") 27 | plt.close() 28 | 29 | # Plot: Histogram of String Lengths (bin size 500) 30 | plt.figure(figsize=(10, 6), dpi=600) 31 | bins = range(0, max(generation_lengths) + 500, 500) 32 | plt.hist(generation_lengths, bins=bins, edgecolor='black') 33 | plt.xlabel("String Length") 34 | plt.ylabel("Frequency") 35 | plt.title("Histogram of Generation String Lengths") 36 | plt.tight_layout() 37 | plt.savefig("strlen_hist.png") 38 | plt.close() 39 | 40 | print("Plots saved as 'genid_vs_strlen.png' and 'strlen_hist.png'") 41 | -------------------------------------------------------------------------------- /training/trl/.github/ISSUE_TEMPLATE/new-trainer-addition.yml: -------------------------------------------------------------------------------- 1 | name: "\U0001F31F New trainer addition" 2 | description: Submit a proposal/request to implement a new trainer for a post-training method 3 | labels: [ "New trainer" ] 4 | 5 | body: 6 | - type: textarea 7 | id: description-request 8 | validations: 9 | required: true 10 | attributes: 11 | label: Method description 12 | description: | 13 | Put any and all important information relative to the method 14 | 15 | - type: checkboxes 16 | id: information-tasks 17 | attributes: 18 | label: Open source status 19 | description: | 20 | Please note that if the method implementation isn't available or model weights with training datasets aren't available, we are less likely to implement it in `trl`. 21 | options: 22 | - label: "The method implementation is available" 23 | - label: "The model weights are available" 24 | - label: "The training datasets are available" 25 | 26 | - type: textarea 27 | id: additional-info 28 | attributes: 29 | label: Provide useful links for the implementation 30 | description: | 31 | Please provide information regarding the implementation, the weights, and the authors. 32 | Please mention the authors by @gh-username if you're aware of their usernames.
33 | -------------------------------------------------------------------------------- /training/open-r1/recipes/Mistral-Small-24B-Instruct-2501/sft/config_openr1_math.yaml: -------------------------------------------------------------------------------- 1 | # To start the training, run the following command: 2 | # sbatch -N 4 --job-name=mistral_sft slurm/train.slurm Mistral-Small-24B-Instruct-2501 sft numina zero3 3 | 4 | model_name_or_path: mistralai/Mistral-Small-24B-Instruct-2501 5 | model_revision: main 6 | torch_dtype: bfloat16 7 | attn_implementation: flash_attention_2 8 | 9 | # Data training arguments 10 | # dataset_name: yentinglin/s1K-1.1-trl-format 11 | dataset_name: yentinglin/OpenR1-Math-220k-trl-format 12 | preprocessing_num_workers: 8 13 | 14 | # SFT trainer config 15 | bf16: true 16 | do_eval: true 17 | eval_strategy: 'no' 18 | gradient_accumulation_steps: 4 19 | gradient_checkpointing: true 20 | gradient_checkpointing_kwargs: 21 | use_reentrant: false 22 | hub_model_id: Mistral-Small-24B-Instruct-2501-Open-R1-Distill 23 | hub_strategy: every_save 24 | learning_rate: 2.0e-05 25 | log_level: info 26 | logging_steps: 1 27 | logging_strategy: steps 28 | lr_scheduler_type: cosine 29 | packing: true 30 | max_length: 32768 31 | max_steps: -1 32 | num_train_epochs: 5 33 | output_dir: data/Mistral-Small-24B-Instruct-2501-Open-R1-Distill 34 | overwrite_output_dir: true 35 | per_device_eval_batch_size: 1 36 | per_device_train_batch_size: 1 37 | push_to_hub: true 38 | report_to: 39 | - wandb 40 | save_strategy: epoch 41 | seed: 42 42 | warmup_ratio: 0.1 43 | -------------------------------------------------------------------------------- /training/open-r1/slurm/serve_router.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=r1-router 3 | #SBATCH --partition=hopper-cpu 4 | #SBATCH --qos=high 5 | #SBATCH --nodes=1 6 | #SBATCH --cpus-per-task=8 7 | #SBATCH --mem-per-cpu=1875m 8 | #SBATCH --output=./logs/%x_%j_%n.out 9 | #SBATCH --error=./logs/%x_%j_%n.err 10 | #SBATCH --time=30-00:00:00 11 | #SBATCH --requeue 12 | 13 | set -exuo pipefail 14 | 15 | # TODO: Adjust these variables to your cluster configuration 16 | CONDA_ENV="sglang124" 17 | ROUTER_PORT=39876 18 | 19 | trap 'scontrol requeue ${SLURM_JOB_ID}; exit 15' SIGUSR1 20 | 21 | while getopts "e:h" opt; do 22 | case $opt in 23 | e) CONDA_ENV="$OPTARG" ;; 24 | h|?) echo "Usage: sbatch $0 [-e CONDA_ENV]"; exit 1 ;; 25 | esac 26 | done 27 | 28 | # TODO: Environment setup, adjust to your cluster configuration 29 | source ~/.bashrc 30 | source "$CONDA_PREFIX/etc/profile.d/conda.sh" 31 | conda activate "$CONDA_ENV" || { echo "Failed to activate conda env $CONDA_ENV"; exit 1; } 32 | 33 | python -m sglang_router.launch_router \ 34 | --port "$ROUTER_PORT" \ 35 | --host 0.0.0.0 \ 36 | --worker-startup-timeout-secs 300 37 | 38 | # Keep the job running with health checks 39 | while true; do 40 | if !
curl -s -o /dev/null "http://localhost:$ROUTER_PORT/health"; then 41 | echo "Error: Router health check failed" 42 | exit 1 43 | fi 44 | sleep 300 45 | done -------------------------------------------------------------------------------- /training/open-r1/recipes/OpenR1-Qwen-7B/sft/config.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | # You need to download the model and manually change the rope to 300k and max_position_embeddings to 32768 3 | # the config file should match https://huggingface.co/open-r1/OpenR1-Qwen-7B/blob/main/config.json 4 | model_name_or_path: Qwen/Qwen2.5-Math-7B-Instruct 5 | model_revision: main 6 | torch_dtype: bfloat16 7 | attn_implementation: sdpa 8 | 9 | # Data training arguments 10 | dataset_name: open-r1/OpenR1-Math-220k 11 | dataset_num_proc: 48 12 | 13 | #SFT hyperparam 14 | max_length: 32768 15 | weight_decay: 0.0001 16 | optim: adamw_torch 17 | lr_scheduler_type: linear 18 | warmup_ratio: 0.1 19 | learning_rate: 5.0e-05 20 | gradient_accumulation_steps: 2 21 | per_device_eval_batch_size: 1 22 | per_device_train_batch_size: 1 23 | 24 | # SFT trainer config 25 | max_steps: -1 26 | num_train_epochs: 3 27 | bf16: true 28 | do_eval: false 29 | use_liger_kernel: true 30 | eval_strategy: 'no' 31 | gradient_checkpointing: true 32 | gradient_checkpointing_kwargs: 33 | use_reentrant: false 34 | hub_model_id: OpenR1-Qwen-7B-SFT 35 | hub_strategy: every_save 36 | log_level: info 37 | logging_steps: 5 38 | logging_strategy: steps 39 | packing: true 40 | output_dir: data/OpenR1-Qwen-7B-SFT 41 | overwrite_output_dir: true 42 | push_to_hub: true 43 | report_to: 44 | - wandb 45 | save_strategy: "steps" 46 | save_steps: 500 47 | save_total_limit: 1 48 | seed: 42 -------------------------------------------------------------------------------- /modes/speculative_decoding.py: -------------------------------------------------------------------------------- 1 | def run_speculative_decoding_flow( 2 | question: str, 3 | big_model: str, 4 | big_model_port: int, 5 | generate_text_vllm, 6 | max_tokens: int, 7 | temperature: float, 8 | test_logging: bool = False, 9 | ): 10 | # NOTE: calls V0 server, since V1 does not have specdec 11 | resp_json, latency, metric = generate_text_vllm( 12 | question, 13 | port=big_model_port, 14 | temperature=temperature, 15 | max_tokens=max_tokens, 16 | model=big_model, 17 | speculative_decoding=True # NOTE: custom parameter in generate_text_vllm 18 | ) 19 | 20 | usage = resp_json.get("usage", {}) 21 | final_reply = resp_json["choices"][0]["text"] 22 | 23 | usage_data = [{ 24 | "Model": big_model, 25 | "ThinkIter": "spec_decoding", 26 | "DraftVersion": 0, 27 | "PromptTokens": usage.get("prompt_tokens", 0), 28 | "CompletionTokens": usage.get("completion_tokens", 0), 29 | "AcceptedTokens": metric["accepted_tokens"], # NOTE: these values are wrong, as they are fetched directly from /metrics.
30 | "DraftTokens": metric["draft_tokens"], 31 | "EmittedTokens": metric["emitted_tokens"], 32 | "AcceptanceRate": metric["acceptance_rate"], 33 | "Efficiency": metric["efficiency"], 34 | "Latency": latency, 35 | }] 36 | 37 | return final_reply, usage_data -------------------------------------------------------------------------------- /training/open-r1/recipes/SmolLM2-1.7B/sft/config.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | # You can download the model and manually change the rope to 300k/500k and max_position_embeddings to 32768 3 | model_name_or_path: HuggingFaceTB/SmolLM2-1.7B 4 | model_revision: main 5 | torch_dtype: bfloat16 6 | attn_implementation: sdpa 7 | 8 | # Data training arguments 9 | dataset_name: open-r1/OpenR1-Math-220k 10 | dataset_num_proc: 48 11 | 12 | #SFT hyperparam 13 | max_length: 8192 # You can set this to 32768 if you change the rope, but you need to change the config.json file 14 | weight_decay: 0.0001 15 | optim: adamw_torch 16 | lr_scheduler_type: linear 17 | warmup_ratio: 0.1 18 | learning_rate: 5.0e-05 19 | gradient_accumulation_steps: 2 20 | per_device_eval_batch_size: 4 21 | per_device_train_batch_size: 4 # Change this depending on the context length of the model to keep a 500M GBS. 22 | 23 | # SFT trainer config 24 | max_steps: -1 25 | num_train_epochs: 3 26 | bf16: true 27 | do_eval: false 28 | eval_strategy: 'no' 29 | gradient_checkpointing: true 30 | gradient_checkpointing_kwargs: 31 | use_reentrant: false 32 | hub_model_id: OpenR1-Qwen-7B-SFT 33 | hub_strategy: every_save 34 | log_level: info 35 | logging_steps: 5 36 | logging_strategy: steps 37 | packing: true 38 | output_dir: data/OpenR1-Qwen-7B-SFT 39 | overwrite_output_dir: true 40 | push_to_hub: true 41 | report_to: 42 | - wandb 43 | save_strategy: "steps" 44 | save_steps: 500 45 | save_total_limit: 1 46 | seed: 42 47 | -------------------------------------------------------------------------------- /training/open-r1/recipes/SmolLM2-1.7B-Instruct/sft/config.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | # You can download the model and manually change the rope to 300k/500k and max_position_embeddings to 32768 3 | model_name_or_path: HuggingFaceTB/SmolLM2-1.7B-Instruct 4 | model_revision: main 5 | torch_dtype: bfloat16 6 | attn_implementation: sdpa 7 | 8 | # Data training arguments 9 | dataset_name: open-r1/OpenR1-Math-220k 10 | dataset_num_proc: 48 11 | 12 | #SFT hyperparam 13 | max_length: 8192 # You can set this to 32768 if you change the rope, but you need to change the config.json file 14 | weight_decay: 0.0001 15 | optim: adamw_torch 16 | lr_scheduler_type: linear 17 | warmup_ratio: 0.1 18 | learning_rate: 5.0e-05 19 | gradient_accumulation_steps: 2 20 | per_device_eval_batch_size: 4 21 | per_device_train_batch_size: 4 # Change this depending on the context length of the model to keep a 500M GBS. 
22 | 23 | # SFT trainer config 24 | max_steps: -1 25 | num_train_epochs: 3 26 | bf16: true 27 | do_eval: false 28 | eval_strategy: 'no' 29 | gradient_checkpointing: true 30 | gradient_checkpointing_kwargs: 31 | use_reentrant: false 32 | hub_model_id: OpenR1-Qwen-7B-SFT 33 | hub_strategy: every_save 34 | log_level: info 35 | logging_steps: 5 36 | logging_strategy: steps 37 | packing: true 38 | output_dir: data/OpenR1-Qwen-7B-SFT 39 | overwrite_output_dir: true 40 | push_to_hub: true 41 | report_to: 42 | - wandb 43 | save_strategy: "steps" 44 | save_steps: 500 45 | save_total_limit: 1 46 | seed: 42 47 | -------------------------------------------------------------------------------- /training/open-r1/recipes/OlympicCoder-32B/sft/config_v00.00.yaml: -------------------------------------------------------------------------------- 1 | # Config for 16 nodes of 8 H100s with FSDP1 2 | # Model arguments 3 | model_name_or_path: Qwen/Qwen2.5-Coder-32B-Instruct 4 | model_revision: main 5 | torch_dtype: bfloat16 6 | attn_implementation: flash_attention_2 7 | 8 | # Data training arguments 9 | dataset_name: open-r1/codeforces-cots 10 | dataset_config: solutions_decontaminated 11 | dataset_num_proc: 12 12 | 13 | # SFT trainer config 14 | bf16: true 15 | do_eval: false 16 | eval_strategy: 'no' 17 | gradient_accumulation_steps: 1 18 | gradient_checkpointing: true 19 | gradient_checkpointing_kwargs: 20 | use_reentrant: false 21 | hub_always_push: true 22 | hub_model_id: OlympicCoder-32B 23 | hub_strategy: every_save 24 | learning_rate: 4.0e-05 25 | log_level: info 26 | logging_steps: 1 27 | logging_strategy: steps 28 | lr_scheduler_type: cosine_with_min_lr 29 | lr_scheduler_kwargs: 30 | min_lr_rate: 0.1 31 | packing: false 32 | max_grad_norm: 0.2 33 | max_length: 22528 # we were unable to train at 32k due to OOM. See https://github.com/huggingface/transformers/issues/35983 for context parallelism support. 34 | max_steps: -1 35 | num_train_epochs: 10 36 | optim: paged_adamw_8bit 37 | output_dir: data/OlympicCoder-32B 38 | overwrite_output_dir: true 39 | per_device_eval_batch_size: 1 40 | per_device_train_batch_size: 1 41 | push_to_hub: true 42 | report_to: 43 | - wandb 44 | save_only_model: true # needed to bypass FSDP errors with saving paged optimizers 45 | save_strategy: epoch 46 | save_total_limit: 1 47 | seed: 42 48 | use_liger: false # fails on multi-node 49 | warmup_ratio: 0.03 -------------------------------------------------------------------------------- /training/trl/.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | # What does this PR do? 2 | 3 | 12 | 13 | 14 | 15 | Fixes # (issue) 16 | 17 | 18 | ## Before submitting 19 | - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). 20 | - [ ] Did you read the [contributor guideline](https://github.com/huggingface/trl/blob/main/CONTRIBUTING.md#create-a-pull-request), 21 | Pull Request section? 22 | - [ ] Was this discussed/approved via a GitHub issue? Please add a link 23 | to it if that's the case. 24 | - [ ] Did you make sure to update the documentation with your changes? Here are the 25 | [documentation guidelines](https://github.com/huggingface/trl/tree/main/docs). 26 | - [ ] Did you write any new necessary tests? 27 | 28 | 29 | ## Who can review? 30 | 31 | Anyone in the community is free to review the PR once the tests have passed. Feel free to tag 32 | members/contributors who may be interested in your PR. 
-------------------------------------------------------------------------------- /lm_eval_files/aime/aime_2024_agg8.yaml: -------------------------------------------------------------------------------- 1 | tag: 2 | - math_word_problems 3 | task: aime_2024_agg8 4 | dataset_path: Maxwell-Jia/AIME_2024 5 | dataset_name: default 6 | process_docs: !function utils.process_docs_aime_2024 7 | output_type: generate_until 8 | test_split: train 9 | doc_to_text: !function utils.doc_to_text_aime_2024 10 | doc_to_target: answer 11 | process_results: !function utils.process_results 12 | generation_kwargs: 13 | until: [] 14 | do_sample: false 15 | temperature: 0 16 | max_gen_toks: 4096 # https://github.com/openai/simple-evals/blob/c0dba4c7bfbc17f786aec7bd7c3585a36ad81f23/sampler/chat_completion_sampler.py#L27 17 | repeats: 8 18 | filter_list: 19 | - name: "all" # Will do coverage, majority, and take_first_k 20 | filter: 21 | - function: "take_first_k" 22 | k: 8 23 | metric_list: 24 | - metric: exact_match 25 | aggregation: mean 26 | higher_is_better: true 27 | - metric: cov@8 28 | aggregation: mean 29 | higher_is_better: true 30 | - metric: cov@4 31 | aggregation: mean 32 | higher_is_better: true 33 | - metric: cov@2 34 | aggregation: mean 35 | higher_is_better: true 36 | - metric: maj@8 37 | aggregation: mean 38 | higher_is_better: true 39 | - metric: maj@4 40 | aggregation: mean 41 | higher_is_better: true 42 | - metric: maj@2 43 | aggregation: mean 44 | higher_is_better: true 45 | - metric: extracted_answers 46 | aggregation: bypass 47 | higher_is_better: true 48 | - metric: exact_matches 49 | aggregation: bypass 50 | higher_is_better: true 51 | metadata: 52 | version: 1.0 -------------------------------------------------------------------------------- /training/open-r1/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: Qwen/Qwen2.5-1.5B-Instruct 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: open-r1/OpenR1-Math-220k 9 | dataset_prompt_column: problem 10 | system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>" 11 | 12 | # GRPO trainer config 13 | bf16: true 14 | use_vllm: true 15 | do_eval: false 16 | gradient_accumulation_steps: 4 17 | gradient_checkpointing: true 18 | gradient_checkpointing_kwargs: 19 | use_reentrant: false 20 | hub_model_id: Qwen2.5-1.5B-Open-R1-GRPO 21 | hub_strategy: every_save 22 | learning_rate: 2.0e-05 23 | log_completions: true 24 | log_level: info 25 | logging_first_step: true 26 | logging_steps: 1 27 | logging_strategy: steps 28 | lr_scheduler_type: cosine 29 | max_prompt_length: 512 30 | max_completion_length: 1024 31 | max_steps: -1 32 | num_generations: 16 33 | num_train_epochs: 1 34 | output_dir: data/Qwen2.5-1.5B-Open-R1-GRPO 35 | overwrite_output_dir: true 36 | per_device_eval_batch_size: 16 37 | per_device_train_batch_size: 16 38 | push_to_hub: true 39 | report_to: 40 | - wandb 41 | reward_funcs: 42 | - accuracy 43 | - format 44 | - tag_count 45 | reward_weights: 46 | - 1.0 47 | - 1.0 48 | - 1.0 49 | save_strategy: "epoch" 50 | save_total_limit: 1 51 | seed: 42 52 | warmup_ratio: 0.1 53 | -------------------------------------------------------------------------------- /training/open-r1/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: Qwen/Qwen2.5-Math-7B 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: DigitalLearningGmbH/MATH-lighteval 9 | dataset_config: default 10 | dataset_prompt_column: problem 11 | system_prompt: "You are a helpful AI Assistant, designed to provide well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within <think> and </think> tags." 12 | 13 | # GRPO trainer config 14 | bf16: true 15 | use_vllm: true 16 | do_eval: true 17 | eval_strategy: steps 18 | eval_steps: 100 19 | gradient_accumulation_steps: 8 20 | gradient_checkpointing: true 21 | gradient_checkpointing_kwargs: 22 | use_reentrant: false 23 | hub_model_id: Qwen-2.5-7B-Simple-RL 24 | hub_strategy: every_save 25 | learning_rate: 3.0e-06 26 | log_completions: true 27 | log_level: info 28 | logging_first_step: true 29 | logging_steps: 5 30 | logging_strategy: steps 31 | lr_scheduler_type: cosine 32 | max_prompt_length: 512 33 | max_completion_length: 1024 34 | max_steps: -1 35 | num_generations: 7 36 | num_train_epochs: 1 37 | output_dir: data/Qwen-2.5-7B-Simple-RL 38 | overwrite_output_dir: true 39 | per_device_eval_batch_size: 16 40 | per_device_train_batch_size: 16 41 | push_to_hub: true 42 | report_to: 43 | - wandb 44 | reward_funcs: 45 | - accuracy 46 | - format 47 | reward_weights: 48 | - 1.0 49 | - 1.0 50 | save_strategy: "no" 51 | seed: 42 52 | warmup_ratio: 0.1 53 | -------------------------------------------------------------------------------- /training/trl/docs/source/iterative_sft_trainer.md: -------------------------------------------------------------------------------- 1 | # Iterative Trainer 2 | 3 | [![](https://img.shields.io/badge/All_models-Iterative_SFT-blue)](https://huggingface.co/models?other=iterative-sft,trl) 4 | 5 | 6 | Iterative fine-tuning is a training method that enables performing custom actions (generation and filtering, for example) between optimization steps.
In TRL we provide an easy-to-use API to fine-tune your models in an iterative way in just a few lines of code. 7 | 8 | ## Usage 9 | 10 | To get started quickly, instantiate a model and a tokenizer. 11 | 12 | ```python 13 | from transformers import AutoModelForCausalLM, AutoTokenizer 14 | from trl import IterativeSFTTrainer 15 | 16 | model = AutoModelForCausalLM.from_pretrained(model_name) 17 | tokenizer = AutoTokenizer.from_pretrained(model_name) 18 | if tokenizer.pad_token is None: 19 | tokenizer.pad_token = tokenizer.eos_token 20 | 21 | trainer = IterativeSFTTrainer( 22 | model, 23 | tokenizer 24 | ) 25 | 26 | ``` 27 | 28 | You have the choice to either provide a list of strings or a list of tensors to the `step` function. 29 | 30 | #### Using a list of tensors as input: 31 | 32 | ```python 33 | 34 | inputs = { 35 | "input_ids": input_ids, 36 | "attention_mask": attention_mask 37 | } 38 | 39 | trainer.step(**inputs) 40 | 41 | ``` 42 | 43 | #### Using a list of strings as input: 44 | 45 | ```python 46 | 47 | inputs = { 48 | "texts": texts 49 | } 50 | 51 | trainer.step(**inputs) 52 | 53 | ``` 54 | 55 | For causal language models, labels will automatically be created from `input_ids` or from `texts`. When using sequence-to-sequence models you will have to provide your own `labels` or `text_labels`. 56 | 57 | ## IterativeTrainer 58 | 59 | [[autodoc]] IterativeSFTTrainer 60 | -------------------------------------------------------------------------------- /training/open-r1/src/open_r1/utils/model_utils.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer, PreTrainedTokenizer 2 | 3 | from trl import ModelConfig 4 | 5 | from ..configs import GRPOConfig, SFTConfig 6 | 7 | 8 | DEFAULT_CHAT_TEMPLATE = "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}" 9 | 10 | 11 | def get_tokenizer( 12 | model_args: ModelConfig, training_args: SFTConfig | GRPOConfig, auto_set_chat_template: bool = True 13 | ) -> PreTrainedTokenizer: 14 | """Get the tokenizer for the model.""" 15 | tokenizer = AutoTokenizer.from_pretrained( 16 | model_args.model_name_or_path, 17 | revision=model_args.model_revision, 18 | trust_remote_code=model_args.trust_remote_code, 19 | ) 20 | # Disabled in the interest of time.
21 | # special_tokens_dict = {"additional_special_tokens": ["", ""]} 22 | # num_added = tokenizer.add_special_tokens(special_tokens_dict) 23 | # if num_added > 0: 24 | # print(f"Added {num_added} special tokens for orchestration.") 25 | 26 | if training_args.chat_template is not None: 27 | tokenizer.chat_template = training_args.chat_template 28 | elif auto_set_chat_template and tokenizer.get_chat_template() is None: 29 | tokenizer.chat_template = DEFAULT_CHAT_TEMPLATE 30 | 31 | return tokenizer 32 | -------------------------------------------------------------------------------- /training/open-r1/recipes/Qwen2.5-7B-Instruct/grpo/config_demo.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: Qwen/Qwen2.5-7B-Instruct 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: open-r1/OpenR1-Math-cn_k12-86k 9 | dataset_prompt_column: problem 10 | system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>" 11 | 12 | # GRPO trainer config 13 | beta: 0.001 14 | bf16: true 15 | do_eval: false 16 | eval_strategy: "no" 17 | use_vllm: true 18 | gradient_accumulation_steps: 16 19 | gradient_checkpointing: true 20 | gradient_checkpointing_kwargs: 21 | use_reentrant: false 22 | hub_model_id: Qwen2.5-7B-Instruct-GRPO 23 | hub_strategy: every_save 24 | learning_rate: 1.0e-06 25 | log_completions: true 26 | log_level: info 27 | logging_first_step: true 28 | logging_steps: 1 29 | logging_strategy: steps 30 | lr_scheduler_type: constant_with_warmup 31 | max_grad_norm: 0.2 32 | max_prompt_length: 1024 33 | max_completion_length: 4096 34 | max_steps: -1 35 | num_generations: 16 36 | num_train_epochs: 1 37 | output_dir: data/Qwen2.5-7B-Instruct-GRPO 38 | overwrite_output_dir: true 39 | per_device_train_batch_size: 4 40 | push_to_hub: true 41 | report_to: 42 | - wandb 43 | reward_funcs: 44 | - accuracy 45 | - format 46 | reward_weights: 47 | - 1.0 48 | - 0.2 49 | save_strategy: "steps" 50 | save_steps: 0.1 51 | save_total_limit: 1 52 | seed: 42 53 | temperature: 0.7 54 | warmup_ratio: 0.1 -------------------------------------------------------------------------------- /training/trl/.github/workflows/tests_latest.yml: -------------------------------------------------------------------------------- 1 | name: Tests latest TRL release with dev dependencies 2 | 3 | on: 4 | schedule: 5 | - cron: '0 0 * * *' # Runs daily at midnight UTC 6 | 7 | workflow_dispatch: 8 | 9 | env: 10 | TQDM_DISABLE: 1 11 | CI_SLACK_CHANNEL: ${{ secrets.CI_PUSH_MAIN_CHANNEL }} 12 | 13 | jobs: 14 | tests: 15 | name: Tests latest TRL release with dev dependencies 16 | runs-on: 'ubuntu-latest' 17 | steps: 18 | - name: Git checkout 19 | uses: actions/checkout@v4 20 | with: { ref: v0.16-release } 21 | - name: Set up Python 3.12 22 | uses: actions/setup-python@v5 23 | with: 24 | python-version: '3.12' 25 | cache: "pip" 26 | cache-dependency-path: | 27 | setup.py 28 | requirements.txt 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | python -m pip install -U git+https://github.com/huggingface/accelerate.git 33 | python -m pip install -U git+https://github.com/huggingface/datasets.git 34 | python -m pip install -U
git+https://github.com/huggingface/transformers.git 35 | python -m pip install ".[dev]" 36 | - name: Test with pytest 37 | run: | 38 | make test 39 | - name: Post to Slack 40 | uses: huggingface/hf-workflows/.github/actions/post-slack@main 41 | with: 42 | slack_channel: ${{ env.CI_SLACK_CHANNEL }} 43 | title: Results of latest TRL with Python 3.12 on ubuntu-latest with dev dependencies 44 | status: ${{ job.status }} 45 | slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} 46 | -------------------------------------------------------------------------------- /training/trl/docs/source/deepspeed_integration.md: -------------------------------------------------------------------------------- 1 | # DeepSpeed Integration 2 | 3 | <Tip> 4 | 5 | Section under construction. Feel free to contribute! 6 | 7 | </Tip> 8 | 9 | TRL supports training with DeepSpeed, a library that implements advanced training optimization techniques. These include optimizer state partitioning, offloading, gradient partitioning, and more. 10 | 11 | DeepSpeed integrates the [Zero Redundancy Optimizer (ZeRO)](https://huggingface.co/papers/1910.02054), which allows scaling the model size in proportion to the number of devices with sustained high efficiency. 12 | 13 | ![ZeRO Stages](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/zero_stages.png) 14 | 15 | ## Installation 16 | 17 | To use DeepSpeed with TRL, install it using the following command: 18 | 19 | ```bash 20 | pip install deepspeed 21 | ``` 22 | 23 | ## Running Training Scripts with DeepSpeed 24 | 25 | No modifications to your training script are required. Simply run it with the DeepSpeed configuration file: 26 | 27 | ```bash 28 | accelerate launch --config_file <config_file> train.py 29 | ``` 30 | 31 | We provide ready-to-use DeepSpeed configuration files in the [`examples/accelerate_configs`](https://github.com/huggingface/trl/tree/main/examples/accelerate_configs) directory. For example, to run training with ZeRO Stage 2, use the following command: 32 | 33 | ```bash 34 | accelerate launch --config_file examples/accelerate_configs/deepspeed_zero2.yaml train.py 35 | ``` 36 | 37 | ## Additional Resources 38 | 39 | Consult the 🤗 Accelerate [documentation](https://huggingface.co/docs/accelerate/usage_guides/deepspeed) for more information about the DeepSpeed plugin. 40 | -------------------------------------------------------------------------------- /training/open-r1/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: Qwen/Qwen2.5-1.5B-Instruct 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: open-r1/verifiable-coding-problems-python 9 | dataset_prompt_column: problem_statement 10 | system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer.
Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>" 11 | 12 | # GRPO trainer config 13 | beta: 0.01 14 | bf16: true 15 | use_vllm: true 16 | do_eval: false 17 | gradient_accumulation_steps: 4 18 | gradient_checkpointing: true 19 | gradient_checkpointing_kwargs: 20 | use_reentrant: false 21 | hub_model_id: Qwen2.5-1.5B-Open-R1-Code-GRPO 22 | hub_strategy: every_save 23 | learning_rate: 5.0e-06 24 | log_completions: true 25 | log_level: info 26 | logging_first_step: true 27 | logging_steps: 1 28 | logging_strategy: steps 29 | lr_scheduler_type: cosine_with_min_lr 30 | lr_scheduler_kwargs: 31 | min_lr_rate: 0.1 32 | max_prompt_length: 1024 33 | max_completion_length: 2048 34 | max_steps: 500 35 | num_generations: 14 36 | num_train_epochs: 1 37 | output_dir: data/Qwen2.5-1.5B-Open-R1-Code-GRPO 38 | overwrite_output_dir: true 39 | per_device_train_batch_size: 16 40 | push_to_hub: true 41 | report_to: 42 | - wandb 43 | reward_funcs: 44 | - code 45 | - format 46 | reward_weights: 47 | - 1.0 48 | - 0.1 49 | save_strategy: "steps" 50 | save_steps: 50 51 | save_total_limit: 1 52 | seed: 42 53 | temperature: 1.0 54 | warmup_ratio: 0.03 -------------------------------------------------------------------------------- /annotated_dataset/hf_Dataset/proprocess.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset, DatasetDict, Dataset, load_from_disk 2 | from huggingface_hub import login 3 | import os 4 | 5 | our_dataset = load_from_disk("OpenR1_Math_SpeculativeReasoning") 6 | 7 | print("Length pre filtering:", len(our_dataset)) 8 | # First, filter out the examples you want to exclude 9 | filtered_dataset = our_dataset.filter( 10 | lambda example: ( 11 | example["annotated_generations"] and 12 | "Error processing Due To" not in example["annotated_generations"][0] and 13 | len(example["annotated_generations"][0]) > 1024 14 | ) 15 | ) 16 | 17 | min_len = min( 18 | len(example["annotated_generations"][0]) 19 | for example in filtered_dataset 20 | if example["annotated_generations"] 21 | ) 22 | shortest_examples = our_dataset.filter( 23 | lambda example: ( 24 | example["annotated_generations"] and 25 | len(example["annotated_generations"][0]) == min_len 26 | ) 27 | ) 28 | 29 | print("Example of shortest example:", shortest_examples['generations'][0][0]) 30 | print("Example of shortest example:", shortest_examples['annotated_generations'][0][0]) 31 | 32 | print("Length post filtering:", len(filtered_dataset)) 33 | def replace_with_annotated(example): 34 | if example["annotated_generations"]: 35 | example["messages"][1]["content"] = example["annotated_generations"][0].replace("<\\bigmodel>", "\n \n") 36 | # Apply the second replacement to the already-updated content so the first replacement is not discarded. 37 | example["messages"][1]["content"] = example["messages"][1]["content"].replace("", "\n \n") 38 | 39 | return example 40 | 41 | updated_dataset = filtered_dataset.map(replace_with_annotated) 42 | 43 | updated_dataset.push_to_hub("akhauriyash/OpenR1_Math_SpeculativeReasoning") -------------------------------------------------------------------------------- /training/trl/commands/run_sft.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This script runs an SFT example end-to-end on a tiny model using different possible configurations 3 | # but defaults to QLoRA + PEFT 4 | OUTPUT_DIR="test_sft/" 5 | MODEL_NAME="trl-internal-testing/tiny-Qwen2ForCausalLM-2.5" 6 | DATASET_NAME="stanfordnlp/imdb" 7 | MAX_STEPS=5 8 | BATCH_SIZE=2 9 | SEQ_LEN=128 10 | 11 | 12 | # Handle extra arguments in case
one passes accelerate configs. 13 | EXTRA_ACCELERATE_ARGS="" 14 | EXTRA_TRAINING_ARGS="""--use_peft \ 15 | --load_in_4bit 16 | """ 17 | 18 | # Set your number of GPUs here 19 | NUM_GPUS=2 20 | 21 | if [[ "${TRL_ACCELERATE_CONFIG}" == "" ]]; then 22 | EXTRA_ACCELERATE_ARGS="" 23 | else 24 | EXTRA_ACCELERATE_ARGS="--config_file $TRL_ACCELERATE_CONFIG" 25 | # For DeepSpeed configs we need to set the `--fp16` flag to comply with our configs exposed 26 | # on `examples/accelerate_configs`, since our runners do not support bf16 mixed precision training. 27 | if [[ $TRL_ACCELERATE_CONFIG == *"deepspeed"* ]]; then 28 | EXTRA_TRAINING_ARGS="--fp16" 29 | else 30 | echo "Keeping QLoRA + PEFT" 31 | fi 32 | fi 33 | 34 | 35 | CMD=""" 36 | accelerate launch $EXTRA_ACCELERATE_ARGS \ 37 | --num_processes $NUM_GPUS \ 38 | --mixed_precision 'fp16' \ 39 | `pwd`/trl/scripts/sft.py \ 40 | --model_name $MODEL_NAME \ 41 | --dataset_name $DATASET_NAME \ 42 | --output_dir $OUTPUT_DIR \ 43 | --max_steps $MAX_STEPS \ 44 | --per_device_train_batch_size $BATCH_SIZE \ 45 | --max_length $SEQ_LEN \ 46 | $EXTRA_TRAINING_ARGS 47 | """ 48 | 49 | echo "Starting program..." 50 | 51 | { # try 52 | echo $CMD 53 | eval "$CMD" 54 | } || { # catch 55 | # save log for exception 56 | echo "Operation Failed!" 57 | exit 1 58 | } 59 | exit 0 60 | -------------------------------------------------------------------------------- /training/trl/commands/run_dpo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This script runs a DPO
example end-to-end on a tiny model using different possible configurations 3 | # but defaults to QLoRA + PEFT 4 | OUTPUT_DIR="test_dpo/" 5 | MODEL_NAME="trl-internal-testing/tiny-Qwen2ForCausalLM-2.5" 6 | DATASET_NAME="trl-internal-testing/hh-rlhf-helpful-base-trl-style" 7 | MAX_STEPS=5 8 | BATCH_SIZE=2 9 | SEQ_LEN=128 10 | 11 | # Handle extra arguments in case one passes accelerate configs. 12 | EXTRA_ACCELERATE_ARGS="" 13 | EXTRA_TRAINING_ARGS="""--use_peft \ 14 | --load_in_4bit 15 | """ 16 | 17 | # Set your number of GPUs here 18 | NUM_GPUS=2 19 | 20 | if [[ "${TRL_ACCELERATE_CONFIG}" == "" ]]; then 21 | EXTRA_ACCELERATE_ARGS="" 22 | else 23 | EXTRA_ACCELERATE_ARGS="--config_file $TRL_ACCELERATE_CONFIG" 24 | # For DeepSpeed configs we need to set the `--fp16` flag to comply with our configs exposed 25 | # on `examples/accelerate_configs`, since our runners do not support bf16 mixed precision training. 26 | if [[ $TRL_ACCELERATE_CONFIG == *"deepspeed"* ]]; then 27 | EXTRA_TRAINING_ARGS="--fp16" 28 | else 29 | echo "Keeping QLoRA + PEFT" 30 | fi 31 | fi 32 | 33 | 34 | CMD=""" 35 | accelerate launch $EXTRA_ACCELERATE_ARGS \ 36 | --num_processes $NUM_GPUS \ 37 | --mixed_precision 'fp16' \ 38 | `pwd`/trl/scripts/dpo.py \ 39 | --model_name_or_path $MODEL_NAME \ 40 | --dataset_name $DATASET_NAME \ 41 | --output_dir $OUTPUT_DIR \ 42 | --max_steps $MAX_STEPS \ 43 | --per_device_train_batch_size $BATCH_SIZE \ 44 | --max_length $SEQ_LEN \ 45 | $EXTRA_TRAINING_ARGS 46 | """ 47 | 48 | echo "Starting program..." 49 | 50 | { # try 51 | echo $CMD 52 | eval "$CMD" 53 | } || { # catch 54 | # save log for exception 55 | echo "Operation Failed!" 56 | exit 1 57 | } 58 | exit 0 59 | -------------------------------------------------------------------------------- /training/trl/trl/trainer/xpo_config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from dataclasses import dataclass, field 16 | 17 | from trl.trainer.online_dpo_config import OnlineDPOConfig 18 | 19 | 20 | @dataclass 21 | class XPOConfig(OnlineDPOConfig): 22 | r""" 23 | Configuration class for the [`XPOTrainer`]. 24 | 25 | Subclass of [`OnlineDPOConfig`]; we can use all its arguments and add the following: 26 | 27 | Parameters: 28 | alpha (`float` or `list[float]`, *optional*, defaults to `1e-5`): 29 | Weight of the XPO loss term. If a list of floats is provided then the alpha is selected for each new epoch 30 | and the last alpha is used for the rest of the epochs. 31 | """ 32 | 33 | alpha: list[float] = field( 34 | default_factory=lambda: [1e-5], 35 | metadata={ 36 | "help": "Weight of the XPO loss term. If a list of floats is provided then the alpha is selected for each " 37 | "new epoch and the last alpha is used for the rest of the epochs."
38 | }, 39 | ) 40 | 41 | def __post_init__(self): 42 | super().__post_init__() 43 | if hasattr(self.alpha, "__len__") and len(self.alpha) == 1: 44 | self.alpha = self.alpha[0] 45 | -------------------------------------------------------------------------------- /lm_eval_files/aime/aime_2024_rebase.yaml: -------------------------------------------------------------------------------- 1 | tag: 2 | - math_word_problems 3 | task: aime_2024_rebase 4 | dataset_path: Maxwell-Jia/AIME_2024 5 | dataset_name: default 6 | process_docs: !function utils.process_docs_aime_2024 7 | output_type: generate_until 8 | test_split: train 9 | doc_to_text: !function utils.doc_to_text_aime_2024 10 | doc_to_target: answer 11 | process_results: !function utils.process_results 12 | generation_kwargs: 13 | until: [] 14 | do_sample: false 15 | temperature: 0 16 | max_gen_toks: 4096 # https://github.com/openai/simple-evals/blob/c0dba4c7bfbc17f786aec7bd7c3585a36ad81f23/sampler/chat_completion_sampler.py#L27 17 | metric_list: 18 | - metric: exact_match 19 | aggregation: mean 20 | higher_is_better: true 21 | - metric: cov@64 22 | aggregation: mean 23 | higher_is_better: true 24 | - metric: cov@32 25 | aggregation: mean 26 | higher_is_better: true 27 | - metric: cov@16 28 | aggregation: mean 29 | higher_is_better: true 30 | - metric: cov@8 31 | aggregation: mean 32 | higher_is_better: true 33 | - metric: cov@4 34 | aggregation: mean 35 | higher_is_better: true 36 | - metric: cov@2 37 | aggregation: mean 38 | higher_is_better: true 39 | - metric: maj@64 40 | aggregation: mean 41 | higher_is_better: true 42 | - metric: maj@32 43 | aggregation: mean 44 | higher_is_better: true 45 | - metric: maj@16 46 | aggregation: mean 47 | higher_is_better: true 48 | - metric: maj@8 49 | aggregation: mean 50 | higher_is_better: true 51 | - metric: maj@4 52 | aggregation: mean 53 | higher_is_better: true 54 | - metric: maj@2 55 | aggregation: mean 56 | higher_is_better: true 57 | - metric: extracted_answers 58 | aggregation: bypass 59 | higher_is_better: true 60 | - metric: exact_matches 61 | aggregation: bypass 62 | higher_is_better: true 63 | metadata: 64 | version: 1.0 -------------------------------------------------------------------------------- /training/open-r1/src/open_r1/utils/ioi/utils.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from functools import lru_cache 3 | from itertools import islice 4 | 5 | from datasets import load_dataset 6 | 7 | 8 | def add_includes(code: str, problem_id: str) -> str: 9 | """ 10 | Fix common compilation errors for IOI problems. 11 | """ 12 | if not code: 13 | return code 14 | # has most of the useful functions 15 | code_header = "#include \n" 16 | # include the problem header 17 | problem_header_include = f'#include "{problem_id}.h"' 18 | if problem_header_include not in code: 19 | code_header += problem_header_include + "\n" 20 | # use namespace std since models forget std:: often 21 | if "using namespace std;" not in code and "std::" not in code: 22 | code_header += "\nusing namespace std;\n\n" 23 | return code_header + code 24 | 25 | 26 | @lru_cache 27 | def load_ioi_tests_for_year(year: int) -> dict[str, dict[str, tuple[str, str]]]: 28 | """ 29 | Load IOI tests for a given year. 
30 | """ 31 | tests_dataset = load_dataset("open-r1/ioi-test-cases", name=f"{year}", split="train") 32 | test_cases = defaultdict(dict) 33 | for test_case in tests_dataset: 34 | test_cases[test_case["problem_id"]][test_case["test_name"]] = test_case["test_input"], test_case["test_output"] 35 | return test_cases 36 | 37 | 38 | def load_ioi_tests(year: int, problem_id: str) -> dict[str, tuple[str, str]]: 39 | """ 40 | Load IOI tests for a given year and problem id. 41 | """ 42 | return load_ioi_tests_for_year(year)[problem_id] 43 | 44 | 45 | def batched(iterable, n): 46 | "Batch data into lists of length n. The last batch may be shorter." 47 | # batched('ABCDEFG', 3) --> ABC DEF G 48 | if n < 1: 49 | return iterable 50 | it = iter(iterable) 51 | while batch := list(islice(it, n)): 52 | yield batch 53 | -------------------------------------------------------------------------------- /training/trl/tests/test_core.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import unittest 16 | 17 | import torch 18 | 19 | from trl.core import masked_mean, masked_var, masked_whiten 20 | 21 | 22 | class CoreTester(unittest.TestCase): 23 | """ 24 | A wrapper class for testing core utils functions 25 | """ 26 | 27 | def setUp(self): 28 | self.test_input = torch.Tensor([1, 2, 3, 4]) 29 | self.test_mask = torch.Tensor([0, 1, 1, 0]) 30 | self.test_input_unmasked = self.test_input[1:3] 31 | 32 | def test_masked_mean(self): 33 | self.assertEqual(torch.mean(self.test_input_unmasked), masked_mean(self.test_input, self.test_mask)) 34 | 35 | def test_masked_var(self): 36 | self.assertEqual(torch.var(self.test_input_unmasked), masked_var(self.test_input, self.test_mask)) 37 | 38 | def test_masked_whiten(self): 39 | def whiten(values: torch.Tensor) -> torch.Tensor: 40 | mean, var = torch.mean(values), torch.var(values) 41 | return (values - mean) * torch.rsqrt(var + 1e-8) 42 | 43 | whiten_unmasked = whiten(self.test_input_unmasked) 44 | whiten_masked = masked_whiten(self.test_input, self.test_mask)[1:3] 45 | diffs = (whiten_unmasked - whiten_masked).sum() 46 | self.assertLess(abs(diffs.item()), 0.00001) 47 | -------------------------------------------------------------------------------- /training/open-r1/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: style quality 2 | 3 | # make sure to test the local checkout in scripts and not the pre-installed one (don't use quotes!) 4 | export PYTHONPATH = src 5 | 6 | check_dirs := src tests 7 | 8 | 9 | # dev dependencies 10 | install: 11 | uv venv openr1 --python 3.11 && . 
openr1/bin/activate && uv pip install --upgrade pip 12 | uv pip install vllm==0.7.2 13 | uv pip install setuptools 14 | uv pip install flash-attn --no-build-isolation 15 | GIT_LFS_SKIP_SMUDGE=1 uv pip install -e ".[dev]" 16 | 17 | style: 18 | ruff format --line-length 119 --target-version py310 $(check_dirs) setup.py 19 | isort $(check_dirs) setup.py 20 | 21 | quality: 22 | ruff check --line-length 119 --target-version py310 $(check_dirs) setup.py 23 | isort --check-only $(check_dirs) setup.py 24 | flake8 --max-line-length 119 $(check_dirs) setup.py 25 | 26 | test: 27 | pytest -sv --ignore=tests/slow/ tests/ 28 | 29 | slow_test: 30 | pytest -sv -vv tests/slow/ 31 | 32 | # Evaluation 33 | 34 | evaluate: 35 | $(eval PARALLEL_ARGS := $(if $(PARALLEL),$(shell \ 36 | if [ "$(PARALLEL)" = "data" ]; then \ 37 | echo "data_parallel_size=$(NUM_GPUS)"; \ 38 | elif [ "$(PARALLEL)" = "tensor" ]; then \ 39 | echo "tensor_parallel_size=$(NUM_GPUS)"; \ 40 | fi \ 41 | ),)) 42 | $(if $(filter tensor,$(PARALLEL)),export VLLM_WORKER_MULTIPROC_METHOD=spawn &&,) \ 43 | MODEL_ARGS="pretrained=$(MODEL),dtype=bfloat16,$(PARALLEL_ARGS),max_model_length=32768,gpu_memory_utilization=0.8,generation_parameters={max_new_tokens:32768,temperature:0.6,top_p:0.95}" && \ 44 | if [ "$(TASK)" = "lcb" ]; then \ 45 | lighteval vllm $$MODEL_ARGS "extended|lcb:codegeneration|0|0" \ 46 | --use-chat-template \ 47 | --output-dir data/evals/$(MODEL); \ 48 | else \ 49 | lighteval vllm $$MODEL_ARGS "custom|$(TASK)|0|0" \ 50 | --custom-tasks src/open_r1/evaluate.py \ 51 | --use-chat-template \ 52 | --output-dir data/evals/$(MODEL); \ 53 | fi 54 | -------------------------------------------------------------------------------- /lm_eval_files/aime/aime25_nofigures_agg64.yaml: -------------------------------------------------------------------------------- 1 | tag: 2 | - math_word_problems 3 | task: aime25_nofigures_agg64 4 | dataset_path: TIGER-Lab/AIME25 5 | dataset_name: default 6 | process_docs: !function utils.process_docs 7 | output_type: generate_until 8 | test_split: train 9 | doc_to_text: !function utils.doc_to_text 10 | doc_to_target: answer 11 | process_results: !function utils.process_results 12 | generation_kwargs: 13 | until: [] 14 | do_sample: false 15 | temperature: 1 16 | max_gen_toks: 32768 17 | repeats: 64 18 | filter_list: 19 | - name: "all" # Will do coverage, majority, and take_first_k 20 | filter: 21 | - function: "take_first_k" 22 | k: 64 23 | metric_list: 24 | - metric: exact_match 25 | aggregation: mean 26 | higher_is_better: true 27 | - metric: cov@64 28 | aggregation: mean 29 | higher_is_better: true 30 | - metric: cov@32 31 | aggregation: mean 32 | higher_is_better: true 33 | - metric: cov@16 34 | aggregation: mean 35 | higher_is_better: true 36 | - metric: cov@8 37 | aggregation: mean 38 | higher_is_better: true 39 | - metric: cov@4 40 | aggregation: mean 41 | higher_is_better: true 42 | - metric: cov@2 43 | aggregation: mean 44 | higher_is_better: true 45 | - metric: maj@64 46 | aggregation: mean 47 | higher_is_better: true 48 | - metric: maj@32 49 | aggregation: mean 50 | higher_is_better: true 51 | - metric: maj@16 52 | aggregation: mean 53 | higher_is_better: true 54 | - metric: maj@8 55 | aggregation: mean 56 | higher_is_better: true 57 | - metric: maj@4 58 | aggregation: mean 59 | higher_is_better: true 60 | - metric: maj@2 61 | aggregation: mean 62 | higher_is_better: true 63 | - metric: extracted_answers 64 | aggregation: bypass 65 | higher_is_better: true 66 | - metric: exact_matches 67 | aggregation: 
bypass 68 | higher_is_better: true 69 | metadata: 70 | version: 1.0 -------------------------------------------------------------------------------- /lm_eval_files/openai_math/openai_math_maj64_cov64_train.yaml: -------------------------------------------------------------------------------- 1 | group: 2 | - math_word_problems 3 | task: openai_math_maj64_cov64_train 4 | dataset_path: simplescaling/openaimath 5 | process_docs: !function utils.process_docs 6 | output_type: generate_until 7 | test_split: train 8 | doc_to_text: !function utils.doc_to_text 9 | doc_to_target: answer 10 | process_results: !function utils.process_results 11 | generation_kwargs: 12 | until: [] 13 | do_sample: true 14 | temperature: 0.5 15 | max_gen_toks: 2048 # 2x of https://github.com/openai/simple-evals/blob/c0dba4c7bfbc17f786aec7bd7c3585a36ad81f23/sampler/chat_completion_sampler.py#L27 16 | metric_list: 17 | - metric: exact_match 18 | aggregation: mean 19 | higher_is_better: true 20 | repeats: 64 21 | filter_list: 22 | - name: "score-first" # pick only the first response, and report metrics on that 23 | filter: 24 | - function: "take_first" 25 | - name: "maj@64" 26 | filter: 27 | - function: "majority_vote" 28 | - function: "take_first" 29 | - name: "maj@16" # get Maj@16, via selecting the first 16 responses. Using a better estimator would be optimal. 30 | filter: 31 | - function: "take_first_k" 32 | k: 16 33 | - function: "majority_vote" 34 | - function: "take_first" 35 | - name: "maj@8" # get Maj@8, via selecting the first 8 responses. Using a better estimator would be optimal. 36 | filter: 37 | - function: "take_first_k" 38 | k: 8 39 | - function: "majority_vote" 40 | - function: "take_first" 41 | - name: "cov@64" # get coverage@64, via allowing all 64 samples and then picking only the correct one in the evaluator.
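# Note: every maj@k / cov@k slice here simply truncates to the first k of the
# 64 samples (take_first_k); an unbiased estimator would average over subsets.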
42 | filter: 43 | - function: "take_first_k" 44 | k: 64 45 | - name: "cov@16" 46 | filter: 47 | - function: "take_first_k" 48 | k: 16 49 | - name: "cov@8" 50 | filter: 51 | - function: "take_first_k" 52 | k: 8 53 | metadata: 54 | version: 1.0 -------------------------------------------------------------------------------- /lm_eval_files/aime/aime24_figures_agg64.yaml: -------------------------------------------------------------------------------- 1 | tag: 2 | - math_word_problems 3 | task: aime24_figures_agg64 4 | dataset_path: simplescaling/aime24_figures 5 | dataset_name: default 6 | process_docs: !function utils.process_docs 7 | output_type: generate_until 8 | test_split: train 9 | doc_to_text: !function utils.doc_to_text 10 | doc_to_target: answer 11 | process_results: !function utils.process_results 12 | generation_kwargs: 13 | until: [] 14 | do_sample: false 15 | temperature: 1 16 | max_gen_toks: 32768 17 | repeats: 64 18 | filter_list: 19 | - name: "all" # Will do coverage, majority, and take_first_k 20 | filter: 21 | - function: "take_first_k" 22 | k: 64 23 | metric_list: 24 | - metric: exact_match 25 | aggregation: mean 26 | higher_is_better: true 27 | - metric: cov@64 28 | aggregation: mean 29 | higher_is_better: true 30 | - metric: cov@32 31 | aggregation: mean 32 | higher_is_better: true 33 | - metric: cov@16 34 | aggregation: mean 35 | higher_is_better: true 36 | - metric: cov@8 37 | aggregation: mean 38 | higher_is_better: true 39 | - metric: cov@4 40 | aggregation: mean 41 | higher_is_better: true 42 | - metric: cov@2 43 | aggregation: mean 44 | higher_is_better: true 45 | - metric: maj@64 46 | aggregation: mean 47 | higher_is_better: true 48 | - metric: maj@32 49 | aggregation: mean 50 | higher_is_better: true 51 | - metric: maj@16 52 | aggregation: mean 53 | higher_is_better: true 54 | - metric: maj@8 55 | aggregation: mean 56 | higher_is_better: true 57 | - metric: maj@4 58 | aggregation: mean 59 | higher_is_better: true 60 | - metric: maj@2 61 | aggregation: mean 62 | higher_is_better: true 63 | - metric: extracted_answers 64 | aggregation: bypass 65 | higher_is_better: true 66 | - metric: exact_matches 67 | aggregation: bypass 68 | higher_is_better: true 69 | metadata: 70 | version: 1.0 -------------------------------------------------------------------------------- /training/trl/examples/research_projects/stack_llama/scripts/README.md: -------------------------------------------------------------------------------- 1 | # RLHF pipeline for the creation of StackLLaMa: a Stack exchange llama-7b model. 2 | There were three main steps to the training process: 3 | 1. Supervised fine-tuning of the base llama-7b model to create llama-7b-se: 4 | - `torchrun --nnodes 1 --nproc_per_node 8 examples/research_projects/stack_llama/scripts/supervised_finetuning.py --model_path=<LLAMA_MODEL_PATH> --streaming --learning_rate 1e-5 --max_steps 5000 --output_dir ./llama-se` 5 | 2. Reward modeling using dialog pairs from the SE dataset and the llama-7b-se model to create llama-7b-se-rm: 6 | - `torchrun --nnodes 1 --nproc_per_node 8 examples/research_projects/stack_llama/scripts/reward_modeling.py --model_name=<LLAMA_SE_MODEL>` 7 | 3. 
RL fine-tuning of llama-7b-se with the llama-7b-se-rm reward model: 8 | - `accelerate launch --multi_gpu --num_machines 1 --num_processes 8 examples/research_projects/stack_llama/scripts/rl_training.py --log_with=wandb --model_name=<LLAMA_SE_MODEL> --reward_model_name=<LLAMA_SE_RM_MODEL> --adafactor=False --tokenizer_name=<LLAMA_TOKENIZER> --save_freq=100 --output_max_length=128 --batch_size=8 --gradient_accumulation_steps=8 --batched_gen=True --ppo_epochs=4 --seed=0 --learning_rate=1.4e-5 --early_stopping=True --output_dir=llama-se-rl-finetune-128-8-8-1.4e-5_adam` 9 | 10 | 11 | LoRA layers were used at all stages to reduce memory requirements. 12 | At each stage the peft adapter layers were merged with the base model, using: 13 | ```shell 14 | python examples/research_projects/stack_llama/scripts/merge_peft_adapter.py --adapter_model_name=XXX --base_model_name=YYY --output_name=ZZZ 15 | ``` 16 | Note that this script requires `peft>=0.3.0`. 17 | 18 | For access to the base llama-7b model, please see Meta's [release](https://ai.facebook.com/blog/large-language-model-llama-meta-ai/) and [request form](https://docs.google.com/forms/d/e/1FAIpQLSfqNECQnMkycAp2jP4Z9TFX0cGR4uf7b_fBxjY_OjhJILlKGA/viewform). 19 | -------------------------------------------------------------------------------- /lm_eval_files/aime/aime24_nofigures_agg64.yaml: -------------------------------------------------------------------------------- 1 | tag: 2 | - math_word_problems 3 | task: aime24_nofigures_agg64 4 | dataset_path: simplescaling/aime24_nofigures 5 | dataset_name: default 6 | process_docs: !function utils.process_docs 7 | output_type: generate_until 8 | test_split: train 9 | doc_to_text: !function utils.doc_to_text 10 | doc_to_target: answer 11 | process_results: !function utils.process_results 12 | generation_kwargs: 13 | until: [] 14 | do_sample: false 15 | temperature: 1 16 | max_gen_toks: 32768 17 | repeats: 64 18 | filter_list: 19 | - name: "all" # Will do coverage, majority, and take_first_k 20 | filter: 21 | - function: "take_first_k" 22 | k: 64 23 | metric_list: 24 | - metric: exact_match 25 | aggregation: mean 26 | higher_is_better: true 27 | - metric: cov@64 28 | aggregation: mean 29 | higher_is_better: true 30 | - metric: cov@32 31 | aggregation: mean 32 | higher_is_better: true 33 | - metric: cov@16 34 | aggregation: mean 35 | higher_is_better: true 36 | - metric: cov@8 37 | aggregation: mean 38 | higher_is_better: true 39 | - metric: cov@4 40 | aggregation: mean 41 | higher_is_better: true 42 | - metric: cov@2 43 | aggregation: mean 44 | higher_is_better: true 45 | - metric: maj@64 46 | aggregation: mean 47 | higher_is_better: true 48 | - metric: maj@32 49 | aggregation: mean 50 | higher_is_better: true 51 | - metric: maj@16 52 | aggregation: mean 53 | higher_is_better: true 54 | - metric: maj@8 55 | aggregation: mean 56 | higher_is_better: true 57 | - metric: maj@4 58 | aggregation: mean 59 | higher_is_better: true 60 | - metric: maj@2 61 | aggregation: mean 62 | higher_is_better: true 63 | - metric: extracted_answers 64 | aggregation: bypass 65 | higher_is_better: true 66 | - metric: exact_matches 67 | aggregation: bypass 68 | higher_is_better: true 69 | metadata: 70 | version: 1.0 -------------------------------------------------------------------------------- /lm_eval_files/aime/aime25_nofigures_maj8cov8.yaml: -------------------------------------------------------------------------------- 1 | tag: 2 | - math_word_problems 3 | task: aime25_nofigures_maj8cov8 4 | dataset_path: TIGER-Lab/AIME25 5 | dataset_name: default 6 | 7 | process_docs: 
!function utils.process_docs 8 | doc_to_text: !function utils.doc_to_text 9 | doc_to_target: answer 10 | process_results: !function utils.process_results 11 | 12 | output_type: generate_until 13 | test_split: train 14 | 15 | generation_kwargs: 16 | until: [] 17 | do_sample: false # deterministic 18 | temperature: 0.6 19 | max_gen_toks: 32768 20 | 21 | repeats: 8 # 8 samples per problem 22 | 23 | # one catch-all slice — utils.process_results will compute cov/maj on it 24 | filter_list: 25 | - name: "all" 26 | filter: 27 | - function: "take_first_k" 28 | k: 8 29 | 30 | metric_list: 31 | - metric: exact_match # plain accuracy 32 | aggregation: mean 33 | higher_is_better: true 34 | 35 | - metric: cov@2 # oracle coverage over the first 2 samples 36 | aggregation: mean 37 | higher_is_better: true 38 | 39 | - metric: maj@2 # majority-vote accuracy over the first 2 samples 40 | aggregation: mean 41 | higher_is_better: true 42 | 43 | 44 | - metric: cov@4 # oracle coverage over the first 4 samples 45 | aggregation: mean 46 | higher_is_better: true 47 | 48 | - metric: maj@4 # majority-vote accuracy over the first 4 samples 49 | aggregation: mean 50 | higher_is_better: true 51 | 52 | 53 | - metric: cov@8 # oracle coverage over all 8 samples 54 | aggregation: mean 55 | higher_is_better: true 56 | 57 | - metric: maj@8 # majority-vote accuracy over all 8 samples 58 | aggregation: mean 59 | higher_is_better: true 60 | 61 | # two “bypass” metrics emitted by utils.process_results 62 | - metric: extracted_answers 63 | aggregation: bypass 64 | higher_is_better: true 65 | 66 | - metric: exact_matches 67 | aggregation: bypass 68 | higher_is_better: true 69 | 70 | metadata: 71 | version: 1.0 72 | -------------------------------------------------------------------------------- /lm_eval_files/aime/aime24_nofigures_maj8cov8.yaml: -------------------------------------------------------------------------------- 1 | tag: 2 | - math_word_problems 3 | task: aime24_nofigures_maj8cov8 4 | dataset_path: simplescaling/aime24_nofigures 5 | dataset_name: default 6 | 7 | process_docs: !function utils.process_docs 8 | doc_to_text: !function utils.doc_to_text 9 | doc_to_target: answer 10 | process_results: !function utils.process_results 11 | 12 | output_type: generate_until 13 | test_split: train 14 | 15 | generation_kwargs: 16 | until: [] 17 | do_sample: false # deterministic 18 | temperature: 0.6 19 | max_gen_toks: 32768 20 | 21 | repeats: 8 # 8 samples per problem 22 | 23 | # one catch-all slice — utils.process_results will compute cov/maj on it 24 | filter_list: 25 | - name: "all" 26 | filter: 27 | - function: "take_first_k" 28 | k: 8 29 | 30 | metric_list: 31 | - metric: exact_match # plain accuracy 32 | aggregation: mean 33 | higher_is_better: true 34 | 35 | - metric: cov@2 # oracle coverage over the first 2 samples 36 | aggregation: mean 37 | higher_is_better: true 38 | 39 | - metric: maj@2 # majority-vote accuracy over the first 2 samples 40 | aggregation: mean 41 | higher_is_better: true 42 | 43 | 44 | - metric: cov@4 # oracle coverage over the first 4 samples 45 | aggregation: mean 46 | higher_is_better: true 47 | 48 | - metric: maj@4 # majority-vote accuracy over the first 4 samples 49 | aggregation: mean 50 | higher_is_better: true 51 | 52 | 53 | - metric: cov@8 # oracle coverage over all 8 samples 54 | aggregation: mean 55 | higher_is_better: true 56 | 57 | - metric: maj@8 # majority-vote accuracy over all 8 samples 58 | aggregation: mean 59 | higher_is_better: true 60 | 61 | # two “bypass” metrics emitted by utils.process_results 62 | - metric: extracted_answers 63 | aggregation: bypass 64 | higher_is_better: true 65 | 66 | - metric: exact_matches 67 | aggregation: bypass 68 | higher_is_better: true 69 |
70 | metadata: 71 | version: 1.0 -------------------------------------------------------------------------------- /training/open-r1/scripts/upload_details.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2025 The HuggingFace Inc. team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ 16 | Push the details from a LightEval run to the Hub. 17 | 18 | Usage: 19 | 20 | python scripts/upload_details.py \ 21 | --data_files {path_to_parquet_file} \ 22 | --hub_repo_id {hub_repo_id} \ 23 | --config_name {config_name} 24 | """ 25 | 26 | from dataclasses import dataclass, field 27 | from typing import List 28 | 29 | from datasets import load_dataset 30 | from transformers import HfArgumentParser 31 | 32 | 33 | @dataclass 34 | class ScriptArguments: 35 | data_files: List[str] = field(default_factory=list) 36 | hub_repo_id: str = None 37 | config_name: str = None 38 | 39 | 40 | def main(): 41 | parser = HfArgumentParser(ScriptArguments) 42 | args = parser.parse_args_into_dataclasses()[0] 43 | 44 | if all(file.endswith(".json") for file in args.data_files): 45 | ds = load_dataset("json", data_files=args.data_files) 46 | elif all(file.endswith(".jsonl") for file in args.data_files): 47 | ds = load_dataset("json", data_files=args.data_files) 48 | else: 49 | ds = load_dataset("parquet", data_files=args.data_files) 50 | url = ds.push_to_hub(args.hub_repo_id, config_name=args.config_name, private=True) 51 | print(f"Dataset available at: {url}") 52 | 53 | 54 | if __name__ == "__main__": 55 | main() 56 | -------------------------------------------------------------------------------- /training/trl/trl/trainer/nash_md_config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from dataclasses import dataclass, field 16 | 17 | from trl.trainer.online_dpo_config import OnlineDPOConfig 18 | 19 | 20 | @dataclass 21 | class NashMDConfig(OnlineDPOConfig): 22 | r""" 23 | Configuration class for the [`NashMDTrainer`]. 24 | 25 | Subclass of [`OnlineDPOConfig`]; we can use all its arguments and add the following: 26 | 27 | Parameters: 28 | mixture_coef (`float` or `list[float]`, *optional*, defaults to `0.5`): 29 | Logit mixture coefficient for the model and reference model. 
If a list of floats is provided then the 30 | mixture coefficient is selected for each new epoch and the last coefficient is used for the rest of the 31 | epochs. 32 | """ 33 | 34 | mixture_coef: list[float] = field( 35 | default_factory=lambda: [0.5], 36 | metadata={ 37 | "help": "Logit mixture coefficient for the model and reference model. If a list of floats is provided " 38 | "then the mixture coefficient is selected for each new epoch and the last coefficient is used for the " 39 | "rest of the epochs." 40 | }, 41 | ) 42 | 43 | def __post_init__(self): 44 | super().__post_init__() 45 | if hasattr(self.mixture_coef, "__len__") and len(self.mixture_coef) == 1: 46 | self.mixture_coef = self.mixture_coef[0] 47 | -------------------------------------------------------------------------------- /lm_eval_files/openai_math/openai_math_agg64.yaml: -------------------------------------------------------------------------------- 1 | tag: 2 | - math_word_problems 3 | task: openai_math_agg64 4 | dataset_path: simplescaling/openaimath 5 | process_docs: !function utils.process_docs 6 | output_type: generate_until 7 | test_split: test 8 | doc_to_text: !function utils.doc_to_text 9 | doc_to_target: answer 10 | process_results: !function utils.process_results 11 | generation_kwargs: 12 | until: [] 13 | do_sample: false 14 | temperature: 1 15 | max_gen_toks: 2048 # 2x of https://github.com/openai/simple-evals/blob/c0dba4c7bfbc17f786aec7bd7c3585a36ad81f23/sampler/chat_completion_sampler.py#L27 16 | repeats: 64 17 | filter_list: 18 | - name: "all" # Will do coverage, majority, and take_first_k 19 | filter: 20 | - function: "take_first_k" 21 | k: 64 22 | metric_list: 23 | - metric: exact_match 24 | aggregation: mean 25 | higher_is_better: true 26 | - metric: cov@64 27 | aggregation: mean 28 | higher_is_better: true 29 | - metric: cov@32 30 | aggregation: mean 31 | higher_is_better: true 32 | - metric: cov@16 33 | aggregation: mean 34 | higher_is_better: true 35 | - metric: cov@8 36 | aggregation: mean 37 | higher_is_better: true 38 | - metric: cov@4 39 | aggregation: mean 40 | higher_is_better: true 41 | - metric: cov@2 42 | aggregation: mean 43 | higher_is_better: true 44 | - metric: maj@64 45 | aggregation: mean 46 | higher_is_better: true 47 | - metric: maj@32 48 | aggregation: mean 49 | higher_is_better: true 50 | - metric: maj@16 51 | aggregation: mean 52 | higher_is_better: true 53 | - metric: maj@8 54 | aggregation: mean 55 | higher_is_better: true 56 | - metric: maj@4 57 | aggregation: mean 58 | higher_is_better: true 59 | - metric: maj@2 60 | aggregation: mean 61 | higher_is_better: true 62 | - metric: extracted_answers 63 | aggregation: bypass 64 | higher_is_better: true 65 | - metric: exact_matches 66 | aggregation: bypass 67 | higher_is_better: true 68 | metadata: 69 | version: 1.0 -------------------------------------------------------------------------------- /training/open-r1/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code_ioi.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: Qwen/Qwen2.5-1.5B-Instruct 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: open-r1/ioi 9 | dataset_prompt_column: problem 10 | system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" 11 | 12 | # GRPO trainer config 13 | beta: 0.01 14 | bf16: true 15 | use_vllm: true 16 | do_eval: false 17 | gradient_accumulation_steps: 4 18 | gradient_checkpointing: true 19 | gradient_checkpointing_kwargs: 20 | use_reentrant: false 21 | hub_model_id: Qwen2.5-1.5B-Open-R1-Code-GRPO 22 | hub_strategy: every_save 23 | learning_rate: 5.0e-06 24 | log_completions: true 25 | log_level: info 26 | logging_first_step: true 27 | logging_steps: 1 28 | logging_strategy: steps 29 | lr_scheduler_type: cosine_with_min_lr 30 | lr_scheduler_kwargs: 31 | min_lr_rate: 0.1 32 | max_prompt_length: 1024 33 | max_completion_length: 2048 34 | max_steps: 500 35 | num_generations: 14 36 | num_train_epochs: 1 37 | output_dir: data/Qwen2.5-1.5B-Open-R1-Code-GRPO 38 | overwrite_output_dir: true 39 | per_device_train_batch_size: 16 40 | push_to_hub: true 41 | report_to: 42 | - wandb 43 | save_strategy: "steps" 44 | save_steps: 50 45 | save_total_limit: 1 46 | seed: 42 47 | temperature: 1.0 48 | warmup_ratio: 0.03 49 | # ioi specific config 50 | code_language: cpp 51 | reward_funcs: 52 | - ioi_code 53 | - code_format 54 | - format 55 | reward_weights: 56 | - 1.0 57 | - 0.1 58 | - 0.1 59 | # for each generation, evaluate these many test cases in parallel, then check if any of them failed (0 score): if so stop evaluating 60 | # otherwise continue with the next batch of test cases. Useful to avoid overloading the eval server + save time on wrong solutions 61 | code_eval_test_batch_size: 3 -------------------------------------------------------------------------------- /lm_eval_files/openai_math/openai_math_maj64_cov64.yaml: -------------------------------------------------------------------------------- 1 | group: 2 | - math_word_problems 3 | task: openai_math_maj64_cov64 4 | dataset_path: simplescaling/openaimath 5 | process_docs: !function utils.process_docs 6 | output_type: generate_until 7 | test_split: test 8 | doc_to_text: !function utils.doc_to_text 9 | doc_to_target: answer 10 | process_results: !function utils.process_results 11 | generation_kwargs: 12 | until: [] 13 | do_sample: true 14 | temperature: 0.5 15 | max_gen_toks: 2048 # 2x of https://github.com/openai/simple-evals/blob/c0dba4c7bfbc17f786aec7bd7c3585a36ad81f23/sampler/chat_completion_sampler.py#L27 16 | metric_list: 17 | - metric: exact_match 18 | aggregation: mean 19 | higher_is_better: true 20 | repeats: 64 21 | filter_list: 22 | - name: "score-first" # pick only the first response, and report metrics on that 23 | filter: 24 | - function: "take_first" 25 | - name: "maj@64" 26 | filter: 27 | - function: "majority_vote" 28 | - function: "take_first" 29 | - name: "maj@16" # get Maj@16, via selecting the first 8 responses. Using a better estimator would be optimal. 30 | filter: 31 | - function: "take_first_k" 32 | k: 16 33 | - function: "majority_vote" 34 | - function: "take_first" 35 | - name: "maj@32" # get Maj@8 , via selecting the first 8 responses. Using a better estimator would be optimal. 36 | filter: 37 | - function: "take_first_k" 38 | k: 32 39 | - function: "majority_vote" 40 | - function: "take_first" 41 | - name: "maj@8" # get Maj@8 , via selecting the first 8 responses. Using a better estimator would be optimal. 42 | filter: 43 | - function: "take_first_k" 44 | k: 8 45 | - function: "majority_vote" 46 | - function: "take_first" 47 | - name: "cov@64" # get coverage@64 , via allowing all 64 samples and then picking only the correct one in the evaluator. 
48 | filter: 49 | - function: "take_first_k" 50 | k: 64 51 | - name: "cov@16" 52 | filter: 53 | - function: "take_first_k" 54 | k: 16 55 | - name: "cov@8" 56 | filter: 57 | - function: "take_first_k" 58 | k: 8 59 | metadata: 60 | version: 1.0 -------------------------------------------------------------------------------- /training/trl/examples/scripts/sft_gemma3.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """ 16 | Train Gemma-3 on the Codeforces COTS dataset. 17 | 18 | accelerate launch --config_file examples/accelerate_configs/deepspeed_zero3.yaml examples/scripts/sft_gemma3.py 19 | """ 20 | 21 | from datasets import load_dataset 22 | from transformers import AutoModelForImageTextToText 23 | 24 | from trl import SFTConfig, SFTTrainer 25 | 26 | 27 | def main(): 28 | # Load dataset 29 | train_dataset = load_dataset("open-r1/codeforces-cots", split="train") 30 | train_dataset = train_dataset.remove_columns("prompt") 31 | 32 | # Load model 33 | model_id = "google/gemma-3-12b-it" 34 | model = AutoModelForImageTextToText.from_pretrained(model_id, attn_implementation="eager") 35 | 36 | # Train model 37 | training_args = SFTConfig( 38 | output_dir=f"{model_id}-codeforces-SFT", 39 | logging_steps=10, 40 | bf16=True, 41 | use_liger_kernel=True, 42 | gradient_checkpointing=True, 43 | gradient_checkpointing_kwargs={"use_reentrant": False}, 44 | max_length=8192, 45 | per_device_train_batch_size=1, 46 | gradient_accumulation_steps=8, 47 | dataset_num_proc=32, 48 | num_train_epochs=1, 49 | ) 50 | trainer = SFTTrainer( 51 | args=training_args, 52 | model=model, 53 | train_dataset=train_dataset, 54 | ) 55 | trainer.train() 56 | 57 | # Push to hub 58 | trainer.push_to_hub(dataset_name="open-r1/codeforces-cots") 59 | 60 | 61 | if __name__ == "__main__": 62 | main() 63 | -------------------------------------------------------------------------------- /annotated_dataset/annotation_statistics.csv: -------------------------------------------------------------------------------- 1 | qid,model,offload_chars,total_chars,offload_tokens,total_tokens,offload_tokens_per_offload_chars,offload_percentage 2 | 0,GPT4o,1694,7601,649,2797,46.357142857142854,22.29 3 | 1,GPT4o,2151,7948,536,2802,178.66666666666666,27.06 4 | 2,GPT4o,2661,7387,654,2330,654.0,36.02 5 | 3,GPT4o,4508,12079,1086,3362,543.0,37.32 6 | 4,GPT4o,20755,50830,6046,16003,863.7142857142857,40.83 7 | 5,GPT4o,20359,53514,5916,16353,394.4,38.04 8 | 6,GPT4o,2222,6133,659,1955,219.66666666666666,36.23 9 | 7,GPT4o,1796,6067,590,1907,34.705882352941174,29.60 10 | 8,GPT4o,1138,4058,396,1512,79.2,28.04 11 | 9,GPT4o,1328,4823,466,1849,77.66666666666667,27.53 12 | 10,GPT4o,9997,28348,4045,10348,139.48275862068965,35.27 13 | 11,GPT4o,11383,27390,4012,9572,401.2,41.56 14 | 12,GPT4o,2222,9725,877,3647,109.625,22.85 15 | 13,GPT4o,1773,6931,554,2646,55.4,25.58 16 | 
14,GPT4o,2332,8405,948,3210,105.33333333333333,27.75 17 | 15,GPT4o,2546,11853,923,4225,307.6666666666667,21.48 18 | 16,GPT4o,10267,36730,3581,12565,716.2,27.95 19 | 17,GPT4o,5054,23599,2018,8501,672.6666666666666,21.42 20 | 18,GPT4o,3009,9190,974,3165,974.0,32.74 21 | 19,GPT4o,3183,12106,963,3900,321.0,26.29 22 | 20,GPT4o,5757,17304,1825,6325,608.3333333333334,33.27 23 | 21,GPT4o,5362,17201,1870,2740,935.0,31.17 24 | 0,Dpsr1,1755,7601,663,2797,44.2,23.09 25 | 1,Dpsr1,1767,7948,594,2802,66.0,22.23 26 | 2,Dpsr1,2872,7387,706,2330,353.0,38.88 27 | 3,Dpsr1,4866,12079,1166,3362,388.6666666666667,40.28 28 | 4,Dpsr1,9438,50830,2497,16003,624.25,18.57 29 | 5,Dpsr1,9886,53514,2539,16353,846.3333333333334,18.47 30 | 6,Dpsr1,2569,6133,768,1955,128.0,41.89 31 | 7,Dpsr1,2067,6067,590,1907,590.0,34.07 32 | 8,Dpsr1,1149,4058,391,1512,65.16666666666667,28.31 33 | 9,Dpsr1,1208,4823,428,1849,47.55555555555556,25.05 34 | 10,Dpsr1,6412,28348,2363,10348,236.3,22.62 35 | 11,Dpsr1,7152,27390,2545,9572,509.0,26.11 36 | 12,Dpsr1,2698,9725,1024,3647,85.33333333333333,27.74 37 | 13,Dpsr1,2000,6931,630,2646,630.0,28.86 38 | 14,Dpsr1,2254,8405,899,3210,149.83333333333334,26.82 39 | 15,Dpsr1,3002,11853,1125,4225,93.75,25.33 40 | 16,Dpsr1,6112,36730,2340,12565,137.64705882352942,16.64 41 | 17,Dpsr1,5963,23599,2399,8501,266.55555555555554,25.27 42 | 18,Dpsr1,3068,9190,988,3165,988.0,33.38 43 | 19,Dpsr1,3162,12106,951,3900,475.5,26.12 44 | 20,Dpsr1,5727,17304,1841,6325,263.0,33.10 45 | 21,Dpsr1,1982,17201,759,2740,27.107142857142858,11.52 -------------------------------------------------------------------------------- /training/trl/trl/templates/lm_model_card.md: -------------------------------------------------------------------------------- 1 | --- 2 | {{ card_data }} 3 | --- 4 | 5 | # Model Card for {{ model_name }} 6 | 7 | This model is a fine-tuned version of [{{ base_model }}](https://huggingface.co/{{ base_model }}){% if dataset_name %} on the [{{ dataset_name }}](https://huggingface.co/datasets/{{ dataset_name }}) dataset{% endif %}. 8 | It has been trained using [TRL](https://github.com/huggingface/trl). 9 | 10 | ## Quick start 11 | 12 | ```python 13 | from transformers import pipeline 14 | 15 | question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" 16 | generator = pipeline("text-generation", model="{{ hub_model_id }}", device="cuda") 17 | output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] 18 | print(output["generated_text"]) 19 | ``` 20 | 21 | ## Training procedure 22 | 23 | {% if wandb_url %}[Visualize in Weights & Biases]({{ wandb_url }}){% endif %} 24 | {% if comet_url %}[Visualize in Comet]({{ comet_url }}){% endif %} 25 | 26 | This model was trained with {{ trainer_name }}{% if paper_id %}, a method introduced in [{{ paper_title }}](https://huggingface.co/papers/{{ paper_id }}){% endif %}. 
27 | 28 | ### Framework versions 29 | 30 | - TRL: {{ trl_version }} 31 | - Transformers: {{ transformers_version }} 32 | - Pytorch: {{ pytorch_version }} 33 | - Datasets: {{ datasets_version }} 34 | - Tokenizers: {{ tokenizers_version }} 35 | 36 | ## Citations 37 | 38 | {% if trainer_citation %}Cite {{ trainer_name }} as: 39 | 40 | ```bibtex 41 | {{ trainer_citation }} 42 | ```{% endif %} 43 | 44 | Cite TRL as: 45 | 46 | ```bibtex 47 | {% raw %}@misc{vonwerra2022trl, 48 | title = {{TRL: Transformer Reinforcement Learning}}, 49 | author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallouédec}, 50 | year = 2020, 51 | journal = {GitHub repository}, 52 | publisher = {GitHub}, 53 | howpublished = {\url{https://github.com/huggingface/trl}} 54 | }{% endraw %} 55 | ``` 56 | -------------------------------------------------------------------------------- /training/trl/docker/trl-latest-gpu/Dockerfile: -------------------------------------------------------------------------------- 1 | # Builds GPU docker image of PyTorch 2 | # Uses multi-staged approach to reduce size 3 | # Stage 1 4 | # Use base conda image to reduce time 5 | FROM continuumio/miniconda3:latest AS compile-image 6 | # Specify py version 7 | ENV PYTHON_VERSION=3.10 8 | # Install apt libs - copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile 9 | RUN apt-get update && \ 10 | apt-get install -y curl git wget software-properties-common git-lfs && \ 11 | apt-get clean && \ 12 | rm -rf /var/lib/apt/lists* 13 | 14 | # Install audio-related libraries 15 | RUN apt-get update && \ 16 | apt install -y ffmpeg 17 | 18 | RUN apt install -y libsndfile1-dev 19 | RUN git lfs install 20 | 21 | # Create our conda env - copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile 22 | RUN conda create --name trl python=${PYTHON_VERSION} ipython jupyter pip 23 | RUN python3 -m pip install --no-cache-dir --upgrade pip 24 | 25 | # Below is copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile 26 | # We don't install pytorch here yet since CUDA isn't available 27 | # instead we use the direct torch wheel 28 | ENV PATH /opt/conda/envs/trl/bin:$PATH 29 | # Activate our bash shell 30 | RUN chsh -s /bin/bash 31 | SHELL ["/bin/bash", "-c"] 32 | 33 | # Stage 2 34 | FROM nvidia/cuda:12.2.2-devel-ubuntu22.04 AS build-image 35 | COPY --from=compile-image /opt/conda /opt/conda 36 | ENV PATH /opt/conda/bin:$PATH 37 | 38 | RUN chsh -s /bin/bash 39 | SHELL ["/bin/bash", "-c"] 40 | RUN source activate trl && \ 41 | python3 -m pip install --no-cache-dir bitsandbytes optimum auto-gptq 42 | 43 | # Install apt libs 44 | RUN apt-get update && \ 45 | apt-get install -y curl git wget && \ 46 | apt-get clean && \ 47 | rm -rf /var/lib/apt/lists* 48 | 49 | # Activate the conda env and install transformers + accelerate from source 50 | RUN source activate trl && \ 51 | python3 -m pip install -U --no-cache-dir \ 52 | librosa \ 53 | "soundfile>=0.12.1" \ 54 | scipy \ 55 | transformers \ 56 | accelerate \ 57 | peft \ 58 | trl[test]@git+https://github.com/huggingface/trl 59 | 60 | RUN source activate trl && \ 61 | pip freeze | grep trl 62 | 63 | RUN echo "source activate trl" >> ~/.profile 64 | 65 | # Activate the virtualenv 66 | CMD ["/bin/bash"] -------------------------------------------------------------------------------- 
/training/open-r1/tests/slow/test_code_reward.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | import unittest 17 | 18 | from datasets import load_dataset 19 | 20 | from open_r1.rewards import code_reward, ioi_code_reward 21 | 22 | 23 | class TestCodeRewards(unittest.TestCase): 24 | def test_python_code_reward(self): 25 | # requires E2B, see the README.md file 26 | code_dataset = load_dataset("open-r1/verifiable-coding-problems-python_decontaminated-tested") 27 | NUM_SAMPLES = 20 28 | samples = code_dataset["train"].select(range(NUM_SAMPLES)) 29 | test_completions = [[{"content": sample["gold_standard_solution"]}] for sample in samples] 30 | reward_kwargs = {"verification_info": [sample["verification_info"] for sample in samples]} 31 | rewards = code_reward(test_completions, **reward_kwargs) 32 | print(rewards) 33 | assert rewards == [1.0] * NUM_SAMPLES 34 | 35 | def test_ioi_code_reward(self): 36 | # This slow test case requires spinning up a bunch (I tested with ~64) of piston workers, see docs here 37 | # slurm/piston/README.md 38 | code_dataset = load_dataset("open-r1/ioi-reward-test-dataset") 39 | NUM_SAMPLES = 16 40 | samples = code_dataset["train"].select(range(NUM_SAMPLES)) 41 | test_completions = [[{"content": f"```cpp\n{sample['sample_solution']}```"}] for sample in samples] 42 | keys = [key for key in samples[0] if key not in ["prompt", "completion"]] 43 | reward_kwargs = {key: [example[key] for example in samples] for key in keys} 44 | rewards = ioi_code_reward(test_completions, **reward_kwargs) 45 | print(rewards) 46 | assert rewards == [1.0] * NUM_SAMPLES 47 | 48 | 49 | if __name__ == "__main__": 50 | unittest.main() 51 | -------------------------------------------------------------------------------- /training/trl/.gitignore: -------------------------------------------------------------------------------- 1 | *.bak 2 | .gitattributes 3 | .last_checked 4 | .gitconfig 5 | *.bak 6 | *.log 7 | *~ 8 | ~* 9 | _tmp* 10 | tmp* 11 | tags 12 | 13 | # Byte-compiled / optimized / DLL files 14 | __pycache__/ 15 | *.py[cod] 16 | *$py.class 17 | 18 | # C extensions 19 | *.so 20 | 21 | # Distribution / packaging 22 | .Python 23 | env/ 24 | build/ 25 | develop-eggs/ 26 | dist/ 27 | downloads/ 28 | eggs/ 29 | .eggs/ 30 | lib/ 31 | lib64/ 32 | parts/ 33 | sdist/ 34 | var/ 35 | wheels/ 36 | *.egg-info/ 37 | .installed.cfg 38 | *.egg 39 | 40 | # PyInstaller 41 | # Usually these files are written by a python script from a template 42 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
43 | *.manifest 44 | *.spec 45 | 46 | # Installer logs 47 | pip-log.txt 48 | pip-delete-this-directory.txt 49 | 50 | # Unit test / coverage reports 51 | htmlcov/ 52 | .tox/ 53 | .coverage 54 | .coverage.* 55 | .cache 56 | nosetests.xml 57 | coverage.xml 58 | *.cover 59 | .hypothesis/ 60 | 61 | # Translations 62 | *.mo 63 | *.pot 64 | 65 | # Django stuff: 66 | *.log 67 | local_settings.py 68 | 69 | # Flask stuff: 70 | instance/ 71 | .webassets-cache 72 | 73 | # Scrapy stuff: 74 | .scrapy 75 | 76 | # Sphinx documentation 77 | docs/_build/ 78 | 79 | # PyBuilder 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # celery beat schedule file 89 | celerybeat-schedule 90 | 91 | # SageMath parsed files 92 | *.sage.py 93 | 94 | # dotenv 95 | .env 96 | 97 | # virtualenv 98 | .venv 99 | venv/ 100 | ENV/ 101 | 102 | # Spyder project settings 103 | .spyderproject 104 | .spyproject 105 | 106 | # Rope project settings 107 | .ropeproject 108 | 109 | # mkdocs documentation 110 | /site 111 | 112 | # mypy 113 | .mypy_cache/ 114 | 115 | .vscode 116 | *.swp 117 | 118 | # osx generated files 119 | .DS_Store 120 | .DS_Store? 121 | .Trashes 122 | ehthumbs.db 123 | Thumbs.db 124 | .idea 125 | 126 | # pytest 127 | .pytest_cache 128 | 129 | # tools/trust-doc-nbs 130 | docs_src/.last_checked 131 | 132 | # symlinks to fastai 133 | docs_src/fastai 134 | tools/fastai 135 | 136 | # link checker 137 | checklink/cookies.txt 138 | 139 | # .gitconfig is now autogenerated 140 | .gitconfig 141 | 142 | # wandb files 143 | nbs/wandb/ 144 | examples/notebooks/wandb/ 145 | wandb/ -------------------------------------------------------------------------------- /training/open-r1/offload_read_graph.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import matplotlib.pyplot as plt 3 | 4 | 5 | def get_bigmodel_mask(text, open_tag="<bigmodel>", close_tag="</bigmodel>"): 6 | mask = [0] * len(text) 7 | start_index = 0 8 | 9 | while True: 10 | open_pos = text.find(open_tag, start_index) 11 | if open_pos == -1: 12 | break # no more openings 13 | 14 | close_pos = text.find(close_tag, open_pos + len(open_tag)) 15 | if close_pos == -1: 16 | # If we can't find a close tag, mark until the end of the text 17 | for i in range(open_pos, len(text)): 18 | mask[i] = 1 19 | break 20 | else: 21 | # Mark the region from <bigmodel> ... </bigmodel> 22 | region_end = close_pos + len(close_tag) 23 | for i in range(open_pos, region_end): 24 | mask[i] = 1 25 | start_index = region_end 26 | 27 | return mask
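# Quick illustration (hypothetical input, not from the pipeline): for
# text = "a<bigmodel>bc</bigmodel>d", get_bigmodel_mask(text) sets 1 for every
# character of "<bigmodel>bc</bigmodel>" (tags included) and 0 for "a" and "d",
# so sum(mask) / len(mask) is the offloaded fraction that main() below plots.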
28 | 29 | def main(): 30 | # 1) Load the pickle file 31 | with open("bigmodel_span_text.pkl", "rb") as f: 32 | outputs = pickle.load(f) 33 | # 2) Create a 5x2 figure 34 | fig, axs = plt.subplots(5, 2, figsize=(14, 10)) 35 | axs = axs.flatten() 36 | 37 | # 3) For each of the 10 items in 'outputs', compute mask and plot 38 | for i in range(10): 39 | ax = axs[i] 40 | text = outputs[i] 41 | 42 | mask = get_bigmodel_mask(text) 43 | 44 | # If the text is empty or mask is empty, skip 45 | if not text or not mask: 46 | ax.set_title(f"Example {i} (no content)") 47 | ax.set_xticks([]) 48 | ax.set_yticks([]) 49 | continue 50 | 51 | # x from 0 to 1 across the character range 52 | x = [k / len(mask) for k in range(len(mask))] 53 | y = mask # 0/1 values 54 | # calculate coverage (percentage of 1s) 55 | coverage = 100.0 * sum(mask) / len(mask) 56 | 57 | # Plot with step 58 | ax.step(x, y, where='post') 59 | ax.set_ylim(-0.1, 1.1) # 0 or 1 only 60 | ax.set_xlim(0, 1) 61 | ax.set_yticks([]) 62 | ax.set_title(f"Example {i} ({coverage:.1f}% covered)") 63 | 64 | plt.tight_layout() 65 | plt.savefig("switch_behavior.pdf") 66 | plt.clf() # Clear the figure from memory 67 | plt.close() # Close the plotting window 68 | 69 | if __name__ == "__main__": 70 | main() 71 | -------------------------------------------------------------------------------- /modes/placeholder.py: -------------------------------------------------------------------------------- 1 | # modes/placeholder.py 2 | 3 | from pprint import pprint 4 | import os 5 | import datetime 6 | 7 | 8 | def run_placeholder_flow( 9 | question, 10 | big_model, 11 | big_model_port, 12 | small_model, 13 | small_model_port, 14 | generate_text_vllm, 15 | max_tokens=1024, 16 | temperature=0.7, 17 | test_logging: bool = False, 18 | ): 19 | """ 20 | A baseline 'placeholder' flow: we send the same single request to the 21 | *big_model* (and, for comparison, the *small_model*) and return the big 22 | model's reply as the final answer, plus usage data. 23 | """ 24 | usage_data = [] 25 | 26 | # Basic prompt 27 | if "|" not in question: 28 | prompt = f"<|begin▁of▁sentence|><|User|>{question}<|Assistant|>" 29 | else: 30 | prompt = f"{question}" 31 | 32 | print("Sending request to big model") 33 | # Single big model request 34 | resp_json, latency_big = generate_text_vllm( 35 | prompt, 36 | port=big_model_port, 37 | temperature=temperature, 38 | max_tokens=max_tokens, 39 | model=big_model 40 | ) 41 | usage_dict_big = resp_json.get("usage", {}) 42 | final_reply = resp_json["choices"][0]["text"] 43 | 44 | # Small model request with the same prompt 45 | resp_json, latency_small = generate_text_vllm( 46 | prompt, 47 | port=small_model_port, 48 | temperature=temperature, 49 | max_tokens=max_tokens, 50 | model=small_model 51 | ) 52 | 53 | usage_dict_small = resp_json.get("usage", {}) 54 | 55 | usage_data.append({ 56 | "Model": big_model, 57 | "ThinkIter": "placeholder", 58 | "DraftVersion": 0, 59 | "PromptTokens": usage_dict_big.get("prompt_tokens", 0), # Always expect this item. 60 | "CompletionTokens": usage_dict_big.get("completion_tokens", 0), # Always expect this item. 61 | "Latency": latency_big, # Always expect this item.
61 | "ModelSmall": small_model, 62 | "PromptTokensSmall": usage_dict_small.get("prompt_tokens", 0), 63 | "CompletionTokensSmall": usage_dict_small.get("completion_tokens", 0), 64 | "LatencySmall": latency_small 65 | 66 | }) 67 | 68 | pprint(usage_data) 69 | final_reply_small = resp_json["choices"][0]["text"] 70 | print("Final reply from small model:\n\n", final_reply_small) 71 | print("\n\nFinal reply from big model:\n\n", final_reply) 72 | return final_reply, usage_data 73 | -------------------------------------------------------------------------------- /training/trl/docker/trl-source-gpu/Dockerfile: -------------------------------------------------------------------------------- 1 | # Builds GPU docker image of PyTorch 2 | # Uses multi-staged approach to reduce size 3 | # Stage 1 4 | # Use base conda image to reduce time 5 | FROM continuumio/miniconda3:latest AS compile-image 6 | # Specify py version 7 | ENV PYTHON_VERSION=3.10 8 | # Install apt libs - copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile 9 | RUN apt-get update && \ 10 | apt-get install -y curl git wget software-properties-common git-lfs && \ 11 | apt-get clean && \ 12 | rm -rf /var/lib/apt/lists* 13 | 14 | # Install audio-related libraries 15 | RUN apt-get update && \ 16 | apt install -y ffmpeg 17 | 18 | RUN apt install -y libsndfile1-dev 19 | RUN git lfs install 20 | 21 | # Create our conda env - copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile 22 | RUN conda create --name trl python=${PYTHON_VERSION} ipython jupyter pip 23 | RUN python3 -m pip install --no-cache-dir --upgrade pip 24 | 25 | # Below is copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile 26 | # We don't install pytorch here yet since CUDA isn't available 27 | # instead we use the direct torch wheel 28 | ENV PATH /opt/conda/envs/trl/bin:$PATH 29 | # Activate our bash shell 30 | RUN chsh -s /bin/bash 31 | SHELL ["/bin/bash", "-c"] 32 | 33 | # Stage 2 34 | FROM nvidia/cuda:12.2.2-devel-ubuntu22.04 AS build-image 35 | COPY --from=compile-image /opt/conda /opt/conda 36 | ENV PATH /opt/conda/bin:$PATH 37 | 38 | RUN chsh -s /bin/bash 39 | SHELL ["/bin/bash", "-c"] 40 | RUN source activate trl && \ 41 | python3 -m pip install --no-cache-dir bitsandbytes optimum auto-gptq 42 | 43 | # Install apt libs 44 | RUN apt-get update && \ 45 | apt-get install -y curl git wget && \ 46 | apt-get clean && \ 47 | rm -rf /var/lib/apt/lists* 48 | 49 | # Activate the conda env and install transformers + accelerate from source 50 | RUN source activate trl && \ 51 | python3 -m pip install -U --no-cache-dir \ 52 | librosa \ 53 | "soundfile>=0.12.1" \ 54 | scipy \ 55 | git+https://github.com/huggingface/transformers \ 56 | git+https://github.com/huggingface/accelerate \ 57 | git+https://github.com/huggingface/peft \ 58 | trl[test]@git+https://github.com/huggingface/trl 59 | 60 | RUN source activate trl && \ 61 | pip freeze | grep transformers 62 | 63 | RUN echo "source activate trl" >> ~/.profile 64 | 65 | # Activate the virtualenv 66 | CMD ["/bin/bash"] -------------------------------------------------------------------------------- /training/trl/tests/test_rich_progress_callback.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import tempfile 16 | import unittest 17 | 18 | import torch 19 | import torch.nn as nn 20 | from datasets import Dataset 21 | from transformers import Trainer, TrainingArguments 22 | 23 | from trl.trainer.callbacks import RichProgressCallback 24 | 25 | 26 | class DummyModel(nn.Module): 27 | def __init__(self): 28 | super().__init__() 29 | self.a = nn.Parameter(torch.tensor(1.0)) 30 | 31 | def forward(self, x): 32 | return self.a * x 33 | 34 | 35 | class TestRichProgressCallback(unittest.TestCase): 36 | def setUp(self): 37 | self.dummy_model = DummyModel() 38 | self.dummy_train_dataset = Dataset.from_list([{"x": 1.0, "y": 2.0}] * 5) 39 | self.dummy_val_dataset = Dataset.from_list([{"x": 1.0, "y": 2.0}] * 101) 40 | 41 | def test_rich_progress_callback_logging(self): 42 | with tempfile.TemporaryDirectory() as tmp_dir: 43 | training_args = TrainingArguments( 44 | output_dir=tmp_dir, 45 | per_device_eval_batch_size=2, 46 | per_device_train_batch_size=2, 47 | num_train_epochs=4, 48 | eval_strategy="steps", 49 | eval_steps=1, 50 | logging_strategy="steps", 51 | logging_steps=1, 52 | save_strategy="no", 53 | report_to="none", 54 | disable_tqdm=True, 55 | ) 56 | callbacks = [RichProgressCallback()] 57 | trainer = Trainer( 58 | model=self.dummy_model, 59 | train_dataset=self.dummy_train_dataset, 60 | eval_dataset=self.dummy_val_dataset, 61 | args=training_args, 62 | callbacks=callbacks, 63 | ) 64 | 65 | trainer.train() 66 | trainer.train() 67 | -------------------------------------------------------------------------------- /training/trl/examples/research_projects/stack_llama_2/scripts/README.md: -------------------------------------------------------------------------------- 1 | # DPO pipeline for the creation of StackLlaMa 2: a Stack exchange llama-v2-7b model 2 | 3 | ## Prerequisites 4 | 5 | Install all the dependencies in the `requirements.txt`: 6 | 7 | ``` 8 | $ pip install -U -r requirements.txt 9 | ``` 10 | 11 | Since we will use `accelerate` for training, make sure to run: 12 | ``` 13 | $ accelerate config 14 | ``` 15 | 16 | ## Training 17 | 18 | There were two main steps to the DPO training process: 19 | 1. Supervised fine-tuning of the base llama-v2-7b model to create llama-v2-7b-se: 20 | 21 | ``` 22 | accelerate launch examples/research_projects/stack_llama_2/scripts/sft_llama2.py \ 23 | --output_dir="./sft" \ 24 | --max_steps=500 \ 25 | --logging_steps=10 \ 26 | --save_steps=10 \ 27 | --per_device_train_batch_size=4 \ 28 | --per_device_eval_batch_size=1 \ 29 | --gradient_accumulation_steps=2 \ 30 | --gradient_checkpointing=False \ 31 | --group_by_length=False \ 32 | --learning_rate=1e-4 \ 33 | --lr_scheduler_type="cosine" \ 34 | --warmup_steps=100 \ 35 | --weight_decay=0.05 \ 36 | --optim="paged_adamw_32bit" \ 37 | --bf16=True \ 38 | --remove_unused_columns=False \ 39 | --run_name="sft_llama2" \ 40 | --report_to="wandb" 41 | ``` 42 | 2. 
Run the DPO trainer using the model saved by the previous step: 43 | ``` 44 | accelerate launch examples/research_projects/stack_llama_2/scripts/dpo_llama2.py \ 45 | --model_name_or_path="sft/final_checkpoint" \ 46 | --output_dir="dpo" 47 | ``` 48 | 49 | 50 | ## Merging the adaptors 51 | 52 | To merge the adaptors into the base model we can use the `merge_peft_adapter.py` helper script that comes with TRL: 53 | 54 | ``` 55 | python examples/research_projects/stack_llama/scripts/merge_peft_adapter.py --base_model_name="meta-llama/Llama-2-7b-hf" --adapter_model_name="dpo/final_checkpoint/" --output_name="stack-llama-2" 56 | ``` 57 | 58 | which will also push the model to your HuggingFace hub account. 59 | 60 | ## Running the model 61 | 62 | We can load the DPO-trained LoRA adaptors that were saved by the DPO training step via: 63 | 64 | ```py 65 | import torch 66 | from peft import AutoPeftModelForCausalLM 67 | 68 | model = AutoPeftModelForCausalLM.from_pretrained( 69 | "dpo/final_checkpoint", 70 | low_cpu_mem_usage=True, 71 | torch_dtype=torch.float16, 72 | load_in_4bit=True, 73 | ) 74 | 75 | model.generate(...) 76 | ``` 77 | -------------------------------------------------------------------------------- /training/open-r1/scripts/run_benchmarks.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | from dataclasses import dataclass, field 15 | from typing import List, Optional 16 | 17 | from open_r1.utils.evaluation import SUPPORTED_BENCHMARKS, run_benchmark_jobs 18 | from open_r1.configs import SFTConfig 19 | from trl import ModelConfig, TrlParser 20 | 21 | 22 | @dataclass 23 | class ScriptArguments: 24 | model_id: str = field( 25 | default="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", 26 | metadata={"help": "The Hub model id to push the model to."}, 27 | ) 28 | model_revision: str = field(default="main", metadata={"help": "The Hub model branch to push the model to."}) 29 | trust_remote_code: bool = field(default=False, metadata={"help": "Trust the remote code."}) 30 | benchmarks: List[str] = field( 31 | default_factory=lambda: [], metadata={"help": "The benchmarks to run after training."} 32 | ) 33 | list_benchmarks: bool = field(default=False, metadata={"help": "List all supported benchmarks."}) 34 | system_prompt: Optional[str] = field( 35 | default=None, metadata={"help": "The system prompt to use for the benchmark."} 36 | ) 37 | 38 | 39 | def main(): 40 | parser = TrlParser(ScriptArguments) 41 | args = parser.parse_args_and_config()[0] 42 | if args.list_benchmarks: 43 | print("Supported benchmarks:") 44 | for benchmark in SUPPORTED_BENCHMARKS: 45 | print(f" - {benchmark}") 46 | return 47 | benchmark_args = SFTConfig( 48 | output_dir="", 49 | hub_model_id=args.model_id, 50 | hub_model_revision=args.model_revision, 51 | benchmarks=args.benchmarks, 52 | system_prompt=args.system_prompt, 53 | ) 54 | run_benchmark_jobs( 55 | benchmark_args, 56 | ModelConfig(model_name_or_path="", model_revision="", trust_remote_code=args.trust_remote_code), 57 | ) 58 | 59 | 60 | if __name__ == "__main__": 61 | main() 62 | -------------------------------------------------------------------------------- /lm_eval_files/aime/README.md: -------------------------------------------------------------------------------- 1 | # GSM8k 2 | 3 | ## Paper 4 | Training Verifiers to Solve Math Word Problems 5 | https://arxiv.org/abs/2110.14168 6 | 7 | State-of-the-art language models can match human performance on many tasks, but 8 | they still struggle to robustly perform multi-step mathematical reasoning. To 9 | diagnose the failures of current models and support research, we introduce GSM8K, 10 | a dataset of 8.5K high quality linguistically diverse grade school math word problems. 11 | We find that even the largest transformer models fail to achieve high test performance, 12 | despite the conceptual simplicity of this problem distribution. 13 | 14 | NOTE: See the official implementation of the task: 15 | https://github.com/openai/grade-school-math/blob/master/grade_school_math/calculator.py 16 | for how to make use of the dataset's calculator annotations in your language 17 | model's sample/generation function. 
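For orientation, GSM8K solutions embed calculator annotations of the form `<<expression=result>>`. The sketch below is a hypothetical illustration of resolving them after the fact (the official implementation linked above instead pauses sampling after `=` and feeds the computed value back into the generation); `resolve_annotations` is not part of the harness:

```python
import re

# GSM8K-style calculator annotation: "<<48/2=24>>"
ANNOTATION = re.compile(r"<<([^<>=]+)=([^<>]*)>>")

def resolve_annotations(text: str) -> str:
    """Recompute each <<expr=result>> annotation and strip the markers."""
    def _compute(match: re.Match) -> str:
        try:
            # Dataset expressions are simple arithmetic; eval is for illustration only.
            return str(eval(match.group(1)))
        except Exception:
            # Fall back to the result recorded in the annotation itself.
            return match.group(2)
    return ANNOTATION.sub(_compute, text)

print(resolve_annotations("Half of 48 is <<48/2=24>> clips."))  # -> "Half of 48 is 24.0 clips."
```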
18 | 19 | Homepage: https://github.com/openai/grade-school-math 20 | 21 | 22 | ## Citation 23 | ``` 24 | @misc{cobbe2021training, 25 | title={Training Verifiers to Solve Math Word Problems}, 26 | author={Karl Cobbe and Vineet Kosaraju and Mohammad Bavarian and Jacob Hilton and Reiichiro Nakano and Christopher Hesse and John Schulman}, 27 | year={2021}, 28 | eprint={2110.14168}, 29 | archivePrefix={arXiv}, 30 | primaryClass={cs.LG} 31 | } 32 | ``` 33 | 34 | ### Groups and Tasks 35 | 36 | #### Groups 37 | 38 | - `math_word_problems` 39 | - `chain_of_thought` 40 | - `self_consistency` 41 | 42 | #### Tasks 43 | 44 | - `gsm8k_yaml` 45 | - `gsm8k_cot`: GSM8K with Chain-of-Thought 46 | - `gsm8k_cot_self_consistency`: GSM8K with Chain-of-Thought and Self-Consistency 47 | - `gsm8k_cot_llama`: GSM8K with prompt formatting modified to conform to the evaluation settings described by Meta here: https://huggingface.co/datasets/meta-llama/Meta-Llama-3.1-8B-Instruct-evals/viewer/Meta-Llama-3.1-8B-Instruct-evals__gsm8k__details?row=0 48 | - Use this task with --fewshot_as_multiturn and --apply_chat_template to replicate Meta's reported performance. 49 | 50 | 51 | ### Checklist 52 | 53 | - [x] Is in Eval-harness v1.0? 54 | - [ ] Has been checked for regression from v1.0? 55 | - [ ] Has been checked for equivalence with original paper methodology? 56 | - [ ] "Main" checked variant clearly denoted? 57 | 58 | ### Variant Wishlist 59 | 60 | - [ ] Variant with Calculator (see https://github.com/openai/grade-school-math/blob/master/grade_school_math/calculator.py for example implementation) 61 | - [ ] Using Verifiers 62 | - [ ] Majority voting "without CoT" 63 | -------------------------------------------------------------------------------- /lm_eval_files/openai/gpqa_diamond_openai.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: Idavidrein/gpqa 2 | tag: gpqa 3 | dataset_name: gpqa_diamond 4 | task: gpqa_diamond_openai 5 | output_type: generate_until 6 | process_docs: !function utils.process_docs 7 | process_results: !function utils.process_results 8 | training_split: train 9 | # Because huggingface dataset only has train split 10 | validation_split: train 11 | test_split: null 12 | # https://github.com/openai/simple-evals/blob/c0dba4c7bfbc17f786aec7bd7c3585a36ad81f23/common.py#L12 13 | # doc_to_text: "Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering but always provide a final answer within the word limit.\n\n{{Question}}\n\nA) {{choice1}}\nB) {{choice2}}\nC) {{choice3}}\nD) {{choice4}}" 14 | # doc_to_text: "Answer the following multiple-choice question. Your response must adhere to these rules: 15 | # 1. Think step by step to arrive at the correct answer. 16 | # 2. Avoid repeating reasoning or steps already stated. 17 | # 3. Ensure your response is within the word limit. 18 | # 4. Conclude with the final answer in the format: 'Answer: $LETTER' (without quotes), where LETTER is one of ABCD.
19 | 20 | # {{Question}} 21 | 22 | # A) {{choice1}} 23 | # B) {{choice2}} 24 | # C) {{choice3}} 25 | # D) {{choice4}}" 26 | # doc_to_text: "{{Question}}\nAnswer Choices: (A) {{choice1}}\n(B) {{choice2}}\n(C) {{choice3}}\n(D) {{choice4}}\nOutput your final answer in boxes, such as \\boxed{A}" 27 | # Original prompt from the QwQ team: 28 | # doc_to_text: "{{Question}}\nAnswer Choices: (A) {{choice1}} (B) {{choice2}} (C) {{choice3}} (D) {{choice4}}\nOutput your final answer in boxes, such as \\boxed{A}." 29 | # doc_to_text: "{{Question}}\n\nA) {{choice1}}\nB) {{choice2}}\nC) {{choice3}}\nD) {{choice4}}" 30 | doc_to_text: !function utils.doc_to_text_gpqa 31 | doc_to_target: answer 32 | num_fewshot: 0 33 | generation_kwargs: 34 | until: [] 35 | do_sample: false 36 | temperature: 0 # Do 0.5? https://github.com/openai/simple-evals/blob/c0dba4c7bfbc17f786aec7bd7c3585a36ad81f23/sampler/chat_completion_sampler.py#L26 37 | max_gen_toks: 1024 # https://github.com/openai/simple-evals/blob/c0dba4c7bfbc17f786aec7bd7c3585a36ad81f23/sampler/chat_completion_sampler.py#L27 38 | metric_list: 39 | - metric: exact_match 40 | aggregation: mean 41 | higher_is_better: true 42 | - metric: extracted_answers 43 | aggregation: bypass 44 | higher_is_better: true 45 | metadata: 46 | version: 1.0 47 | -------------------------------------------------------------------------------- /lm_eval_files/openai/gpqa_diamond_openai_maj64_cov64.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: Idavidrein/gpqa 2 | tag: gpqa 3 | dataset_name: gpqa_diamond 4 | task: gpqa_diamond_openai_maj64_cov64_train 5 | output_type: generate_until 6 | process_docs: !function utils.process_docs 7 | process_results: !function utils.process_results 8 | training_split: train 9 | # Because huggingface dataset only has train split 10 | validation_split: train 11 | test_split: null 12 | # https://github.com/openai/simple-evals/blob/c0dba4c7bfbc17f786aec7bd7c3585a36ad81f23/common.py#L12 13 | doc_to_text: "Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering but always provide a final answer within the word limit.\n\n{{Question}}\n\nA) {{choice1}}\nB) {{choice2}}\nC) {{choice3}}\nD) {{choice4}}" 14 | doc_to_target: answer 15 | num_fewshot: 0 16 | generation_kwargs: 17 | until: [] 18 | do_sample: false 19 | temperature: 0.5 # Do 0.5? https://github.com/openai/simple-evals/blob/c0dba4c7bfbc17f786aec7bd7c3585a36ad81f23/sampler/chat_completion_sampler.py#L26 20 | max_gen_toks: 1024 # https://github.com/openai/simple-evals/blob/c0dba4c7bfbc17f786aec7bd7c3585a36ad81f23/sampler/chat_completion_sampler.py#L27 21 | metric_list: 22 | - metric: exact_match 23 | aggregation: mean 24 | higher_is_better: true 25 | repeats: 64 26 | filter_list: 27 | - name: "score-first" # pick only the first response, and report metrics on that 28 | filter: 29 | - function: "take_first" 30 | - name: "maj@64" 31 | filter: 32 | - function: "majority_vote" 33 | - function: "take_first" 34 | - name: "maj@16" # get Maj@16, via selecting the first 16 responses. Using a better estimator would be optimal. 35 | filter: 36 | - function: "take_first_k" 37 | k: 16 38 | - function: "majority_vote" 39 | - function: "take_first" 40 | - name: "maj@8" # get Maj@8, via selecting the first 8 responses. Using a better estimator would be optimal.
41 | filter: 42 | - function: "take_first_k" 43 | k: 8 44 | - function: "majority_vote" 45 | - function: "take_first" 46 | - name: "cov@64" # get coverage@64 , via allowing all 64 samples and then picking only the correct one in the evaluator. 47 | filter: 48 | - function: "take_first_k" 49 | k: 64 50 | - name: "cov@16" 51 | filter: 52 | - function: "take_first_k" 53 | k: 16 54 | - name: "cov@8" 55 | filter: 56 | - function: "take_first_k" 57 | k: 8 58 | metadata: 59 | version: 1.0 60 | -------------------------------------------------------------------------------- /training/trl/trl/models/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from typing import TYPE_CHECKING 16 | 17 | from ..import_utils import OptionalDependencyNotAvailable, _LazyModule, is_diffusers_available 18 | 19 | 20 | _import_structure = { 21 | "modeling_base": ["GeometricMixtureWrapper", "PreTrainedModelWrapper", "create_reference_model"], 22 | "modeling_value_head": ["AutoModelForCausalLMWithValueHead", "AutoModelForSeq2SeqLMWithValueHead"], 23 | "utils": [ 24 | "SUPPORTED_ARCHITECTURES", 25 | "prepare_deepspeed", 26 | "prepare_fsdp", 27 | "setup_chat_format", 28 | "unwrap_model_for_generation", 29 | ], 30 | } 31 | 32 | try: 33 | if not is_diffusers_available(): 34 | raise OptionalDependencyNotAvailable() 35 | except OptionalDependencyNotAvailable: 36 | pass 37 | else: 38 | _import_structure["modeling_sd_base"] = [ 39 | "DDPOPipelineOutput", 40 | "DDPOSchedulerOutput", 41 | "DDPOStableDiffusionPipeline", 42 | "DefaultDDPOStableDiffusionPipeline", 43 | ] 44 | 45 | if TYPE_CHECKING: 46 | from .modeling_base import GeometricMixtureWrapper, PreTrainedModelWrapper, create_reference_model 47 | from .modeling_value_head import AutoModelForCausalLMWithValueHead, AutoModelForSeq2SeqLMWithValueHead 48 | from .utils import ( 49 | SUPPORTED_ARCHITECTURES, 50 | prepare_deepspeed, 51 | prepare_fsdp, 52 | setup_chat_format, 53 | unwrap_model_for_generation, 54 | ) 55 | 56 | try: 57 | if not is_diffusers_available(): 58 | raise OptionalDependencyNotAvailable() 59 | except OptionalDependencyNotAvailable: 60 | pass 61 | else: 62 | from .modeling_sd_base import ( 63 | DDPOPipelineOutput, 64 | DDPOSchedulerOutput, 65 | DDPOStableDiffusionPipeline, 66 | DefaultDDPOStableDiffusionPipeline, 67 | ) 68 | else: 69 | import sys 70 | 71 | sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) 72 | -------------------------------------------------------------------------------- /training/trl/docs/source/judges.md: -------------------------------------------------------------------------------- 1 | # Judges 2 | 3 | 4 | 5 | TRL Judges is an experimental API which is subject to change at any time. 6 | 7 | 8 | 9 | TRL provides judges to easily compare two completions. 
10 | 11 | Make sure to have installed the required dependencies by running: 12 | 13 | ```bash 14 | pip install trl[judges] 15 | ``` 16 | 17 | ## Using the provided judges 18 | 19 | TRL provides several judges out of the box. For example, you can use the `HfPairwiseJudge` to compare two completions using a pre-trained model from the Hugging Face model hub: 20 | 21 | ```python 22 | from trl import HfPairwiseJudge 23 | 24 | judge = HfPairwiseJudge() 25 | judge.judge( 26 | prompts=["What is the capital of France?", "What is the biggest planet in the solar system?"], 27 | completions=[["Paris", "Lyon"], ["Saturn", "Jupiter"]], 28 | ) # Outputs: [0, 1] 29 | ``` 30 | 31 | ## Define your own judge 32 | 33 | To define your own judge, we provide several base classes that you can subclass. For rank-based judges, you need to subclass [`BaseRankJudge`] and implement the [`BaseRankJudge.judge`] method. For pairwise judges, you need to subclass [`BasePairwiseJudge`] and implement the [`BasePairwiseJudge.judge`] method. If you want to define a judge that doesn't fit into these categories, you need to subclass [`BaseJudge`] and implement the [`BaseJudge.judge`] method. 34 | 35 | As an example, let's define a pairwise judge that prefers shorter completions: 36 | 37 | ```python 38 | from trl import BasePairwiseJudge 39 | 40 | class PrefersShorterJudge(BasePairwiseJudge): 41 | def judge(self, prompts, completions, shuffle_order=False): 42 | return [0 if len(completion[0]) < len(completion[1]) else 1 for completion in completions] 43 | ``` 44 | 45 | You can then use this judge as follows: 46 | 47 | ```python 48 | judge = PrefersShorterJudge() 49 | judge.judge( 50 | prompts=["What is the capital of France?", "What is the biggest planet in the solar system?"], 51 | completions=[["Paris", "The capital of France is Paris."], ["Jupiter is the biggest planet in the solar system.", "Jupiter"]], 52 | ) # Outputs: [0, 1] 53 | ``` 54 | 55 | ## Provided judges 56 | 57 | ### PairRMJudge 58 | 59 | [[autodoc]] PairRMJudge 60 | 61 | ### HfPairwiseJudge 62 | 63 | [[autodoc]] HfPairwiseJudge 64 | 65 | ### OpenAIPairwiseJudge 66 | 67 | [[autodoc]] OpenAIPairwiseJudge 68 | 69 | ### AllTrueJudge 70 | 71 | [[autodoc]] AllTrueJudge 72 | 73 | ## Base classes 74 | 75 | ### BaseJudge 76 | 77 | [[autodoc]] BaseJudge 78 | 79 | ### BaseBinaryJudge 80 | 81 | [[autodoc]] BaseBinaryJudge 82 | 83 | ### BaseRankJudge 84 | 85 | [[autodoc]] BaseRankJudge 86 | 87 | ### BasePairwiseJudge 88 | 89 | [[autodoc]] BasePairwiseJudge 90 | -------------------------------------------------------------------------------- /training/trl/docs/source/use_model.md: -------------------------------------------------------------------------------- 1 | # Use model after training 2 | 3 | Once you have trained a model using either the SFTTrainer, PPOTrainer, or DPOTrainer, you will have a fine-tuned model that can be used for text generation. In this section, we'll walk through the process of loading the fine-tuned model and generating text. If you need to run an inference server with the trained model, you can explore libraries such as [`text-generation-inference`](https://github.com/huggingface/text-generation-inference). 4 | 5 | ## Load and Generate 6 | 7 | If you have fine-tuned a model fully, meaning without the use of PEFT, you can simply load it like any other language model in transformers. For example,
the value head that was trained during PPO training is no longer needed, and if you load the model with the original transformer class it will be ignored: 8 | 9 | ```python 10 | from transformers import AutoTokenizer, AutoModelForCausalLM 11 | 12 | model_name_or_path = "kashif/stack-llama-2" # path/to/your/model/or/name/on/hub 13 | device = "cpu" # or "cuda" if you have a GPU 14 | 15 | model = AutoModelForCausalLM.from_pretrained(model_name_or_path).to(device) 16 | tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) 17 | 18 | inputs = tokenizer.encode("This movie was really", return_tensors="pt").to(device) 19 | outputs = model.generate(inputs) 20 | print(tokenizer.decode(outputs[0])) 21 | ``` 22 | 23 | Alternatively, you can also use the pipeline: 24 | 25 | ```python 26 | from transformers import pipeline 27 | 28 | model_name_or_path = "kashif/stack-llama-2" # path/to/your/model/or/name/on/hub 29 | pipe = pipeline("text-generation", model=model_name_or_path) 30 | print(pipe("This movie was really")[0]["generated_text"]) 31 | ``` 32 | 33 | ## Use PEFT Adapters 34 | 35 | ```python 36 | from peft import PeftConfig, PeftModel 37 | from transformers import AutoModelForCausalLM, AutoTokenizer 38 | 39 | base_model_name = "kashif/stack-llama-2" # path/to/your/model/or/name/on/hub 40 | adapter_model_name = "path/to/my/adapter" 41 | 42 | model = AutoModelForCausalLM.from_pretrained(base_model_name) 43 | model = PeftModel.from_pretrained(model, adapter_model_name) 44 | 45 | tokenizer = AutoTokenizer.from_pretrained(base_model_name) 46 | ``` 47 | 48 | You can also merge the adapters into the base model so you can use the model like a normal transformers model; however, the checkpoint will be significantly bigger: 49 | 50 | ```python 51 | model = AutoModelForCausalLM.from_pretrained(base_model_name) 52 | model = PeftModel.from_pretrained(model, adapter_model_name) 53 | 54 | model = model.merge_and_unload() 55 | model.save_pretrained("merged_adapters") 56 | ``` 57 | 58 | Once you have loaded the model and either merged the adapters or kept them separate on top, you can run generation as with a normal model, as outlined above. 59 | -------------------------------------------------------------------------------- /modes/big_model_only.py: -------------------------------------------------------------------------------- 1 | # modes/big_model_only.py 2 | 3 | from pprint import pprint 4 | import os 5 | import datetime 6 | 7 | import time 8 | import uuid 9 | 10 | benchfile = "specR_big.csv" 11 | 12 | def run_bigmodel_flow( 13 | question, 14 | big_model, 15 | big_model_port, 16 | generate_text_vllm, 17 | terminating_string: str, 18 | max_tokens=1024, 19 | temperature=0.6, 20 | sequential_scale=0, 21 | test_logging: bool = False, 22 | token_counter=None, 23 | ): 24 | """ 25 | A baseline 'placeholder' flow: we just send a single request to the 26 | *big_model* and return it as a final answer, plus usage data. 27 | """ 28 | usage_data = [] 29 | 30 | model_think_prefix = "<think>\n" 31 | model_think_suffix = "</think>" 32 | 33 | start_time = time.time() 34 | def _clean(t): # strip special markers 35 | for s in ("<|User|>", "<|Assistant|>", "<|begin▁of▁sentence|>", 36 | "<|end▁of▁sentence|>", "<think>"): 37 | t = t.replace(s, "") 38 | return t 39 | 40 | sequential_iter = 0 # sequential-iter support removed; always run the single-pass flow 41 | if sequential_iter == 0: 42 | big_hint = "" 43 | term_str = "\n Put your final answer within \\boxed{}."
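# Annotation for clarity: the prompt assembled below follows the DeepSeek-R1-Distill
# chat format (<|begin▁of▁sentence|><|User|>{question}<|Assistant|>); the served
# model is then expected to open its own <think> block before emitting the final
# \boxed{} answer.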
44 | cur = (f"<|begin▁of▁sentence|><|User|>{_clean(question)}\n" 45 | f"{big_hint}{term_str}<|Assistant|>\n\n") 46 | prompt = cur 47 | 48 | resp_json, latency = generate_text_vllm( 49 | prompt, 50 | port=big_model_port, 51 | temperature=temperature, 52 | max_tokens=8192, 53 | model=big_model 54 | ) 55 | final_reply = resp_json["choices"][0]["text"] 56 | # final_reply = f"{prompt}{final_reply}" 57 | final_reply = f"{final_reply}" 58 | total_time = time.time() - start_time 59 | total_tokens = token_counter(final_reply) if token_counter else len(final_reply.split()) 60 | time_per_tok = total_time / total_tokens if total_tokens > 0 else 0 61 | uuid_ = str(uuid.uuid4()) 62 | current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") 63 | try: 64 | if not os.path.exists(benchfile): 65 | with open(benchfile, "w") as f: 66 | f.write( 67 | "uuid,big_model,sequential_scale,total_tokens," 68 | "total_time,time_per_tok,datetime\n" 69 | ) 70 | with open(benchfile, "a") as f: 71 | f.write( 72 | f"{uuid_},{big_model},{sequential_scale}," 73 | f"{total_tokens},{total_time},{time_per_tok},{current_time}\n" 74 | ) 75 | except Exception as e: 76 | print(f"Error writing to file: {e}") 77 | print("Please check if the file path is correct and if you have write permissions.") 78 | pass 79 | return final_reply, usage_data 80 | -------------------------------------------------------------------------------- /training/trl/docs/source/distributing_training.md: -------------------------------------------------------------------------------- 1 | # Distributing Training 2 | 3 | 4 | Section under construction. Feel free to contribute! 5 | 6 | 7 | ## Multi-GPU Training with TRL 8 | 9 | The trainers in TRL use [🤗 Accelerate](https://github.com/huggingface/accelerate) to enable distributed training across multiple GPUs or nodes. To do so, first create an [🤗 Accelerate](https://github.com/huggingface/accelerate) config file by running 10 | 11 | ```bash 12 | accelerate config 13 | ``` 14 | 15 | and answering the questions according to your multi-GPU / multi-node setup. You can then launch distributed training by running: 16 | 17 | ```bash 18 | accelerate launch train.py 19 | ``` 20 | 21 | We also provide config files in the [examples folder](https://github.com/huggingface/trl/tree/main/examples/accelerate_configs) that can be used as templates. To use these templates, simply pass the path to the config file when launching a job, e.g.: 22 | 23 | ```shell 24 | accelerate launch --config_file examples/accelerate_configs/multi_gpu.yaml train.py 25 | ``` 26 | 27 | This automatically distributes the workload across all available GPUs. 28 | 29 | Under the hood, [🤗 Accelerate](https://github.com/huggingface/accelerate) creates one model per GPU. Each process: 30 | - Processes its own batch of data 31 | - Computes the loss and gradients for that batch 32 | - Shares gradient updates across all GPUs 33 | 34 | ![](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/multi_gpu.png) 35 | 36 | The effective batch size is calculated as: 37 | 38 | $$ 39 | \text{Batch Size} = \text{per\_device\_train\_batch\_size} \times \text{num\_devices} \times \text{gradient\_accumulation\_steps} 40 | $$ 41 | 42 | To maintain a consistent batch size when scaling to multiple GPUs, make sure to update `per_device_train_batch_size` and `gradient_accumulation_steps` accordingly. 
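As a quick sanity check of this formula, the relationship can be evaluated directly (a throwaway illustration, not a TRL API):

```python
def effective_batch_size(per_device_train_batch_size: int,
                         num_devices: int,
                         gradient_accumulation_steps: int) -> int:
    # Effective batch size = per-device batch size x number of devices x accumulation steps
    return per_device_train_batch_size * num_devices * gradient_accumulation_steps

# The three configurations in the table below all reach an effective batch size of 32:
assert effective_batch_size(32, 1, 1) == 32
assert effective_batch_size(4, 1, 8) == 32
assert effective_batch_size(4, 8, 1) == 32
```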
43 | 44 | For example, these configurations are equivalent and should yield the same results: 45 | 46 | | Number of GPUs | Per device batch size | Gradient accumulation steps | Comments | 47 | | --- | --- | --- | --- | 48 | | 1 | 32 | 1 | Possibly high memory usage, but faster training | 49 | | 1 | 4 | 8 | Lower memory usage, slower training | 50 | | 8 | 4 | 1 | Multi-GPU to get the best of both worlds | 51 | 52 | 53 | 54 | Having one model per GPU can lead to high memory usage, which may not be feasible for large models or low-memory GPUs. In such cases, you can leverage [DeepSpeed](https://github.com/deepspeedai/DeepSpeed), which provides optimizations like model sharding, Zero Redundancy Optimizer, mixed precision training, and offloading to CPU or NVMe. Check out our [DeepSpeed Integration](deepspeed_integration.md) guide for more details. 55 | 56 | 57 | 58 | ## Multi-Node Training 59 | 60 | We're working on a guide for multi-node training. Stay tuned! 🚀 -------------------------------------------------------------------------------- /training/trl/examples/research_projects/stack_llama/scripts/merge_peft_adapter.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from dataclasses import dataclass, field 16 | from typing import Optional 17 | 18 | import torch 19 | from peft import PeftConfig, PeftModel 20 | from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer, HfArgumentParser 21 | 22 | 23 | @dataclass 24 | class ScriptArguments: 25 | """ 26 | The input names representing the Adapter and Base model fine-tuned with PEFT, and the output name representing the 27 | merged model.
28 | """ 29 | 30 | adapter_model_name: Optional[str] = field(default=None, metadata={"help": "the adapter name"}) 31 | base_model_name: Optional[str] = field(default=None, metadata={"help": "the base model name"}) 32 | output_name: Optional[str] = field(default=None, metadata={"help": "the merged model name"}) 33 | 34 | 35 | parser = HfArgumentParser(ScriptArguments) 36 | script_args = parser.parse_args_into_dataclasses()[0] 37 | assert script_args.adapter_model_name is not None, "please provide the name of the Adapter you would like to merge" 38 | assert script_args.base_model_name is not None, "please provide the name of the Base model" 39 | assert script_args.output_name is not None, "please provide the output name of the merged model" 40 | 41 | peft_config = PeftConfig.from_pretrained(script_args.adapter_model_name) 42 | if peft_config.task_type == "SEQ_CLS": 43 | # The sequence classification task is used for the reward model in PPO 44 | model = AutoModelForSequenceClassification.from_pretrained( 45 | script_args.base_model_name, num_labels=1, torch_dtype=torch.bfloat16 46 | ) 47 | else: 48 | model = AutoModelForCausalLM.from_pretrained( 49 | script_args.base_model_name, return_dict=True, torch_dtype=torch.bfloat16 50 | ) 51 | 52 | tokenizer = AutoTokenizer.from_pretrained(script_args.base_model_name) 53 | 54 | # Load the PEFT model 55 | model = PeftModel.from_pretrained(model, script_args.adapter_model_name) 56 | model.eval() 57 | 58 | model = model.merge_and_unload() 59 | 60 | model.save_pretrained(f"{script_args.output_name}") 61 | tokenizer.save_pretrained(f"{script_args.output_name}") 62 | model.push_to_hub(f"{script_args.output_name}", use_temp_dir=False) 63 | -------------------------------------------------------------------------------- /modes/small_model_only.py: -------------------------------------------------------------------------------- 1 | # modes/small_model_only.py 2 | 3 | from pprint import pprint 4 | import os 5 | import datetime 6 | import time 7 | import uuid 8 | 9 | benchfile = "specR_small.csv" 10 | 11 | def run_smallmodel_flow( 12 | question, 13 | small_model, 14 | small_model_port, 15 | generate_text_vllm, 16 | terminating_string: str, 17 | max_tokens=1024, 18 | temperature=0.6, 19 | test_logging: bool = False, 20 | sequential_scale=0, 21 | token_counter=None 22 | ): 23 | """ 24 | A baseline 'placeholder' flow: we just send a single request to the 25 | *small_model* and return it as a final answer, plus usage data. 26 | """ 27 | usage_data = [] 28 | 29 | model_think_prefix = "<think>\n" 30 | model_think_suffix = "</think>" 31 | 32 | bigmodel_str = "You always use <bigmodel> ... </bigmodel> to mark parts of the reasoning process that are important." 33 | start_time = time.time() 34 | def _clean(t): 35 | for s in ("<|User|>", "<|Assistant|>", "<|begin▁of▁sentence|>", 36 | "<|end▁of▁sentence|>", "<think>"): 37 | t = t.replace(s, "") 38 | return t 39 | 40 | sequential_iter = 0 # sequential-iter support removed; always run the single-pass flow 41 | if sequential_iter == 0: 42 | big_hint = "" 43 | term_str = "\n Put your final answer within \\boxed{}."
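# Annotation for clarity: this mirrors run_bigmodel_flow in modes/big_model_only.py;
# the only differences are that the request is routed to the small model's vLLM port
# and that usage is logged to specR_small.csv instead of specR_big.csv.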
44 | cur = (f"<|begin▁of▁sentence|><|User|>{_clean(question)}\n" 45 | f"{big_hint}{term_str}<|Assistant|>\n\n") 46 | prompt = cur 47 | 48 | # prompt[:len(prompt)//2], 49 | resp_json, latency = generate_text_vllm( 50 | prompt, 51 | port=small_model_port, 52 | temperature=temperature, 53 | max_tokens=8192, 54 | model=small_model 55 | ) 56 | final_reply = resp_json["choices"][0]["text"] 57 | final_reply_small = f"{final_reply}" 58 | total_time = time.time() - start_time 59 | total_tokens = token_counter(final_reply_small) if token_counter else len(final_reply_small.split()) 60 | time_per_tok = total_time / total_tokens if total_tokens > 0 else 0 61 | uuid_ = str(uuid.uuid4()) 62 | current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") 63 | 64 | try: 65 | if not os.path.exists(benchfile): 66 | with open(benchfile, "w") as f: 67 | f.write( 68 | "uuid,small_model,sequential_scale,total_tokens," 69 | "total_time,time_per_tok,datetime\n" 70 | ) 71 | with open(benchfile, "a") as f: 72 | f.write( 73 | f"{uuid_},{small_model},{sequential_scale}," 74 | f"{total_tokens},{total_time},{time_per_tok},{current_time}\n" 75 | ) 76 | except Exception as e: 77 | print(f"Error writing to file: {e}") 78 | print("Please check if the file path is correct and if you have write permissions.") 79 | pass 80 | return final_reply_small, usage_data 81 | -------------------------------------------------------------------------------- /training/trl/.github/ISSUE_TEMPLATE/bug-report.yml: -------------------------------------------------------------------------------- 1 | name: "\U0001F41B Bug Report" 2 | description: Submit a bug report to help us improve TRL 3 | labels: [ "bug" ] 4 | body: 5 | - type: markdown 6 | attributes: 7 | value: | 8 | Thanks for taking the time to fill out this bug report! 🤗 9 | 10 | 🚩 If it is your first time submitting, be sure to check our [bug report guidelines](https://github.com/huggingface/trl/blob/main/CONTRIBUTING.md#did-you-find-a-bug) 11 | 12 | - type: textarea 13 | id: reproduction 14 | validations: 15 | required: true 16 | attributes: 17 | label: Reproduction 18 | description: | 19 | Please provide a code sample that reproduces the problem you ran into. It can be a Colab link or just a code snippet. 20 | If you have code snippets, error messages, stack traces please provide them here as well. 21 | Important! Use code tags to correctly format your code. See https://help.github.com/en/github/writing-on-github/creating-and-highlighting-code-blocks#syntax-highlighting 22 | Do not use screenshots, as they are hard to read and (more importantly) don't allow others to copy-and-paste your code. 23 | 24 | value: | 25 | ```python 26 | from trl import ... 27 | 28 | ``` 29 | 30 | outputs: 31 | 32 | ``` 33 | Traceback (most recent call last): 34 | File "example.py", line 42, in 35 | ... 36 | ``` 37 | 38 | - type: textarea 39 | id: system-info 40 | attributes: 41 | label: System Info 42 | description: | 43 | Please provide information about your system: platform, Python version, PyTorch version, Transformers version, devices, TRL version, ... 44 | You can get this information by running `trl env` in your terminal. 45 | 46 | placeholder: Copy-paste the output of `trl env` 47 | validations: 48 | required: true 49 | 50 | - type: checkboxes 51 | id: terms 52 | attributes: 53 | label: Checklist 54 | description: | 55 | Before submitting, please confirm that you've completed each of the following. 56 | If an item doesn't apply to your issue, check it anyway to show you've reviewed it. 
57 | options: 58 | - label: "I have checked that my issue isn't already filed (see [open issues](https://github.com/huggingface/trl/issues?q=is%3Aissue))" 59 | required: true 60 | - label: "I have included my system information" 61 | required: true 62 | - label: "Any code provided is minimal, complete, and reproducible ([more on MREs](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/creating-and-highlighting-code-blocks))" 63 | required: true 64 | - label: "Any code provided is properly formatted in code blocks, (no screenshot, [more on code blocks](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/creating-and-highlighting-code-blocks))" 65 | required: true 66 | - label: "Any traceback provided is complete" 67 | required: true 68 | -------------------------------------------------------------------------------- /training/trl/tests/test_modeling_geometric_mixture_wrapper.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import unittest 16 | 17 | import torch 18 | from transformers import AutoModelForCausalLM, GenerationConfig 19 | 20 | from trl.models.modeling_base import GeometricMixtureWrapper, create_reference_model 21 | 22 | 23 | class TestGeometricMixtureWrapper(unittest.TestCase): 24 | def setUp(self): 25 | model_id = "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5" 26 | self.model = AutoModelForCausalLM.from_pretrained(model_id) 27 | self.ref_model = create_reference_model(self.model) 28 | self.generation_config = GenerationConfig.from_pretrained(model_id) 29 | self.mixture_coef = 0.5 30 | self.wrapper = GeometricMixtureWrapper( 31 | self.model, self.ref_model, self.generation_config, mixture_coef=self.mixture_coef 32 | ) 33 | 34 | def test_forward(self): 35 | input_ids = torch.tensor([[1, 2, 3, 4, 5]]) 36 | attention_mask = torch.ones_like(input_ids) 37 | 38 | output = self.wrapper(input_ids=input_ids, attention_mask=attention_mask) 39 | 40 | self.assertIsNotNone(output) 41 | self.assertTrue(hasattr(output, "logits")) 42 | self.assertEqual(output.logits.shape, (1, 5, self.model.config.vocab_size)) 43 | 44 | def test_mixture_coefficient(self): 45 | input_ids = torch.tensor([[1, 2, 3, 4, 5]]) 46 | attention_mask = torch.ones_like(input_ids) 47 | 48 | with torch.no_grad(): 49 | model_output = self.model(input_ids=input_ids, attention_mask=attention_mask) 50 | ref_model_output = self.ref_model(input_ids=input_ids, attention_mask=attention_mask) 51 | wrapper_output = self.wrapper(input_ids=input_ids, attention_mask=attention_mask) 52 | 53 | expected_logits = torch.nn.functional.log_softmax( 54 | self.mixture_coef * ref_model_output.logits + (1 - self.mixture_coef) * model_output.logits, dim=-1 55 | ) 56 | 57 | self.assertTrue(torch.allclose(wrapper_output.logits, expected_logits, atol=1e-5)) 58 | 59 | def 
test_prepare_inputs_for_generation(self): 60 | input_ids = torch.tensor([[1, 2, 3, 4, 5]]) 61 | attention_mask = torch.ones_like(input_ids) 62 | 63 | inputs = self.wrapper.prepare_inputs_for_generation(input_ids, attention_mask=attention_mask, use_cache=True) 64 | 65 | self.assertIn("input_ids", inputs) 66 | self.assertIn("attention_mask", inputs) 67 | self.assertFalse(inputs.get("use_cache", False)) 68 | -------------------------------------------------------------------------------- /training/trl/docs/source/sentiment_tuning.md: -------------------------------------------------------------------------------- 1 | # Sentiment Tuning Examples 2 | 3 | The notebooks and scripts in these examples show how to fine-tune a model with a sentiment classifier (such as `lvwerra/distilbert-imdb`). 4 | 5 | Here's an overview of the notebooks and scripts in the [trl repository](https://github.com/huggingface/trl/tree/main/examples): 6 | 7 | 8 | 9 | | File | Description | 10 | |------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------| 11 | | [`examples/scripts/ppo.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/ppo.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/trl/blob/main/examples/sentiment/notebooks/gpt2-sentiment.ipynb) | This script shows how to use the `PPOTrainer` to fine-tune a sentiment analysis model using the IMDB dataset | 12 | | [`examples/notebooks/gpt2-sentiment.ipynb`](https://github.com/huggingface/trl/tree/main/examples/notebooks/gpt2-sentiment.ipynb) | This notebook demonstrates how to reproduce the GPT2 IMDB sentiment tuning example in a Jupyter notebook. | 13 | | [`examples/notebooks/gpt2-control.ipynb`](https://github.com/huggingface/trl/tree/main/examples/notebooks/gpt2-control.ipynb) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/trl/blob/main/examples/sentiment/notebooks/gpt2-sentiment-control.ipynb) | This notebook demonstrates how to reproduce the GPT2 sentiment control example in a Jupyter notebook. | 14 | 15 | 16 | 17 | ## Usage 18 | 19 | ```bash 20 | # 1. run directly 21 | python examples/scripts/ppo.py 22 | # 2. run via `accelerate` (recommended), enabling more features (e.g., multiple GPUs, deepspeed) 23 | accelerate config # will prompt you to define the training configuration 24 | accelerate launch examples/scripts/ppo.py # launches training 25 | # 3. get help text and documentation 26 | python examples/scripts/ppo.py --help 27 | # 4. configure logging with wandb and, say, mini_batch_size=1 and gradient_accumulation_steps=16 28 | python examples/scripts/ppo.py --log_with wandb --mini_batch_size 1 --gradient_accumulation_steps 16 29 | ``` 30 | 31 | Note: if you don't want to log with `wandb`, remove `log_with="wandb"` in the scripts/notebooks. You can also replace it with your favourite experiment tracker that's [supported by `accelerate`](https://huggingface.co/docs/accelerate/usage_guides/tracking). 32 | 33 | 34 | ## A few notes on multi-GPU 35 | 36 | To run in a multi-GPU setup with DDP (Distributed Data Parallel), change the `device_map` value to `device_map={"": Accelerator().process_index}` and make sure to run your script with `accelerate launch yourscript.py`.
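A minimal sketch of that change (the model id below is a placeholder; any causal LM from the Hub works the same way):

```python
from accelerate import Accelerator
from transformers import AutoModelForCausalLM

# With DDP, each process holds a full model replica pinned to its own GPU, so the
# device map assigns the entire model (the "" prefix) to this process's device.
model = AutoModelForCausalLM.from_pretrained(
    "lvwerra/gpt2-imdb",  # placeholder model id
    device_map={"": Accelerator().process_index},
)
```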
If you want to apply naive pipeline parallelism, you can use `device_map="auto"`. -------------------------------------------------------------------------------- /training/trl/docs/source/best_of_n.md: -------------------------------------------------------------------------------- 1 | # Best of N sampling: Alternative ways to get better model output without RL-based fine-tuning 2 | 3 | Within the extras module is the `best-of-n` sampler class that serves as an alternative method of generating better model output. 4 | For a comparison with RL-based fine-tuning, see the comparison example in the `examples` directory. 5 | 6 | ## Usage 7 | 8 | To get started quickly, create an instance of the class with a model, a length sampler, a tokenizer, and a callable that serves as a proxy reward pipeline and outputs reward scores for input queries: 9 | 10 | ```python 11 | 12 | from transformers import pipeline, AutoTokenizer 13 | from trl import AutoModelForCausalLMWithValueHead 14 | from trl.core import LengthSampler 15 | from trl.extras import BestOfNSampler 16 | 17 | model = AutoModelForCausalLMWithValueHead.from_pretrained(model_name) 18 | reward_pipe = pipeline("sentiment-analysis", model=reward_model, device=device) 19 | tokenizer = AutoTokenizer.from_pretrained(model_name) 20 | tokenizer.pad_token = tokenizer.eos_token 21 | 22 | 23 | # callable that takes a list of raw text and returns a list of corresponding reward scores 24 | def queries_to_scores(list_of_strings): 25 | return [output["score"] for output in reward_pipe(list_of_strings)] 26 | 27 | best_of_n = BestOfNSampler(model, tokenizer, queries_to_scores, length_sampler=output_length_sampler) 28 | 29 | 30 | ``` 31 | 32 | And assuming you have a list/tensor of tokenized queries, you can generate better output by calling the `generate` method: 33 | 34 | ```python 35 | 36 | best_of_n.generate(query_tensors, device=device, **gen_kwargs) 37 | 38 | ``` 39 | The default sample size is 4, but you can change it at the time of instance initialization like so: 40 | 41 | ```python 42 | 43 | best_of_n = BestOfNSampler(model, tokenizer, queries_to_scores, length_sampler=output_length_sampler, sample_size=8) 44 | 45 | ``` 46 | 47 | The default output is the result of taking the top-scored output for each query, but you can change it to top 2 and so on by passing the `n_candidates` argument at the time of instance initialization: 48 | 49 | ```python 50 | 51 | best_of_n = BestOfNSampler(model, tokenizer, queries_to_scores, length_sampler=output_length_sampler, n_candidates=2) 52 | 53 | ``` 54 | 55 | There is the option of setting the generation settings (like `temperature`, `pad_token_id`) at the time of instance creation, as opposed to when calling the `generate` method.
56 | This is done by passing a `GenerationConfig` from the `transformers` library at the time of initialization: 57 | 58 | ```python 59 | 60 | from transformers import GenerationConfig 61 | 62 | generation_config = GenerationConfig(min_length=-1, top_k=0, top_p=1.0, do_sample=True, pad_token_id=tokenizer.eos_token_id) 63 | 64 | best_of_n = BestOfNSampler(model, tokenizer, queries_to_scores, length_sampler=output_length_sampler, generation_config=generation_config) 65 | 66 | best_of_n.generate(query_tensors, device=device) 67 | 68 | ``` 69 | 70 | Furthermore, at the time of initialization you can set the seed to control the repeatability of the generation process, as well as the number of samples to generate for each query. 71 | 72 | 73 | -------------------------------------------------------------------------------- /training/trl/.github/workflows/slow-tests.yml: -------------------------------------------------------------------------------- 1 | name: Slow tests (on push) 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | paths: 7 | # Run only when python files are modified 8 | - "trl/**.py" 9 | - "examples/**.py" 10 | env: 11 | RUN_SLOW: "yes" 12 | IS_GITHUB_CI: "1" 13 | SLACK_API_TOKEN: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} 14 | 15 | 16 | jobs: 17 | run_all_tests_single_gpu: 18 | strategy: 19 | fail-fast: false 20 | matrix: 21 | docker-image-name: ["huggingface/trl-latest-gpu:latest", "huggingface/trl-source-gpu:latest"] 22 | runs-on: 23 | group: aws-g4dn-2xlarge 24 | env: 25 | CUDA_VISIBLE_DEVICES: "0" 26 | TEST_TYPE: "single_gpu_${{ matrix.docker-image-name }}" 27 | container: 28 | image: ${{ matrix.docker-image-name }} 29 | options: --gpus all --shm-size "16gb" -e NVIDIA_DISABLE_REQUIRE=true 30 | defaults: 31 | run: 32 | shell: bash 33 | steps: 34 | - uses: actions/checkout@v4 35 | - name: Pip install 36 | run: | 37 | source activate trl 38 | pip install -e ".[test]" --no-deps 39 | pip install pytest-reportlog parameterized 40 | 41 | - name: Run slow SFT tests on single GPU 42 | if: always() 43 | run: | 44 | source activate trl 45 | make slow_tests 46 | 47 | - name: Generate Report 48 | if: always() 49 | run: | 50 | pip install slack_sdk tabulate 51 | python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY 52 | 53 | 54 | run_all_tests_multi_gpu: 55 | strategy: 56 | fail-fast: false 57 | matrix: 58 | docker-image-name: ["huggingface/trl-latest-gpu:latest", "huggingface/trl-source-gpu:latest"] 59 | runs-on: 60 | group: aws-g4dn-2xlarge 61 | env: 62 | CUDA_VISIBLE_DEVICES: "0,1" 63 | TEST_TYPE: "multi_gpu_${{ matrix.docker-image-name }}" 64 | container: 65 | image: ${{ matrix.docker-image-name }} 66 | options: --gpus all --shm-size "16gb" -e NVIDIA_DISABLE_REQUIRE=true 67 | defaults: 68 | run: 69 | shell: bash 70 | steps: 71 | - uses: actions/checkout@v4 72 | - name: Pip install 73 | run: | 74 | source activate trl 75 | pip install -e ".[test]" --no-deps 76 | pip install pytest-reportlog parameterized 77 | 78 | - name: Run slow SFT tests on Multi GPU 79 | if: always() 80 | run: | 81 | source activate trl 82 | make slow_tests 83 | 84 | - name: Run end-to-end examples tests on multi GPU 85 | if: always() 86 | run: | 87 | source activate trl 88 | pip install deepspeed 89 | make test_examples 90 | 91 | - name: Generate Reports 92 | if: always() 93 | run: | 94 | pip install slack_sdk tabulate 95 | python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY 96 | python scripts/log_example_reports.py --text_file_name temp_results_sft_tests.txt >> $GITHUB_STEP_SUMMARY 97 | python
scripts/log_example_reports.py --text_file_name temp_results_dpo_tests.txt >> $GITHUB_STEP_SUMMARY 98 | rm *.txt 99 | -------------------------------------------------------------------------------- /training/trl/trl/extras/profiling.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import contextlib 16 | import functools 17 | import time 18 | from typing import Generator 19 | 20 | from transformers import Trainer, is_wandb_available 21 | 22 | 23 | if is_wandb_available(): 24 | import wandb 25 | 26 | 27 | @contextlib.contextmanager 28 | def profiling_context(trainer: Trainer, name: str) -> Generator[None, None, None]: 29 | """ 30 | A context manager function for profiling a block of code. Results are logged to Weights & Biases if enabled. 31 | 32 | Args: 33 | trainer (`~transformers.Trainer`): 34 | Trainer object. 35 | name (`str`): 36 | Name of the block to be profiled. Used as a key in the logged dictionary. 37 | 38 | Example: 39 | ```python 40 | from transformers import Trainer 41 | from trl.extras.profiling import profiling_context 42 | 43 | class MyTrainer(Trainer): 44 | def some_method(self): 45 | A = np.random.rand(1000, 1000) 46 | B = np.random.rand(1000, 1000) 47 | with profiling_context(self, "matrix_multiplication"): 48 | # Code to profile: simulate a computationally expensive operation 49 | result = A @ B # Matrix multiplication 50 | ``` 51 | """ 52 | start_time = time.perf_counter() 53 | yield 54 | end_time = time.perf_counter() 55 | duration = end_time - start_time 56 | 57 | if "wandb" in trainer.args.report_to and wandb.run is not None and trainer.accelerator.is_main_process: 58 | wandb.log({f"profiling/Time taken: {trainer.__class__.__name__}.{name}": duration}) 59 | 60 | 61 | def profiling_decorator(func: callable) -> callable: 62 | """ 63 | Decorator to profile a function and log execution time using [`extras.profiling.profiling_context`]. 64 | 65 | Args: 66 | func (`callable`): 67 | Function to be profiled. 
68 | 69 | Example: 70 | ```python 71 | from transformers import Trainer 72 | from trl.extras.profiling import profiling_decorator 73 | 74 | class MyTrainer(Trainer): 75 | @profiling_decorator 76 | def some_method(self): 77 | A = np.random.rand(1000, 1000) 78 | B = np.random.rand(1000, 1000) 79 | # Code to profile: simulate a computationally expensive operation 80 | result = A @ B 81 | ``` 82 | """ 83 | 84 | @functools.wraps(func) 85 | def wrapper(self, *args, **kwargs): 86 | with profiling_context(self, func.__name__): 87 | return func(self, *args, **kwargs) 88 | 89 | return wrapper 90 | -------------------------------------------------------------------------------- /training/open-r1/recipes/DeepSeek-R1-Distill-Qwen-1.5B/sft/config_demo.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | chat_template: "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>'}}{% endif %}" 9 | dataset_name: akhauriyash/OpenR1_Math_SpeculativeReasoning 10 | dataset_num_proc: 48 11 | 12 | # SFT trainer config 13 | bf16: true 14 | do_eval: false 15 | eval_strategy: 'no' 16 | gradient_accumulation_steps: 1 17 | gradient_checkpointing: true 18 | gradient_checkpointing_kwargs: 19 | use_reentrant: false 20 | hub_model_id: DeepSeek-R1-Distill-Qwen-1.5B-SpeculativeReasoner 21 | hub_strategy: every_save 22 | learning_rate: 5.0e-05 23 | log_level: info 24 | logging_steps: 1 
25 | logging_strategy: steps 26 | lr_scheduler_type: cosine_with_min_lr 27 | lr_scheduler_kwargs: 28 | min_lr_rate: 0.1 29 | packing: true 30 | max_length: 16384 31 | max_steps: -1 32 | num_train_epochs: 3 33 | output_dir: data/DeepSeek-R1-Distill-Qwen-1.5B-SpeculativeReasoner 34 | overwrite_output_dir: true 35 | per_device_eval_batch_size: 8 36 | per_device_train_batch_size: 8 37 | push_to_hub: true 38 | report_to: 39 | - wandb 40 | save_strategy: "steps" 41 | save_steps: 50 42 | save_total_limit: 1 43 | seed: 42 44 | use_liger: true 45 | use_liger_kernel: true 46 | warmup_ratio: 0.05 47 | -------------------------------------------------------------------------------- /training/trl/.github/workflows/docker-build.yml: -------------------------------------------------------------------------------- 1 | name: Build Docker images (scheduled) 2 | 3 | on: 4 | workflow_dispatch: 5 | workflow_call: 6 | schedule: 7 | - cron: "0 1 * * *" 8 | 9 | concurrency: 10 | group: docker-image-builds 11 | cancel-in-progress: false 12 | 13 | env: 14 | CI_SLACK_CHANNEL: ${{ secrets.CI_DOCKER_CHANNEL }} 15 | 16 | jobs: 17 | trl-latest: 18 | name: "Latest TRL GPU" 19 | runs-on: ubuntu-latest 20 | steps: 21 | - name: Cleanup disk 22 | run: | 23 | sudo ls -l /usr/local/lib/ 24 | sudo ls -l /usr/share/ 25 | sudo du -sh /usr/local/lib/ 26 | sudo du -sh /usr/share/ 27 | sudo rm -rf /usr/local/lib/android 28 | sudo rm -rf /usr/share/dotnet 29 | sudo du -sh /usr/local/lib/ 30 | sudo du -sh /usr/share/ 31 | - name: Set up Docker Buildx 32 | uses: docker/setup-buildx-action@v1 33 | - name: Check out code 34 | uses: actions/checkout@v4 35 | - name: Login to DockerHub 36 | uses: docker/login-action@v1 37 | with: 38 | username: ${{ secrets.DOCKERHUB_USERNAME }} 39 | password: ${{ secrets.DOCKERHUB_PASSWORD }} 40 | 41 | - name: Build and Push GPU 42 | uses: docker/build-push-action@v4 43 | with: 44 | context: ./docker/trl-latest-gpu 45 | push: true 46 | tags: huggingface/trl-latest-gpu 47 | 48 | - name: Post to Slack 49 | if: always() 50 | uses: huggingface/hf-workflows/.github/actions/post-slack@main 51 | with: 52 | slack_channel: ${{ env.CI_SLACK_CHANNEL }} 53 | title: 🤗 Results of the trl-latest-gpu Docker Image build 54 | status: ${{ job.status }} 55 | slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} 56 | 57 | trl-source: 58 | name: "Latest TRL + HF ecosystem from source" 59 | runs-on: ubuntu-latest 60 | steps: 61 | - name: Cleanup disk 62 | run: | 63 | sudo ls -l /usr/local/lib/ 64 | sudo ls -l /usr/share/ 65 | sudo du -sh /usr/local/lib/ 66 | sudo du -sh /usr/share/ 67 | sudo rm -rf /usr/local/lib/android 68 | sudo rm -rf /usr/share/dotnet 69 | sudo du -sh /usr/local/lib/ 70 | sudo du -sh /usr/share/ 71 | - name: Set up Docker Buildx 72 | uses: docker/setup-buildx-action@v1 73 | - name: Check out code 74 | uses: actions/checkout@v4 75 | - name: Login to DockerHub 76 | uses: docker/login-action@v1 77 | with: 78 | username: ${{ secrets.DOCKERHUB_USERNAME }} 79 | password: ${{ secrets.DOCKERHUB_PASSWORD }} 80 | 81 | - name: Build and Push GPU 82 | uses: docker/build-push-action@v4 83 | with: 84 | context: ./docker/trl-source-gpu 85 | push: true 86 | tags: huggingface/trl-source-gpu 87 | 88 | - name: Post to Slack 89 | if: always() 90 | uses: huggingface/hf-workflows/.github/actions/post-slack@main 91 | with: 92 | slack_channel: ${{ env.CI_SLACK_CHANNEL }} 93 | title: 🤗 Results of the trl-source-gpu Docker Image build 94 | status: ${{ job.status }} 95 | slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} 96 | 
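Since this workflow also declares `workflow_dispatch` and `workflow_call` triggers, the image builds can be started manually as well, for example with an authenticated GitHub CLI (invocation shown for illustration):

```bash
# Trigger the scheduled Docker image build on demand
gh workflow run "Build Docker images (scheduled)"
```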
-------------------------------------------------------------------------------- /training/trl/docs/source/_toctree.yml: -------------------------------------------------------------------------------- 1 | - sections: 2 | - local: index 3 | title: TRL 4 | - local: installation 5 | title: Installation 6 | - local: quickstart 7 | title: Quickstart 8 | title: Getting started 9 | - sections: 10 | - local: dataset_formats 11 | title: Dataset Formats 12 | - local: how_to_train 13 | title: Training FAQ 14 | - local: logging 15 | title: Understanding Logs 16 | title: Conceptual Guides 17 | - sections: 18 | - local: clis 19 | title: Command Line Interface (CLI) 20 | - local: customization 21 | title: Customizing the Training 22 | - local: reducing_memory_usage 23 | title: Reducing Memory Usage 24 | - local: speeding_up_training 25 | title: Speeding Up Training 26 | - local: distributing_training 27 | title: Distributing Training 28 | - local: use_model 29 | title: Using Trained Models 30 | title: How-to guides 31 | - sections: 32 | - local: deepspeed_integration 33 | title: DeepSpeed 34 | - local: liger_kernel_integration 35 | title: Liger Kernel 36 | - local: peft_integration 37 | title: PEFT 38 | - local: unsloth_integration 39 | title: Unsloth 40 | title: Integrations 41 | - sections: 42 | - local: example_overview 43 | title: Example Overview 44 | - local: community_tutorials 45 | title: Community Tutorials 46 | - local: sentiment_tuning 47 | title: Sentiment Tuning 48 | - local: using_llama_models 49 | title: Training StackLlama 50 | - local: detoxifying_a_lm 51 | title: Detoxifying a Language Model 52 | - local: learning_tools 53 | title: Learning to Use Tools 54 | - local: multi_adapter_rl 55 | title: Multi Adapter RLHF 56 | title: Examples 57 | - sections: 58 | - sections: # Sorted alphabetically 59 | - local: alignprop_trainer 60 | title: AlignProp 61 | - local: bco_trainer 62 | title: BCO 63 | - local: cpo_trainer 64 | title: CPO 65 | - local: ddpo_trainer 66 | title: DDPO 67 | - local: dpo_trainer 68 | title: DPO 69 | - local: online_dpo_trainer 70 | title: Online DPO 71 | - local: gkd_trainer 72 | title: GKD 73 | - local: grpo_trainer 74 | title: GRPO 75 | - local: kto_trainer 76 | title: KTO 77 | - local: nash_md_trainer 78 | title: Nash-MD 79 | - local: orpo_trainer 80 | title: ORPO 81 | - local: ppo_trainer 82 | title: PPO 83 | - local: prm_trainer 84 | title: PRM 85 | - local: reward_trainer 86 | title: Reward 87 | - local: rloo_trainer 88 | title: RLOO 89 | - local: sft_trainer 90 | title: SFT 91 | - local: iterative_sft_trainer 92 | title: Iterative SFT 93 | - local: xpo_trainer 94 | title: XPO 95 | title: Trainers 96 | - local: models 97 | title: Model Classes 98 | - local: best_of_n 99 | title: Best of N Sampling 100 | - local: judges 101 | title: Judges 102 | - local: callbacks 103 | title: Callbacks 104 | - local: data_utils 105 | title: Data Utilities 106 | - local: text_environments 107 | title: Text Environments 108 | - local: script_utils 109 | title: Script Utilities 110 | - local: others 111 | title: Others 112 | title: API 113 | -------------------------------------------------------------------------------- /training/open-r1/slurm/train.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=open-r1-sft 3 | #SBATCH --ntasks-per-node=1 4 | #SBATCH --exclusive 5 | #SBATCH --gres=gpu:8 6 | #SBATCH --partition=hopper-prod # Adjust this for your cluster 7 | #SBATCH --output=./logs/%x-%j.out 8 | #SBATCH 
--err=./logs/%x-%j.err 9 | #SBATCH --requeue 10 | 11 | # Specific configuration optimized for the Hugging Face Compute Cluster 12 | module load cuda/12.4 13 | set -x -e 14 | 15 | source ~/.bashrc 16 | source openr1/bin/activate 17 | echo "START TIME: $(date)" 18 | 19 | MODEL=$1 20 | TASK=$2 21 | CONFIG_SUFFIX=$3 22 | ACCELERATOR=$4 23 | OPTIONAL_ARGS=$5 24 | CONFIG_FILE=recipes/$MODEL/$TASK/config_$CONFIG_SUFFIX.yaml 25 | GRAD_ACC_STEPS=$(grep 'gradient_accumulation_steps' $CONFIG_FILE | awk '{print $2}') 26 | MODEL=$(grep 'model_name_or_path:' $CONFIG_FILE | awk '{print $2}') 27 | REVISION=$(grep 'model_revision:' $CONFIG_FILE | head -n 1 | awk '{print $2}') 28 | 29 | # Distributed configuration 30 | NUM_NODES=$SLURM_NNODES 31 | GPUS_PER_NODE=8 32 | WORLD_SIZE=$(($NUM_NODES*$GPUS_PER_NODE)) 33 | NODELIST=($(scontrol show hostnames $SLURM_JOB_NODELIST)) 34 | MASTER_ADDR=${NODELIST[0]} # First node for main process 35 | MASTER_PORT=6000 36 | TRAIN_NODES=("${NODELIST[@]}") 37 | 38 | USE_VLLM="false" 39 | if [[ -f "$CONFIG_FILE" ]] && grep -qE '^\s*use_vllm:\s*true' "$CONFIG_FILE"; then 40 | USE_VLLM="true" 41 | fi 42 | # if using vllm 43 | if [[ "$USE_VLLM" == "true" ]]; then 44 | TRAIN_NODES=("${NODELIST[@]:0:$((NUM_NODES - 1))}") 45 | VLLM_NODE=${NODELIST[-1]} # Last node 46 | TP=$(python scripts/get_tensor_parallel_size.py --model_name $MODEL --revision $REVISION --default_tp $GPUS_PER_NODE) 47 | WORLD_SIZE=$((WORLD_SIZE - GPUS_PER_NODE)) 48 | NUM_NODES=$((NUM_NODES - 1)) 49 | srun --nodes=1 --ntasks=1 --nodelist=$VLLM_NODE trl vllm-serve --model $MODEL --revision $REVISION --tensor_parallel_size $TP & 50 | 51 | OPTIONAL_ARGS="$OPTIONAL_ARGS --vllm_server_host=$VLLM_NODE" 52 | fi 53 | 54 | # force crashing on nccl issues like hanging broadcast 55 | export NCCL_ASYNC_ERROR_HANDLING=1 56 | # export NCCL_DEBUG=INFO 57 | # export NCCL_DEBUG_SUBSYS=COLL 58 | # export NCCL_SOCKET_NTHREADS=1 59 | # export NCCL_NSOCKS_PERTHREAD=1 60 | # export CUDA_LAUNCH_BLOCKING=1 61 | 62 | export CMD=" \ 63 | src/open_r1/$TASK.py --config $CONFIG_FILE $OPTIONAL_ARGS 64 | " 65 | 66 | export LAUNCHER="HF_HUB_ENABLE_HF_TRANSFER=1 ACCELERATE_LOG_LEVEL=info TRANSFORMERS_VERBOSITY=info accelerate launch \ 67 | --config_file recipes/accelerate_configs/$ACCELERATOR.yaml \ 68 | --gradient_accumulation_steps $GRAD_ACC_STEPS \ 69 | --num_machines $NUM_NODES \ 70 | --num_processes $WORLD_SIZE \ 71 | --main_process_ip $MASTER_ADDR \ 72 | --main_process_port $MASTER_PORT \ 73 | --machine_rank $SLURM_PROCID \ 74 | --rdzv_backend=c10d \ 75 | --max_restarts 1 \ 76 | --role \$(hostname -s): \ 77 | --tee 3 \ 78 | " 79 | # srun error handling: 80 | # --wait=60: wait 60 sec after the first task terminates before terminating all remaining tasks 81 | # --kill-on-bad-exit=1: terminate a step if any task exits with a non-zero exit code 82 | SRUN_ARGS=" \ 83 | --wait=60 \ 84 | --kill-on-bad-exit=1 \ 85 | --nodes=$NUM_NODES \ 86 | --ntasks=$NUM_NODES \ 87 | --nodelist=$TRAIN_NODES 88 | " 89 | clear; srun $SRUN_ARGS --jobid $SLURM_JOB_ID bash -c "$LAUNCHER --role \$SLURMD_NODENAME: $CMD" 2>&1 90 | 91 | echo "END TIME: $(date)" --------------------------------------------------------------------------------
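For reference, `train.slurm` resolves its positional arguments to a recipe at `recipes/$MODEL/$TASK/config_$CONFIG_SUFFIX.yaml`, so a typical submission looks like the following (node count and recipe name are illustrative):

```bash
# MODEL TASK CONFIG_SUFFIX ACCELERATOR [OPTIONAL_ARGS]
sbatch --nodes=1 slurm/train.slurm Qwen2.5-1.5B-Instruct sft demo zero3 ""
```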