├── R1-V ├── .gitignore ├── README.md ├── images │ ├── counting_star.png │ ├── curves.png │ ├── grpo_trainer_log.jpg │ ├── ood.png │ ├── super_ood.png │ ├── train-test.png │ ├── training.png │ └── vllm_grpo_trainer_modified_log.jpg ├── requirements.txt ├── setup.sh ├── src │ ├── distill_r1 │ │ ├── README.md │ │ ├── create_hf_dataset.py │ │ ├── filter_r1.py │ │ ├── generate_scene_qa_pairs.ipynb │ │ ├── grpo_r1_distilled.jpg │ │ ├── prompt.py │ │ └── query_r1.py │ ├── eval │ │ ├── logs │ │ │ ├── counting_results_superclevr_200_qwen2vl_2b_instruct_grpo100_legacy.json │ │ │ ├── counting_results_superclevr_200_qwen2vl_2b_instruct_legacy.json │ │ │ ├── geoqa_test_qwen2vl_7b_grpo_2epochs_legacy.json │ │ │ └── geoqa_test_qwen2vl_7b_instruct_legacy.json │ │ ├── prompts │ │ │ ├── geoqa_test_prompts.jsonl │ │ │ └── superclevr_test200_counting_problems.jsonl │ │ ├── test_qwen2vl_counting_superclevr.py │ │ ├── test_qwen2vl_geoqa.py │ │ └── test_qwen2vl_geoqa_multigpu.py │ ├── r1-v │ │ ├── .gitignore │ │ ├── LICENSE │ │ ├── Makefile │ │ ├── configs │ │ │ ├── ddp.yaml │ │ │ ├── qwen2vl_sft_config.yaml │ │ │ ├── zero2.yaml │ │ │ └── zero3.yaml │ │ ├── local_scripts │ │ │ ├── create_vision_cot_data.py │ │ │ ├── lmms_eval_qwen2vl.sh │ │ │ ├── prepare_hf_data.py │ │ │ ├── train_aria_moe.sh │ │ │ ├── train_qwen2_vl.sh │ │ │ ├── zero1_no_optimizer.json │ │ │ ├── zero2.json │ │ │ ├── zero3.json │ │ │ ├── zero3.yaml │ │ │ └── zero3_offload.json │ │ ├── run_grpo.sh │ │ ├── setup.cfg │ │ ├── setup.py │ │ ├── src │ │ │ └── open_r1 │ │ │ │ ├── __init__.py │ │ │ │ ├── evaluate.py │ │ │ │ ├── generate.py │ │ │ │ ├── grpo.py │ │ │ │ ├── sft.py │ │ │ │ └── trainer │ │ │ │ ├── __init__.py │ │ │ │ ├── grpo_trainer.py │ │ │ │ ├── vllm_grpo_trainer.py │ │ │ │ └── vllm_grpo_trainer_modified.py │ │ └── temp_image.png │ ├── requirements.txt │ └── scripts │ │ └── run_grpo_GEOQA_qwen2.5_3b.sh └── test.sh ├── README.md ├── Visual-RFT ├── .gitignore ├── LICENSE ├── README.md ├── assets │ ├── case_cls.png │ ├── case_lisa.png │ ├── framework.png │ ├── pokeymon.jpg │ ├── radar.png │ └── teaser.png ├── classification │ ├── Qwen2_VL_classification_infere.py │ └── val_data │ │ ├── fgvc_aircraft.pth │ │ ├── fgvc_aircraft.txt │ │ ├── oxford_flowers.pth │ │ ├── oxford_flowers.txt │ │ ├── pets.pth │ │ ├── pets.txt │ │ ├── stanford_cars.pth │ │ └── stanford_cars.txt ├── coco_evaluation │ ├── Qwen2_VL_coco_infere.py │ ├── coco_evaluation.py │ ├── evaluation.ipynb │ ├── exist_map_coco_Qwen2_vl_2B_baseline.json │ └── exist_map_coco_Qwen2_vl_7B_baseline.json ├── dataset │ ├── README.md │ └── build_dataset.ipynb ├── demo │ ├── README.md │ └── lisa_demo.ipynb ├── lisa_evaluation │ ├── Qwen2_VL_lisa_infere.py │ ├── Qwen2_VL_lisa_infere.sh │ ├── README.md │ ├── box2mask.py │ ├── evaluation.ipynb │ ├── gen_box_ann.py │ ├── gen_sft.py │ ├── mask_iou.py │ └── merge_eval.py ├── lvis_evaluation │ ├── Qwen2_VL_lvis_infere.py │ ├── exist_map_lvis_Qwen2_vl_2B_baseline.json │ ├── exist_map_lvis_Qwen2_vl_7B_baseline.json │ └── lvis_evaluation.ipynb ├── q&a.md ├── requirements.txt ├── setup.sh ├── src │ ├── scripts │ │ ├── 2B_aircraft_4_shot.sh │ │ ├── 2B_car196_4_shot.sh │ │ ├── 2B_flower_4_shot.sh │ │ ├── 2B_lisa_grounding.sh │ │ └── 2B_pets37_4_shot.sh │ └── virft │ │ ├── .gitignore │ │ ├── LICENSE │ │ ├── Makefile │ │ ├── README.md │ │ ├── configs │ │ ├── ddp.yaml │ │ ├── zero2.yaml │ │ └── zero3.yaml │ │ ├── local_scripts │ │ ├── create_vision_cot_data.py │ │ ├── lmms_eval_qwen2vl.sh │ │ ├── prepare_hf_data.py │ │ ├── train_aria_moe.sh │ │ ├── 
train_qwen2_vl.sh │ │ ├── zero2.json │ │ ├── zero3.json │ │ ├── zero3.yaml │ │ └── zero3_offload.json │ │ ├── setup.cfg │ │ ├── setup.py │ │ ├── slurm │ │ ├── evaluate.slurm │ │ ├── generate.slurm │ │ └── sft.slurm │ │ └── src │ │ └── open_r1 │ │ ├── __init__.py │ │ ├── evaluate.py │ │ ├── generate.py │ │ ├── grpo.py │ │ ├── grpo_classification.py │ │ ├── grpo_lisa.py │ │ ├── sft.py │ │ └── trainer │ │ ├── __init__.py │ │ ├── grpo_trainer.py │ │ └── vllm_grpo_trainer.py └── test.sh ├── VisualThinker-R1-Zero ├── .gitignore ├── README.md ├── requirements.txt ├── setup.sh ├── src │ ├── data │ │ └── SAT │ │ │ ├── prepare_dataset.sh │ │ │ └── process_dataset.py │ ├── eval │ │ ├── evaluate_Qwen2_VL_CVBench-base.py │ │ └── evaluate_Qwen2_VL_CVBench.py │ └── open-r1-multimodal │ │ ├── LICENSE │ │ ├── Makefile │ │ ├── README.md │ │ ├── configs │ │ ├── ddp.yaml │ │ ├── zero2.yaml │ │ └── zero3.yaml │ │ ├── prepare_2B_base.sh │ │ ├── run_grpo.sh │ │ ├── run_grpo_SAT.sh │ │ ├── run_sft.sh │ │ ├── run_sft_SAT.sh │ │ ├── setup.cfg │ │ ├── setup.py │ │ ├── src │ │ └── open_r1 │ │ │ ├── __init__.py │ │ │ ├── evaluate.py │ │ │ ├── generate.py │ │ │ ├── grpo.py │ │ │ ├── sft.py │ │ │ └── trainer │ │ │ ├── InternVL2.py │ │ │ ├── __init__.py │ │ │ └── grpo_trainer.py │ │ └── test.py └── test.sh ├── docs └── images │ ├── GPG.png │ └── figure0.svg ├── open-r1 ├── .github │ ├── dependabot.yml │ └── workflows │ │ └── tests.yml ├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── assets │ └── plan-of-attack.png ├── recipes │ ├── DeepSeek-R1-Distill-Qwen-1.5B │ │ └── grpo │ │ │ ├── config_demo.yaml │ │ │ ├── config_demo_v1.yaml │ │ │ └── config_demo_v2.yaml │ ├── Mistral-Small-24B-Instruct-2501 │ │ └── sft │ │ │ └── config_openr1_math.yaml │ ├── OlympicCoder-32B │ │ └── sft │ │ │ └── config_v00.00.yaml │ ├── OlympicCoder-7B │ │ └── sft │ │ │ └── config_v00.00.yaml │ ├── OpenR1-Qwen-7B │ │ ├── gpg │ │ │ ├── config_v0.yaml │ │ │ ├── config_v0_ds.yaml │ │ │ ├── config_v1.yaml │ │ │ ├── config_v1_ds.yaml │ │ │ └── config_v2_ds.yaml │ │ └── sft │ │ │ ├── config.yaml │ │ │ ├── config_v0.yaml │ │ │ ├── config_v1.yaml │ │ │ └── config_v2.yaml │ ├── Qwen2.5-1.5B-Instruct │ │ ├── gpg │ │ │ ├── config_demo_v1.yaml │ │ │ ├── config_demo_v2.yaml │ │ │ ├── config_demo_v3.yaml │ │ │ ├── config_demo_v4.yaml │ │ │ ├── config_demo_v5.yaml │ │ │ ├── config_demo_woSTD.yaml │ │ │ ├── config_v0.yaml │ │ │ ├── config_v0_nostd.yaml │ │ │ └── config_v0_open22k.yaml │ │ ├── grpo │ │ │ ├── config_demo.yaml │ │ │ ├── config_demo_code.yaml │ │ │ ├── config_demo_code_ioi.yaml │ │ │ ├── config_demo_v1.yaml │ │ │ └── config_demo_v3.yaml │ │ └── sft │ │ │ ├── config_demo.yaml │ │ │ ├── config_demo_v1.yaml │ │ │ └── config_v1.yaml │ ├── Qwen2.5-7B-Instruct │ │ └── grpo │ │ │ ├── config_demo.yaml │ │ │ └── config_demo_v1.yaml │ ├── Qwen2.5-Math-7B │ │ └── grpo │ │ │ ├── config_simple_rl.yaml │ │ │ ├── config_simple_rl_dgrpo.yaml │ │ │ ├── config_simple_rl_gpg.yaml │ │ │ ├── config_simple_rl_gpg_3k.yaml │ │ │ ├── config_simple_rl_gpg_3k_2nodes.yaml │ │ │ ├── config_simple_rl_gpg_n16.yaml │ │ │ ├── config_simple_rl_gpg_n16_wostd.yaml │ │ │ ├── config_simple_rl_gpg_n2.yaml │ │ │ ├── config_simple_rl_gpg_n2_wostd.yaml │ │ │ ├── config_simple_rl_gpg_n4.yaml │ │ │ ├── config_simple_rl_gpg_n4_wostd.yaml │ │ │ ├── config_simple_rl_gpg_n8.yaml │ │ │ ├── config_simple_rl_gpg_scale_batch.yaml │ │ │ ├── config_simple_rl_math_l35.yaml │ │ │ ├── config_simple_rl_math_l35_v1.yaml │ │ │ ├── config_simple_rl_math_l35_v2.yaml │ │ │ ├── 
config_simple_rl_math_l35_v2_g16.yaml │ │ │ ├── config_simple_rl_v1.yaml │ │ │ ├── config_simple_rl_v1_kl.yaml │ │ │ ├── config_simple_rl_v1_nostd.yaml │ │ │ └── config_simple_rl_wokl.yaml │ ├── README.md │ ├── SmolLM2-1.7B-Instruct │ │ └── sft │ │ │ └── config.yaml │ ├── SmolLM2-1.7B │ │ └── sft │ │ │ └── config.yaml │ └── accelerate_configs │ │ ├── ddp.yaml │ │ ├── fsdp.yaml │ │ ├── zero1.yaml │ │ ├── zero2.yaml │ │ └── zero3.yaml ├── scripts │ ├── decontaminate.py │ ├── generate_reasoning.py │ ├── get_tensor_parallel_size.py │ ├── run_benchmarks.py │ └── upload_details.py ├── setup.cfg ├── setup.py ├── slurm │ ├── README.md │ ├── evaluate.slurm │ ├── experimental │ │ └── serve_r1_vllm.slurm │ ├── generate.slurm │ ├── piston │ │ ├── README.md │ │ ├── launch_piston_workers.sh │ │ └── launch_single_piston.sh │ ├── serve_r1.slurm │ ├── serve_router.slurm │ └── train.slurm ├── src │ └── open_r1 │ │ ├── __init__.py │ │ ├── configs.py │ │ ├── evaluate.py │ │ ├── evaluate_short.py │ │ ├── generate.py │ │ ├── gpg.py │ │ ├── gpg_trainer.py │ │ ├── grpo.py │ │ ├── rewards.py │ │ ├── sft.py │ │ ├── test_dataset.py │ │ └── utils │ │ ├── __init__.py │ │ ├── callbacks.py │ │ ├── data_utils.py │ │ ├── evaluation.py │ │ ├── hub.py │ │ ├── import_utils.py │ │ ├── ioi │ │ ├── __init__.py │ │ ├── piston_client.py │ │ ├── scoring.py │ │ └── utils.py │ │ ├── model_utils.py │ │ └── wandb_logging.py ├── tests │ ├── __init__.py │ ├── slow │ │ └── test_code_reward.py │ ├── test_rewards.py │ ├── transformer_ds_qwen_15B_R1.yaml │ └── transformer_ds_qwen_15B_R1_retrain.yaml └── train.sh └── open-rs ├── README.md ├── eval.sh ├── recipes ├── accelerate_configs │ ├── ddp.yaml │ ├── fsdp.yaml │ ├── zero2.yaml │ └── zero3.yaml ├── data_cleaner.yaml ├── gpg.yaml ├── gpg_7B.yaml ├── gpg_std.yaml ├── grpo.yaml ├── grpo_7B.yaml ├── grpo_ng.yaml └── grpo_wo_vllm.yaml ├── setup.cfg ├── setup.py ├── src └── open_r1 │ ├── __init__.py │ ├── configs.py │ ├── evaluate.py │ ├── generate.py │ ├── gpg.py │ ├── gpg_std.py │ ├── grpo.py │ ├── rewards.py │ ├── sft.py │ ├── trainer │ ├── __init__.py │ ├── gpg_std_trainer.py │ └── gpg_trainer.py │ └── utils │ ├── __init__.py │ ├── callbacks.py │ ├── evaluation.py │ ├── hub.py │ ├── import_utils.py │ ├── model_utils.py │ └── wandb_logging.py └── train.sh /R1-V/.gitignore: -------------------------------------------------------------------------------- 1 | output/ 2 | hostfiles/ 3 | internal_scripts/ 4 | data/ 5 | output_onlypg/ 6 | output_grpo/ 7 | Geo170K/ 8 | src/eval/Geo170K/ 9 | src/eval/images/ 10 | src/eval/images.zip 11 | -------------------------------------------------------------------------------- /R1-V/images/counting_star.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AMAP-ML/GPG/e096da8ce527a1636272469cee383ba71ec4a511/R1-V/images/counting_star.png -------------------------------------------------------------------------------- /R1-V/images/curves.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AMAP-ML/GPG/e096da8ce527a1636272469cee383ba71ec4a511/R1-V/images/curves.png -------------------------------------------------------------------------------- /R1-V/images/grpo_trainer_log.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AMAP-ML/GPG/e096da8ce527a1636272469cee383ba71ec4a511/R1-V/images/grpo_trainer_log.jpg 
-------------------------------------------------------------------------------- /R1-V/images/ood.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AMAP-ML/GPG/e096da8ce527a1636272469cee383ba71ec4a511/R1-V/images/ood.png -------------------------------------------------------------------------------- /R1-V/images/super_ood.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AMAP-ML/GPG/e096da8ce527a1636272469cee383ba71ec4a511/R1-V/images/super_ood.png -------------------------------------------------------------------------------- /R1-V/images/train-test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AMAP-ML/GPG/e096da8ce527a1636272469cee383ba71ec4a511/R1-V/images/train-test.png -------------------------------------------------------------------------------- /R1-V/images/training.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AMAP-ML/GPG/e096da8ce527a1636272469cee383ba71ec4a511/R1-V/images/training.png -------------------------------------------------------------------------------- /R1-V/images/vllm_grpo_trainer_modified_log.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AMAP-ML/GPG/e096da8ce527a1636272469cee383ba71ec4a511/R1-V/images/vllm_grpo_trainer_modified_log.jpg -------------------------------------------------------------------------------- /R1-V/requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate>=1.2.1 2 | bitsandbytes>=0.43.0 3 | black>=24.4.2 4 | datasets>=3.2.0 5 | deepspeed==0.15.4 6 | distilabel[vllm,ray,openai]>=1.5.2 7 | einops>=0.8.0 8 | flake8>=6.0.0 9 | hf_transfer>=0.1.4 10 | huggingface-hub[cli]>=0.19.2,<1.0 11 | isort>=5.12.0 12 | liger_kernel==0.5.2 13 | # lighteval @ git+https://githubfast.com/huggingface/lighteval.git@4f381b352c0e467b5870a97d41cb66b487a2c503#egg=lighteval[math] 14 | math-verify 15 | packaging>=23.0 16 | parameterized>=0.9.0 17 | pytest 18 | safetensors>=0.3.3 19 | sentencepiece>=0.1.99 20 | torch>=2.5.1 21 | transformers @ git+https://githubfast.com/huggingface/transformers.git@336dc69d63d56f232a183a3e7f52790429b871ef 22 | trl==0.14.0 23 | vllm==0.6.6.post1 24 | wandb>=0.19.1 25 | pillow -------------------------------------------------------------------------------- /R1-V/setup.sh: -------------------------------------------------------------------------------- 1 | # Install the packages in r1-v . 
2 | cd src/r1-v 3 | pip install -e ".[dev]" 4 | 5 | # Additional modules 6 | pip install wandb==0.18.3 7 | pip install tensorboardx 8 | pip install qwen_vl_utils torchvision 9 | pip install flash-attn --no-build-isolation 10 | 11 | # vLLM support 12 | pip install vllm==0.7.2 13 | 14 | # fix transformers version 15 | pip install git+https://github.com/huggingface/transformers.git@336dc69d63d56f232a183a3e7f52790429b871ef -------------------------------------------------------------------------------- /R1-V/src/distill_r1/grpo_r1_distilled.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AMAP-ML/GPG/e096da8ce527a1636272469cee383ba71ec4a511/R1-V/src/distill_r1/grpo_r1_distilled.jpg -------------------------------------------------------------------------------- /R1-V/src/distill_r1/prompt.py: -------------------------------------------------------------------------------- 1 | R1_SYS_PROMPT = """You are DeepSeek-R1, an AI assistant created exclusively by the Chinese Company DeepSeek. You'll provide helpful, harmless, and detailed responses to all user inquiries. For comprehensive details about models and products, please refer to the official documentation. 2 | 3 | Key Guidelines: 4 | Identity & Compliance 5 | 6 | Clearly state your identity as a DeepSeek AI assistant in initial responses. 7 | 8 | Comply with Chinese laws and regulations, including data privacy requirements. 9 | 10 | Capability Scope 11 | 12 | Handle both Chinese and English queries effectively 13 | 14 | Acknowledge limitations for real-time information post knowledge cutoff (2023-12) 15 | 16 | Provide technical explanations for AI-related questions when appropriate 17 | 18 | Response Quality 19 | 20 | Give comprehensive, logically structured answers 21 | 22 | Use markdown formatting for clear information organization 23 | 24 | Admit uncertainties for ambiguous queries 25 | 26 | Ethical Operation 27 | 28 | Strictly refuse requests involving illegal activities, violence, or explicit content 29 | 30 | Maintain political neutrality according to company guidelines 31 | 32 | Protect user privacy and avoid data collection 33 | 34 | Specialized Processing 35 | 36 | Use ... tags for internal reasoning before responding 37 | 38 | Employ XML-like tags for structured output when required 39 | """ -------------------------------------------------------------------------------- /R1-V/src/r1-v/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: style quality 2 | 3 | # make sure to test the local checkout in scripts and not the pre-installed one (don't use quotes!)
4 | export PYTHONPATH = src 5 | 6 | check_dirs := src 7 | 8 | style: 9 | black --line-length 119 --target-version py310 $(check_dirs) setup.py 10 | isort $(check_dirs) setup.py 11 | 12 | quality: 13 | black --check --line-length 119 --target-version py310 $(check_dirs) setup.py 14 | isort --check-only $(check_dirs) setup.py 15 | flake8 --max-line-length 119 $(check_dirs) setup.py 16 | 17 | 18 | # Evaluation 19 | 20 | evaluate: 21 | -------------------------------------------------------------------------------- /R1-V/src/r1-v/configs/ddp.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | distributed_type: MULTI_GPU 4 | downcast_bf16: 'no' 5 | gpu_ids: all 6 | machine_rank: 0 7 | main_training_function: main 8 | mixed_precision: bf16 9 | num_machines: 1 10 | num_processes: 8 11 | rdzv_backend: static 12 | same_network: true 13 | tpu_env: [] 14 | tpu_use_cluster: false 15 | tpu_use_sudo: false 16 | use_cpu: false 17 | -------------------------------------------------------------------------------- /R1-V/src/r1-v/configs/qwen2vl_sft_config.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: Qwen/Qwen2-VL-2B-Instruct 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | 6 | # Data training arguments 7 | dataset_name: MMInstruction/Clevr_CoGenT_TrainA_R1 8 | dataset_configs: 9 | - all 10 | preprocessing_num_workers: 8 11 | 12 | # SFT trainer config 13 | bf16: true 14 | do_eval: true 15 | eval_strategy: "no" 16 | gradient_accumulation_steps: 4 17 | gradient_checkpointing: true 18 | gradient_checkpointing_kwargs: 19 | use_reentrant: false 20 | hub_model_id: Qwen2-VL-2B-Instruct-SFT 21 | hub_strategy: every_save 22 | learning_rate: 2.0e-05 23 | log_level: info 24 | logging_steps: 5 25 | logging_strategy: steps 26 | lr_scheduler_type: cosine 27 | packing: true 28 | max_seq_length: 4096 29 | max_steps: -1 30 | num_train_epochs: 1 31 | output_dir: data/Qwen2-VL-2B-Instruct-SFT 32 | overwrite_output_dir: true 33 | per_device_eval_batch_size: 4 34 | per_device_train_batch_size: 4 35 | push_to_hub: true 36 | report_to: 37 | - wandb 38 | save_strategy: "no" 39 | seed: 42 40 | warmup_ratio: 0.1 -------------------------------------------------------------------------------- /R1-V/src/r1-v/configs/zero2.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | deepspeed_multinode_launcher: standard 5 | offload_optimizer_device: none 6 | offload_param_device: none 7 | zero3_init_flag: false 8 | zero_stage: 2 9 | distributed_type: DEEPSPEED 10 | downcast_bf16: 'no' 11 | machine_rank: 0 12 | main_training_function: main 13 | mixed_precision: bf16 14 | num_machines: 1 15 | num_processes: 8 16 | rdzv_backend: static 17 | same_network: true 18 | tpu_env: [] 19 | tpu_use_cluster: false 20 | tpu_use_sudo: false 21 | use_cpu: false -------------------------------------------------------------------------------- /R1-V/src/r1-v/configs/zero3.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | deepspeed_multinode_launcher: standard 5 | offload_optimizer_device: none 6 | offload_param_device: none 7 | zero3_init_flag: true 8 | zero3_save_16bit_model: true 9 | zero_stage: 3 10 | distributed_type: DEEPSPEED 11 | 
downcast_bf16: 'no' 12 | machine_rank: 0 13 | main_training_function: main 14 | mixed_precision: bf16 15 | num_machines: 1 16 | num_processes: 8 17 | rdzv_backend: static 18 | same_network: true 19 | tpu_env: [] 20 | tpu_use_cluster: false 21 | tpu_use_sudo: false 22 | use_cpu: false 23 | -------------------------------------------------------------------------------- /R1-V/src/r1-v/local_scripts/lmms_eval_qwen2vl.sh: -------------------------------------------------------------------------------- 1 | export HF_HOME="" 2 | export HF_TOKEN="" 3 | export HF_HUB_ENABLE_HF_TRANSFER="1" 4 | 5 | export API_TYPE="" 6 | export AZURE_ENDPOINT="" 7 | export AZURE_API_KEY="" 8 | export API_VERSION="" 9 | export MODEL_VERSION="" 10 | export NAVIT_ATTENTION_IMPLEMENTATION="eager" 11 | 12 | # Prompt for installation with 3-second timeout 13 | read -t 3 -p "Do you want to install dependencies? (YES/no, timeout in 3s): " install_deps || true 14 | if [ "$install_deps" = "YES" ]; then 15 | # Prepare the environment 16 | pip3 install --upgrade pip 17 | pip3 install -U setuptools 18 | 19 | cd 20 | if [ ! -d "maas_engine" ]; then 21 | git clone 22 | else 23 | echo "maas_engine directory already exists, skipping clone" 24 | fi 25 | cd maas_engine 26 | git pull 27 | git checkout 28 | pip3 install --no-cache-dir --no-build-isolation -e ".[standalone]" 29 | 30 | current_version=$(pip3 show transformers | grep Version | cut -d' ' -f2) 31 | if [ "$current_version" != "4.46.2" ]; then 32 | echo "Installing transformers 4.46.2 (current version: $current_version)" 33 | pip3 install transformers==4.46.2 34 | else 35 | echo "transformers 4.46.2 is already installed" 36 | fi 37 | 38 | cd 39 | rm -rf 40 | pip3 install -e . 41 | pip3 install -U pydantic 42 | pip3 install Levenshtein 43 | pip3 install nltk 44 | python3 -c "import nltk; nltk.download('wordnet', quiet=True); nltk.download('punkt', quiet=True)" 45 | fi 46 | 47 | TASKS=mmmu_val,mathvista_testmini,mmmu_pro 48 | MODEL_BASENAME=qwen2_vl 49 | 50 | model_checkpoint="" 51 | echo "MODEL_BASENAME: ${MODEL_BASENAME}" 52 | cd 53 | 54 | python3 -m accelerate.commands.launch --num_processes=8 --main_process_port=12345 lmms_eval \ 55 | --model qwen2_vl \ 56 | --model_args=pretrained=${model_checkpoint},max_pixels=2359296 \ 57 | --tasks ${TASKS} \ 58 | --batch_size 1 \ 59 | --log_samples \ 60 | --log_samples_suffix ${MODEL_BASENAME} \ 61 | --output_path ./logs -------------------------------------------------------------------------------- /R1-V/src/r1-v/local_scripts/train_qwen2_vl.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export NCCL_BLOCKING_WAIT=0 4 | export TOKENIZERS_PARALLELISM=false 5 | export OMP_NUM_THREADS=8 6 | export NCCL_IB_DISABLE=0 7 | export NCCL_IB_GID_INDEX=3 8 | export NCCL_SOCKET_IFNAME=eth0 9 | export NCCL_DEBUG=INFO 10 | 11 | GPUS="0,1,2,3,4,5,6,7" 12 | 13 | # Take the first port of worker0 14 | ports=($(echo $METIS_WORKER_0_PORT | tr ',' ' ')) 15 | port=${ports[0]} 16 | port_in_cmd="$(echo "${METIS_WORKER_0_PORT:-2000}" | awk -F',' '{print $1}')" 17 | 18 | echo "total workers: ${ARNOLD_WORKER_NUM}" 19 | echo "cur worker id: ${ARNOLD_ID}" 20 | echo "gpus per worker: ${ARNOLD_WORKER_GPU}" 21 | echo "master ip: ${METIS_WORKER_0_HOST}" 22 | echo "master port: ${port}" 23 | echo "master port in cmd: ${port_in_cmd}" 24 | 25 | # export WANDB_BASE_URL=https://api.wandb.ai 26 | # export WANDB_API_KEY="" 27 | # wandb login $WANDB_API_KEY 28 | 29 | export WANDB_BASE_URL=https://api.wandb.ai 30 | export
WANDB_PROJECT=vision-reasoning 31 | export WANDB_API_KEY="" 32 | export WANDB_RUN_NAME=Qwen-VL-2B-GRPO-$(date +%Y-%m-%d-%H-%M-%S) 33 | wandb login $WANDB_API_KEY 34 | 35 | cd /home/tiger/multimodal-open-r1 36 | # pip3 install vllm==0.6.6.post1 37 | pip3 install -e ".[dev]" 38 | pip3 install wandb==0.18.3 39 | 40 | torchrun --nproc_per_node="${ARNOLD_WORKER_GPU}" \ 41 | --nnodes="${ARNOLD_WORKER_NUM}" \ 42 | --node_rank="${ARNOLD_ID}" \ 43 | --master_addr="${METIS_WORKER_0_HOST}" \ 44 | --master_port="${port_in_cmd}" \ 45 | src/open_r1/grpo.py \ 46 | --deepspeed scripts/zero3.json \ 47 | --output_dir checkpoints/${WANDB_RUN_NAME} \ 48 | --model_name_or_path Qwen/Qwen2-VL-2B-Instruct \ 49 | --dataset_name luodian/${DATASET_NAME} \ 50 | --max_prompt_length 8192 \ 51 | --per_device_train_batch_size 1 \ 52 | --gradient_accumulation_steps 1 \ 53 | --logging_steps 1 \ 54 | --bf16 \ 55 | --report_to wandb \ 56 | --gradient_checkpointing true \ 57 | --attn_implementation flash_attention_2 \ 58 | --max_pixels 2359296 \ 59 | --save_total_limit 8 \ 60 | --num_train_epochs 1 \ 61 | --run_name $WANDB_RUN_NAME 62 | -------------------------------------------------------------------------------- /R1-V/src/r1-v/local_scripts/zero1_no_optimizer.json: -------------------------------------------------------------------------------- 1 | { 2 | "zero_optimization": { 3 | "stage": 1, 4 | "allgather_partitions": true, 5 | "allgather_bucket_size": 1e9, 6 | "overlap_comm": false, 7 | "reduce_scatter": true, 8 | "reduce_bucket_size": 1e9, 9 | "contiguous_gradients": true 10 | }, 11 | "fp16": { 12 | "enabled": "auto", 13 | "auto_cast": true, 14 | "loss_scale": 0, 15 | "initial_scale_power": 32, 16 | "loss_scale_window": 1000, 17 | "hysteresis": 2, 18 | "min_loss_scale": 1 19 | }, 20 | "bf16": { 21 | "enabled": "auto" 22 | }, 23 | "gradient_accumulation_steps": "auto", 24 | "gradient_clipping": "auto", 25 | "steps_per_print": 1, 26 | "train_batch_size": "auto", 27 | "train_micro_batch_size_per_gpu": "auto", 28 | "wall_clock_breakdown": true 29 | } -------------------------------------------------------------------------------- /R1-V/src/r1-v/local_scripts/zero2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "optimizer": { 14 | "type": "AdamW", 15 | "params": { 16 | "lr": "auto", 17 | "betas": "auto", 18 | "eps": "auto", 19 | "weight_decay": "auto" 20 | } 21 | }, 22 | "zero_optimization": { 23 | "stage": 2, 24 | "offload_optimizer": { 25 | "device": "none", 26 | "pin_memory": true 27 | }, 28 | "allgather_partitions": true, 29 | "allgather_bucket_size": 2e8, 30 | "overlap_comm": false, 31 | "reduce_scatter": true, 32 | "reduce_bucket_size": 2e8, 33 | "contiguous_gradients": true 34 | }, 35 | "gradient_accumulation_steps": "auto", 36 | "gradient_clipping": "auto", 37 | "steps_per_print": 100, 38 | "train_batch_size": "auto", 39 | "train_micro_batch_size_per_gpu": "auto", 40 | "wall_clock_breakdown": false 41 | } -------------------------------------------------------------------------------- /R1-V/src/r1-v/local_scripts/zero3.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 
| "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | 14 | "zero_optimization": { 15 | "stage": 3, 16 | "offload_optimizer": { 17 | "device": "none", 18 | "pin_memory": false 19 | }, 20 | "offload_param": { 21 | "device": "none", 22 | "pin_memory": false 23 | }, 24 | "overlap_comm": true, 25 | "contiguous_gradients": true, 26 | "sub_group_size": 1e9, 27 | "reduce_bucket_size": "auto", 28 | "stage3_prefetch_bucket_size": "auto", 29 | "stage3_param_persistence_threshold": "auto", 30 | "stage3_max_live_parameters": 1e9, 31 | "stage3_max_reuse_distance": 1e9, 32 | "stage3_gather_16bit_weights_on_model_save": true 33 | }, 34 | 35 | "gradient_accumulation_steps": "auto", 36 | "gradient_clipping": "auto", 37 | "steps_per_print": 100, 38 | "train_batch_size": "auto", 39 | "train_micro_batch_size_per_gpu": "auto", 40 | "wall_clock_breakdown": false 41 | } -------------------------------------------------------------------------------- /R1-V/src/r1-v/local_scripts/zero3.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | deepspeed_multinode_launcher: standard 5 | offload_optimizer_device: none 6 | offload_param_device: none 7 | zero3_init_flag: true 8 | zero3_save_16bit_model: true 9 | zero_stage: 3 10 | distributed_type: DEEPSPEED 11 | downcast_bf16: 'no' 12 | machine_rank: 0 13 | main_training_function: main 14 | mixed_precision: bf16 15 | num_machines: 1 16 | num_processes: 8 17 | rdzv_backend: static 18 | same_network: true 19 | tpu_env: [] 20 | tpu_use_cluster: false 21 | tpu_use_sudo: false 22 | use_cpu: false 23 | -------------------------------------------------------------------------------- /R1-V/src/r1-v/local_scripts/zero3_offload.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "optimizer": { 14 | "type": "AdamW", 15 | "params": { 16 | "lr": "auto", 17 | "betas": "auto", 18 | "eps": "auto", 19 | "weight_decay": "auto" 20 | } 21 | }, 22 | "zero_optimization": { 23 | "stage": 3, 24 | "offload_optimizer": { 25 | "device": "cpu", 26 | "pin_memory": true 27 | }, 28 | "offload_param": { 29 | "device": "cpu", 30 | "pin_memory": true 31 | }, 32 | "overlap_comm": true, 33 | "contiguous_gradients": true, 34 | "sub_group_size": 1e9, 35 | "reduce_bucket_size": "auto", 36 | "stage3_prefetch_bucket_size": "auto", 37 | "stage3_param_persistence_threshold": "auto", 38 | "stage3_max_live_parameters": 1e9, 39 | "stage3_max_reuse_distance": 1e9, 40 | "gather_16bit_weights_on_model_save": true 41 | }, 42 | "gradient_accumulation_steps": "auto", 43 | "gradient_clipping": "auto", 44 | "train_batch_size": "auto", 45 | "train_micro_batch_size_per_gpu": "auto", 46 | "steps_per_print": 1e5, 47 | "wall_clock_breakdown": false 48 | } -------------------------------------------------------------------------------- /R1-V/src/r1-v/run_grpo.sh: -------------------------------------------------------------------------------- 1 | cd src/r1-v 2 | 3 | export DEBUG_MODE="true" 4 | export LOG_PATH="./debug_log_2b.txt" 5 | 6 | 7 | 8 | torchrun --nproc_per_node="8" \ 9 | --nnodes="1" \ 10 | --node_rank="0" \ 11 | --master_addr="127.0.0.1" \ 12 | --master_port="12345" \ 13 | src/open_r1/grpo.py \ 14 | --output_dir \ 15 | 
--model_name_or_path \ 16 | --dataset_name \ 17 | --max_prompt_length 1024 \ 18 | --per_device_train_batch_size 1 \ 19 | --gradient_accumulation_steps 2 \ 20 | --logging_steps 1 \ 21 | --bf16 \ 22 | --report_to wandb \ 23 | --gradient_checkpointing false \ 24 | --attn_implementation flash_attention_2 \ 25 | --max_pixels 401408 \ 26 | --num_train_epochs 2 \ 27 | --run_name Qwen2-VL-2B-GRPO-CLEVR-70k \ 28 | --save_steps 100 \ 29 | --save_only_model true -------------------------------------------------------------------------------- /R1-V/src/r1-v/setup.cfg: -------------------------------------------------------------------------------- 1 | [isort] 2 | default_section = FIRSTPARTY 3 | ensure_newline_before_comments = True 4 | force_grid_wrap = 0 5 | include_trailing_comma = True 6 | known_first_party = open_r1 7 | known_third_party = 8 | transformers 9 | datasets 10 | fugashi 11 | git 12 | h5py 13 | matplotlib 14 | nltk 15 | numpy 16 | packaging 17 | pandas 18 | psutil 19 | pytest 20 | rouge_score 21 | sacrebleu 22 | seqeval 23 | sklearn 24 | streamlit 25 | torch 26 | tqdm 27 | 28 | line_length = 119 29 | lines_after_imports = 2 30 | multi_line_output = 3 31 | use_parentheses = True 32 | 33 | [flake8] 34 | ignore = E203, E501, E741, W503, W605 35 | max-line-length = 119 36 | per-file-ignores = 37 | # imported but unused 38 | __init__.py: F401 39 | 40 | [tool:pytest] 41 | doctest_optionflags=NUMBER NORMALIZE_WHITESPACE ELLIPSIS -------------------------------------------------------------------------------- /R1-V/src/r1-v/src/open_r1/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AMAP-ML/GPG/e096da8ce527a1636272469cee383ba71ec4a511/R1-V/src/r1-v/src/open_r1/__init__.py -------------------------------------------------------------------------------- /R1-V/src/r1-v/src/open_r1/trainer/__init__.py: -------------------------------------------------------------------------------- 1 | from .grpo_trainer import Qwen2VLGRPOTrainer 2 | from .vllm_grpo_trainer import Qwen2VLGRPOVLLMTrainer 3 | from .vllm_grpo_trainer_modified import Qwen2VLGRPOVLLMTrainerModified 4 | 5 | __all__ = [ 6 | "Qwen2VLGRPOTrainer", 7 | "Qwen2VLGRPOVLLMTrainer", 8 | "Qwen2VLGRPOVLLMTrainerModified" 9 | ] 10 | -------------------------------------------------------------------------------- /R1-V/src/r1-v/temp_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AMAP-ML/GPG/e096da8ce527a1636272469cee383ba71ec4a511/R1-V/src/r1-v/temp_image.png -------------------------------------------------------------------------------- /R1-V/src/scripts/run_grpo_GEOQA_qwen2.5_3b.sh: -------------------------------------------------------------------------------- 1 | timestamp=$1 2 | echo "timestamp: ${timestamp}" 3 | pg_name="gpg" 4 | adjust_gd="true" 5 | min_inverse_alpha="0.4" 6 | 7 | # Wandb 8 | export WANDB_PROJECT="R1-V" 9 | 10 | DATA_PATH=leonardPKU/GEOQA_R1V_Train_8K 11 | CKPT_PATH=Qwen2.5-VL-3B-Instruct 12 | 13 | RUN_NAME=${DATA_PATH##*/}_${CKPT_PATH##*/}_${timestamp} 14 | SAVE_PATH="./output/${pg_name}/${RUN_NAME}" 15 | 16 | export DEBUG_MODE="true" # Enable Debug if you want to see the rollout of model during RL 17 | export LOG_PATH="./${SAVE_PATH}/debug_log.txt" 18 | 19 | mkdir -p ${SAVE_PATH} 20 | 21 | torchrun --master_addr ${MASTER_ADDR} --master-port ${MASTER_PORT} \ 22 | --nnodes ${WORLD_SIZE} --node_rank ${RANK} --nproc-per-node=${GPUS} \ 23 | src/r1-v/src/open_r1/grpo.py \ 
24 | --output_dir ${SAVE_PATH} \ 25 | --model_name_or_path ${CKPT_PATH} \ 26 | --dataset_name ${DATA_PATH} \ 27 | --deepspeed src/r1-v/local_scripts/zero3.json \ 28 | --pg_name ${pg_name} \ 29 | --adjust_gd ${adjust_gd} \ 30 | --min_inverse_alpha ${min_inverse_alpha} \ 31 | --max_prompt_length 1024 \ 32 | --max_completion_length 256 \ 33 | --per_device_train_batch_size 1 \ 34 | --gradient_accumulation_steps 2 \ 35 | --logging_steps 1 \ 36 | --bf16 \ 37 | --report_to wandb \ 38 | --gradient_checkpointing false \ 39 | --attn_implementation flash_attention_2 \ 40 | --max_pixels 401408 \ 41 | --num_train_epochs 1 \ 42 | --run_name "${RUN_NAME}" \ 43 | --save_steps 100 \ 44 | --save_only_model true \ 45 | --num_generations 8 \ 46 | --learning_rate 1e-6 \ 47 | 2>&1 | tee -a "./${SAVE_PATH}/training_log.log" 48 | -------------------------------------------------------------------------------- /R1-V/test.sh: -------------------------------------------------------------------------------- 1 | export MASTER_ADDR=127.0.0.1 2 | export MASTER_PORT=21231 3 | export WORLD_SIZE=1 4 | export RANK=0 5 | export GPUS=2 6 | 7 | timestamp=$(date "+%Y%m%d%H%M%S") 8 | 9 | OMP_NUM_THREADS=4 bash ./scripts/run_grpo_clevr.sh ${timestamp} 10 | -------------------------------------------------------------------------------- /Visual-RFT/.gitignore: -------------------------------------------------------------------------------- 1 | data/ 2 | output/ 3 | output_onlypg/ 4 | output_grpo/ -------------------------------------------------------------------------------- /Visual-RFT/assets/case_cls.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AMAP-ML/GPG/e096da8ce527a1636272469cee383ba71ec4a511/Visual-RFT/assets/case_cls.png -------------------------------------------------------------------------------- /Visual-RFT/assets/case_lisa.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AMAP-ML/GPG/e096da8ce527a1636272469cee383ba71ec4a511/Visual-RFT/assets/case_lisa.png -------------------------------------------------------------------------------- /Visual-RFT/assets/framework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AMAP-ML/GPG/e096da8ce527a1636272469cee383ba71ec4a511/Visual-RFT/assets/framework.png -------------------------------------------------------------------------------- /Visual-RFT/assets/pokeymon.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AMAP-ML/GPG/e096da8ce527a1636272469cee383ba71ec4a511/Visual-RFT/assets/pokeymon.jpg -------------------------------------------------------------------------------- /Visual-RFT/assets/radar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AMAP-ML/GPG/e096da8ce527a1636272469cee383ba71ec4a511/Visual-RFT/assets/radar.png -------------------------------------------------------------------------------- /Visual-RFT/assets/teaser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AMAP-ML/GPG/e096da8ce527a1636272469cee383ba71ec4a511/Visual-RFT/assets/teaser.png -------------------------------------------------------------------------------- /Visual-RFT/classification/val_data/fgvc_aircraft.pth: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/AMAP-ML/GPG/e096da8ce527a1636272469cee383ba71ec4a511/Visual-RFT/classification/val_data/fgvc_aircraft.pth -------------------------------------------------------------------------------- /Visual-RFT/classification/val_data/fgvc_aircraft.txt: -------------------------------------------------------------------------------- 1 | 707-320 2 | 727-200 3 | 737-200 4 | 737-300 5 | 737-400 6 | 737-500 7 | 737-600 8 | 737-700 9 | 737-800 10 | 737-900 11 | 747-100 12 | 747-200 13 | 747-300 14 | 747-400 15 | 757-200 16 | 757-300 17 | 767-200 18 | 767-300 19 | 767-400 20 | 777-200 21 | 777-300 22 | A300B4 23 | A310 24 | A318 25 | A319 26 | A320 27 | A321 28 | A330-200 29 | A330-300 30 | A340-200 31 | A340-300 32 | A340-500 33 | A340-600 34 | A380 35 | ATR-42 36 | ATR-72 37 | An-12 38 | BAE 146-200 39 | BAE 146-300 40 | BAE-125 41 | Beechcraft 1900 42 | Boeing 717 43 | C-130 44 | C-47 45 | CRJ-200 46 | CRJ-700 47 | CRJ-900 48 | Cessna 172 49 | Cessna 208 50 | Cessna 525 51 | Cessna 560 52 | Challenger 600 53 | DC-10 54 | DC-3 55 | DC-6 56 | DC-8 57 | DC-9-30 58 | DH-82 59 | DHC-1 60 | DHC-6 61 | DHC-8-100 62 | DHC-8-300 63 | DR-400 64 | Dornier 328 65 | E-170 66 | E-190 67 | E-195 68 | EMB-120 69 | ERJ 135 70 | ERJ 145 71 | Embraer Legacy 600 72 | Eurofighter Typhoon 73 | F-16A/B 74 | F/A-18 75 | Falcon 2000 76 | Falcon 900 77 | Fokker 100 78 | Fokker 50 79 | Fokker 70 80 | Global Express 81 | Gulfstream IV 82 | Gulfstream V 83 | Hawk T1 84 | Il-76 85 | L-1011 86 | MD-11 87 | MD-80 88 | MD-87 89 | MD-90 90 | Metroliner 91 | Model B200 92 | PA-28 93 | SR-20 94 | Saab 2000 95 | Saab 340 96 | Spitfire 97 | Tornado 98 | Tu-134 99 | Tu-154 100 | Yak-42 -------------------------------------------------------------------------------- /Visual-RFT/classification/val_data/oxford_flowers.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AMAP-ML/GPG/e096da8ce527a1636272469cee383ba71ec4a511/Visual-RFT/classification/val_data/oxford_flowers.pth -------------------------------------------------------------------------------- /Visual-RFT/classification/val_data/oxford_flowers.txt: -------------------------------------------------------------------------------- 1 | pink primrose 2 | hard-leaved pocket orchid 3 | canterbury bells 4 | sweet pea 5 | english marigold 6 | tiger lily 7 | moon orchid 8 | bird of paradise 9 | monkshood 10 | globe thistle 11 | snapdragon 12 | colts foot 13 | king protea 14 | spear thistle 15 | yellow iris 16 | globe-flower 17 | purple coneflower 18 | peruvian lily 19 | balloon flower 20 | giant white arum lily 21 | fire lily 22 | pincushion flower 23 | fritillary 24 | red ginger 25 | grape hyacinth 26 | corn poppy 27 | prince of wales feathers 28 | stemless gentian 29 | artichoke 30 | sweet william 31 | carnation 32 | garden phlox 33 | love in the mist 34 | mexican aster 35 | alpine sea holly 36 | ruby-lipped cattleya 37 | cape flower 38 | great masterwort 39 | siam tulip 40 | lenten rose 41 | barbeton daisy 42 | daffodil 43 | sword lily 44 | poinsettia 45 | bolero deep blue 46 | wallflower 47 | marigold 48 | buttercup 49 | oxeye daisy 50 | common dandelion 51 | petunia 52 | wild pansy 53 | primula 54 | sunflower 55 | pelargonium 56 | bishop of llandaff 57 | gaura 58 | geranium 59 | orange dahlia 60 | pink-yellow dahlia 61 | cautleya spicata 62 | japanese anemone 63 | black-eyed susan 64 | silverbush 65 | 
californian poppy 66 | osteospermum 67 | spring crocus 68 | bearded iris 69 | windflower 70 | tree poppy 71 | gazania 72 | azalea 73 | water lily 74 | rose 75 | thorn apple 76 | morning glory 77 | passion flower 78 | lotus 79 | toad lily 80 | anthurium 81 | frangipani 82 | clematis 83 | hibiscus 84 | columbine 85 | desert-rose 86 | tree mallow 87 | magnolia 88 | cyclamen 89 | watercress 90 | canna lily 91 | hippeastrum 92 | bee balm 93 | ball moss 94 | foxglove 95 | bougainvillea 96 | camellia 97 | mallow 98 | mexican petunia 99 | bromelia 100 | blanket flower 101 | trumpet creeper 102 | blackberry lily -------------------------------------------------------------------------------- /Visual-RFT/classification/val_data/pets.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AMAP-ML/GPG/e096da8ce527a1636272469cee383ba71ec4a511/Visual-RFT/classification/val_data/pets.pth -------------------------------------------------------------------------------- /Visual-RFT/classification/val_data/pets.txt: -------------------------------------------------------------------------------- 1 | abyssinian 2 | american_bulldog 3 | american_pit_bull_terrier 4 | basset_hound 5 | beagle 6 | bengal 7 | birman 8 | bombay 9 | boxer 10 | british_shorthair 11 | chihuahua 12 | egyptian_mau 13 | english_cocker_spaniel 14 | english_setter 15 | german_shorthaired 16 | great_pyrenees 17 | havanese 18 | japanese_chin 19 | keeshond 20 | leonberger 21 | maine_coon 22 | miniature_pinscher 23 | newfoundland 24 | persian 25 | pomeranian 26 | pug 27 | ragdoll 28 | russian_blue 29 | saint_bernard 30 | samoyed 31 | scottish_terrier 32 | shiba_inu 33 | siamese 34 | sphynx 35 | staffordshire_bull_terrier 36 | wheaten_terrier 37 | yorkshire_terrier -------------------------------------------------------------------------------- /Visual-RFT/classification/val_data/stanford_cars.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AMAP-ML/GPG/e096da8ce527a1636272469cee383ba71ec4a511/Visual-RFT/classification/val_data/stanford_cars.pth -------------------------------------------------------------------------------- /Visual-RFT/dataset/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Visual-RFT/lisa_evaluation/Qwen2_VL_lisa_infere.sh: -------------------------------------------------------------------------------- 1 | TASKS=("test" "val") 2 | # Adjust to your gpu num 3 | # GPU_IDS=(0 1 2 3 4 5 6 7) 4 | # SPLIT_NUM=8 5 | GPU_IDS=(0 1) 6 | SPLIT_NUM=2 7 | 8 | for task in "${TASKS[@]}"; do 9 | echo "Starting inference for task: $task" 10 | 11 | # Iterate over GPUs and splits 12 | for i in "${!GPU_IDS[@]}"; do 13 | GPU_ID=${GPU_IDS[$i]} 14 | SPLIT=$i 15 | echo "Launching task=$task on GPU=$GPU_ID with SPLIT=$SPLIT" 16 | SPLIT=$SPLIT SPLIT_NUM=$SPLIT_NUM python Qwen2_VL_lisa_infere.py \ 17 | --task $task & 18 | sleep 1 19 | done 20 | wait 21 | echo "Merging results for task: $task" 22 | SPLIT_NUM=$SPLIT_NUM python merge_eval.py >> res.txt 23 | done 24 | 25 | echo "All tasks completed!" 26 | -------------------------------------------------------------------------------- /Visual-RFT/lisa_evaluation/README.md: -------------------------------------------------------------------------------- 1 | ## ViRFT for reasoning grounding 2 | 3 | ## Training 4 | 1.
Download the [LISA dataset](https://github.com/dvlab-research/LISA). 5 | 2. Use `gen_box_ann.py` to generate boxes from the masks. 6 | 3. Use `gen_sft.py` to generate the SFT/Visual-RFT training annotations. 7 | 4. Use `src/scripts/2B_lisa_grounding.sh` to train the model, with the annotation path changed to the annotations generated in step 3. 8 | 9 | After training the model, replace the model path in `Qwen2_VL_lisa_infere.py` with your own checkpoint. 10 | 11 | ```python 12 | # Load Qwen2-VL-2B model and processor 13 | model = Qwen2VLForConditionalGeneration.from_pretrained( 14 | "/path/to/your/checkpoint-498", torch_dtype=torch.bfloat16, device_map="auto", attn_implementation="flash_attention_2" 15 | ).eval() 16 | 17 | processor = AutoProcessor.from_pretrained("/path/to/your/checkpoint-498") 18 | ``` 19 | 20 | To compute gIoU, follow the process below. 21 | 1. Use `box2mask.py` to extract masks from [SAM](https://github.com/facebookresearch/segment-anything). 22 | 2. Use `mask_iou.py` to compute the mask IoU. 23 | 24 | ```shell 25 | cd lisa_evaluation 26 | bash Qwen2_VL_lisa_infere.sh 27 | ``` 28 | -------------------------------------------------------------------------------- /Visual-RFT/lisa_evaluation/gen_box_ann.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from PIL import Image, ImageDraw 4 | 5 | res = [] 6 | # base_path = "dataset/reason_seg/ReasonSeg/train" 7 | # base_path = "dataset/reason_seg/ReasonSeg/val" 8 | base_path = "dataset/reason_seg/ReasonSeg/test" 9 | 10 | for pth in os.listdir(base_path): 11 | if pth.endswith(".json"): 12 | json_path = os.path.join(base_path, pth) 13 | 14 | with open(json_path, 'r') as f: 15 | item = json.load(f) 16 | 17 | instruct = item["text"] 18 | shapes = item["shapes"] 19 | 20 | boxes = [] 21 | for shape in shapes[:1]: 22 | points = shape["points"] 23 | x_coords = [p[0] for p in points] 24 | y_coords = [p[1] for p in points] 25 | 26 | x_min, x_max = min(x_coords), max(x_coords) 27 | y_min, y_max = min(y_coords), max(y_coords) 28 | boxes.append((x_min, y_min, x_max, y_max)) 29 | 30 | img_path = json_path.replace(".json", ".jpg") 31 | if os.path.exists(img_path): 32 | res.append({ 33 | "image_path": img_path, 34 | "instruction": instruct, 35 | "boxes": boxes 36 | }) 37 | 38 | # json.dump(res, open("lisa_train.json", 'w'), indent=4) 39 | # json.dump(res, open("lisa_val.json", 'w'), indent=4) 40 | json.dump(res, open("lisa_test.json", 'w'), indent=4) 41 | -------------------------------------------------------------------------------- /Visual-RFT/lisa_evaluation/gen_sft.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from PIL import Image 4 | res = [] 5 | index = 0 6 | for i, item in enumerate(json.load(open("./lisa_train.json", 'r'))): 7 | for instruct in item['instruction']: 8 | w, h= Image.open(item['image_path']).size 9 | res.append({ 10 | "id": f"lisa_{index}", 11 | "conversations": [ 12 | { 13 | "from": "user", 14 | "value": f"{item['image_path']}\n Output the bounding box in the image corresponding to the instruction: {instruct}" 15 | }, 16 | { 17 | "from": "assistant", 18 | "value": f"({int(item['boxes'][0][0] / w * 1000)},{int(item['boxes'][0][1] / h * 1000)}),({int(item['boxes'][0][2] / w * 1000)},{int(item['boxes'][0][3] / h * 1000)})" 19 | } 20 | ] 21 | }) 22 | index += 1 23 | json.dump(res, open("lisa_train_sft.json", 'w'), indent=4) 24 | --------------------------------------------------------------------------------
/Visual-RFT/lisa_evaluation/merge_eval.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | merged = [] 4 | for i in range(int(os.environ['SPLIT_NUM'])): 5 | data = json.load(open(f"tmp/res_{i}.json", 'r')) 6 | merged += data 7 | print(f"mIoU: {sum(merged) / len(merged)}") 8 | -------------------------------------------------------------------------------- /Visual-RFT/q&a.md: -------------------------------------------------------------------------------- 1 | 2 | Issues encountered during training, and the solution: 3 | https://github.com/Liuziyu77/Visual-RFT/issues/17#issuecomment-2702690782 -------------------------------------------------------------------------------- /Visual-RFT/requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate>=1.2.1 2 | bitsandbytes>=0.43.0 3 | black>=24.4.2 4 | datasets>=3.2.0 5 | deepspeed==0.15.4 6 | distilabel[vllm,ray,openai]>=1.5.2 7 | einops>=0.8.0 8 | flake8>=6.0.0 9 | hf_transfer>=0.1.4 10 | huggingface-hub[cli]>=0.19.2,<1.0 11 | isort>=5.12.0 12 | liger_kernel==0.5.2 13 | # lighteval @ git+https://githubfast.com/huggingface/lighteval.git@4f381b352c0e467b5870a97d41cb66b487a2c503#egg=lighteval[math] 14 | math-verify 15 | packaging>=23.0 16 | parameterized>=0.9.0 17 | pytest 18 | safetensors>=0.3.3 19 | sentencepiece>=0.1.99 20 | torch>=2.5.1 21 | transformers @ git+https://github.com/huggingface/transformers.git@main 22 | trl @ git+https://github.com/huggingface/trl.git@main 23 | vllm==0.6.6.post1 -------------------------------------------------------------------------------- /Visual-RFT/setup.sh: -------------------------------------------------------------------------------- 1 | cd src/virft 2 | pip install -e ".[dev]" 3 | 4 | # Additional modules 5 | pip install wandb==0.18.3 6 | pip install tensorboardx 7 | pip install qwen_vl_utils torchvision 8 | # pip install flash-attn --no-build-isolation 9 | pip install flash-attn==2.6.3 --no-build-isolation 10 | 11 | # vLLM support 12 | pip install vllm==0.7.2 13 | 14 | # fix transformers version 15 | pip install git+https://github.com/huggingface/transformers.git@336dc69d63d56f232a183a3e7f52790429b871ef 16 | -------------------------------------------------------------------------------- /Visual-RFT/src/scripts/2B_aircraft_4_shot.sh: -------------------------------------------------------------------------------- 1 | timestamp=$1 2 | echo "timestamp: ${timestamp}" 3 | # pg_name="grpo" 4 | pg_name="gpg" 5 | 6 | adjust_gd="true" 7 | 8 | # Wandb 9 | export WANDB_PROJECT="visual-rft" 10 | 11 | DATA_PATH=laolao77/ViRFT_CLS_fgvc_aircraft_4_shot 12 | CKPT_PATH=Qwen2-VL-2B-Instruct 13 | 14 | RUN_NAME=${DATA_PATH##*/}_${CKPT_PATH##*/}_${timestamp} 15 | SAVE_PATH="./output/${pg_name}/${RUN_NAME}" 16 | 17 | export DEBUG_MODE="true" # Enable Debug if you want to see the rollout of model during RL 18 | export LOG_PATH="./${SAVE_PATH}/debug_log.txt" 19 | 20 | mkdir -p ${SAVE_PATH} 21 | 22 | torchrun --master_addr ${MASTER_ADDR} --master-port ${MASTER_PORT} \ 23 | --nnodes ${WORLD_SIZE} --node_rank ${RANK} --nproc-per-node=${GPUS} \ 24 | src/virft/src/open_r1/grpo_classification.py \ 25 | --output_dir ${SAVE_PATH} \ 26 | --model_name_or_path ${CKPT_PATH} \ 27 | --dataset_name ${DATA_PATH} \ 28 | --deepspeed src/virft/local_scripts/zero3.json \ 29 | --pg_name ${pg_name} \ 30 | --adjust_gd ${adjust_gd} \ 31 | --temperature 0.9 \ 32 | --max_prompt_length 1024 \ 33 | --per_device_train_batch_size 1 \ 34 | --gradient_accumulation_steps 2 \ 35 |
--logging_steps 1 \ 36 | --bf16 \ 37 | --report_to wandb \ 38 | --gradient_checkpointing false \ 39 | --attn_implementation flash_attention_2 \ 40 | --max_pixels 401408 \ 41 | --num_train_epochs 8 \ 42 | --run_name "${RUN_NAME}" \ 43 | --save_steps 100 \ 44 | --save_only_model true \ 45 | --num_generations 8 \ 46 | 2>&1 | tee -a "./${SAVE_PATH}/training_log.log" 47 | -------------------------------------------------------------------------------- /Visual-RFT/src/scripts/2B_car196_4_shot.sh: -------------------------------------------------------------------------------- 1 | timestamp=$1 2 | echo "timestamp: ${timestamp}" 3 | # pg_name="grpo" 4 | pg_name="gpg" 5 | 6 | adjust_gd="true" 7 | 8 | # Wandb 9 | export WANDB_PROJECT="visual-rft" 10 | 11 | DATA_PATH=laolao77/ViRFT_CLS_car196_4shot 12 | CKPT_PATH=Qwen2-VL-2B-Instruct 13 | 14 | RUN_NAME=${DATA_PATH##*/}_${CKPT_PATH##*/}_${timestamp} 15 | SAVE_PATH="./output/${pg_name}/${RUN_NAME}" 16 | 17 | export DEBUG_MODE="true" # Enable Debug if you want to see the rollout of model during RL 18 | export LOG_PATH="./${SAVE_PATH}/debug_log.txt" 19 | 20 | mkdir -p ${SAVE_PATH} 21 | 22 | torchrun --master_addr ${MASTER_ADDR} --master-port ${MASTER_PORT} \ 23 | --nnodes ${WORLD_SIZE} --node_rank ${RANK} --nproc-per-node=${GPUS} \ 24 | src/virft/src/open_r1/grpo_classification.py \ 25 | --output_dir ${SAVE_PATH} \ 26 | --model_name_or_path ${CKPT_PATH} \ 27 | --dataset_name ${DATA_PATH} \ 28 | --deepspeed src/virft/local_scripts/zero3.json \ 29 | --pg_name ${pg_name} \ 30 | --adjust_gd ${adjust_gd} \ 31 | --temperature 0.9 \ 32 | --max_prompt_length 1024 \ 33 | --per_device_train_batch_size 1 \ 34 | --gradient_accumulation_steps 2 \ 35 | --logging_steps 1 \ 36 | --bf16 \ 37 | --report_to wandb \ 38 | --gradient_checkpointing false \ 39 | --attn_implementation flash_attention_2 \ 40 | --max_pixels 401408 \ 41 | --num_train_epochs 8 \ 42 | --run_name "${RUN_NAME}" \ 43 | --save_steps 100 \ 44 | --save_only_model true \ 45 | --num_generations 8 \ 46 | 2>&1 | tee -a "./${SAVE_PATH}/training_log.log" 47 | -------------------------------------------------------------------------------- /Visual-RFT/src/scripts/2B_flower_4_shot.sh: -------------------------------------------------------------------------------- 1 | timestamp=$1 2 | echo "timestamp: ${timestamp}" 3 | # pg_name="grpo" 4 | pg_name="gpg" 5 | 6 | adjust_gd="true" 7 | 8 | # Wandb 9 | export WANDB_PROJECT="visual-rft" 10 | 11 | DATA_PATH=laolao77/ViRFT_CLS_flower_4_shot 12 | CKPT_PATH=Qwen2-VL-2B-Instruct 13 | 14 | RUN_NAME=${DATA_PATH##*/}_${CKPT_PATH##*/}_${timestamp} 15 | SAVE_PATH="./output/${pg_name}/${RUN_NAME}" 16 | 17 | export DEBUG_MODE="true" # Enable Debug if you want to see the rollout of model during RL 18 | export LOG_PATH="./${SAVE_PATH}/debug_log.txt" 19 | 20 | mkdir -p ${SAVE_PATH} 21 | 22 | torchrun --master_addr ${MASTER_ADDR} --master-port ${MASTER_PORT} \ 23 | --nnodes ${WORLD_SIZE} --node_rank ${RANK} --nproc-per-node=${GPUS} \ 24 | src/virft/src/open_r1/grpo_classification.py \ 25 | --output_dir ${SAVE_PATH} \ 26 | --model_name_or_path ${CKPT_PATH} \ 27 | --dataset_name ${DATA_PATH} \ 28 | --deepspeed src/virft/local_scripts/zero3.json \ 29 | --pg_name ${pg_name} \ 30 | --adjust_gd ${adjust_gd} \ 31 | --temperature 0.9 \ 32 | --max_prompt_length 1024 \ 33 | --per_device_train_batch_size 1 \ 34 | --gradient_accumulation_steps 2 \ 35 | --logging_steps 1 \ 36 | --bf16 \ 37 | --report_to wandb \ 38 | --gradient_checkpointing false \ 39 | --attn_implementation flash_attention_2 \ 40 
| --max_pixels 401408 \ 41 | --num_train_epochs 8 \ 42 | --run_name "${RUN_NAME}" \ 43 | --save_steps 100 \ 44 | --save_only_model true \ 45 | --num_generations 8 \ 46 | 2>&1 | tee -a "./${SAVE_PATH}/training_log.log" 47 | -------------------------------------------------------------------------------- /Visual-RFT/src/scripts/2B_lisa_grounding.sh: -------------------------------------------------------------------------------- 1 | timestamp=$1 2 | echo "timestamp: ${timestamp}" 3 | # pg_name="grpo" 4 | pg_name="gpg" 5 | 6 | adjust_gd="true" 7 | 8 | # Wandb 9 | export WANDB_PROJECT="visual-rft" 10 | 11 | DATA_PATH=NOT_USED 12 | CKPT_PATH=Qwen2-VL-2B-Instruct 13 | 14 | RUN_NAME=${DATA_PATH##*/}_${CKPT_PATH##*/}_${timestamp} 15 | SAVE_PATH="./output/${pg_name}/${RUN_NAME}" 16 | 17 | export DEBUG_MODE="true" # Enable Debug if you want to see the rollout of model during RL 18 | export LOG_PATH="./${SAVE_PATH}/debug_log.txt" 19 | 20 | mkdir -p ${SAVE_PATH} 21 | 22 | torchrun --master_addr ${MASTER_ADDR} --master-port ${MASTER_PORT} \ 23 | --nnodes ${WORLD_SIZE} --node_rank ${RANK} --nproc-per-node=${GPUS} \ 24 | src/virft/src/open_r1/grpo_lisa.py \ 25 | --output_dir ${SAVE_PATH} \ 26 | --model_name_or_path ${CKPT_PATH} \ 27 | --dataset_name NOT_USED \ 28 | --deepspeed src/virft/local_scripts/zero3.json \ 29 | --pg_name ${pg_name} \ 30 | --adjust_gd ${adjust_gd} \ 31 | --temperature 0.9 \ 32 | --max_prompt_length 1024 \ 33 | --per_device_train_batch_size 1 \ 34 | --gradient_accumulation_steps 2 \ 35 | --logging_steps 1 \ 36 | --bf16 \ 37 | --report_to wandb \ 38 | --gradient_checkpointing true \ 39 | --attn_implementation flash_attention_2 \ 40 | --max_pixels 401408 \ 41 | --num_train_epochs 6 \ 42 | --run_name "${RUN_NAME}" \ 43 | --save_steps 50 \ 44 | --save_only_model true \ 45 | --num_generations 8 \ 46 | 2>&1 | tee -a "./${SAVE_PATH}/training_log.log" 47 | -------------------------------------------------------------------------------- /Visual-RFT/src/scripts/2B_pets37_4_shot.sh: -------------------------------------------------------------------------------- 1 | timestamp=$1 2 | echo "timestamp: ${timestamp}" 3 | # pg_name="grpo" 4 | pg_name="gpg" 5 | 6 | adjust_gd="true" 7 | 8 | # Wandb 9 | export WANDB_PROJECT="visual-rft" 10 | 11 | DATA_PATH=laolao77/ViRFT_CLS_pets37_4shot 12 | CKPT_PATH=Qwen2-VL-2B-Instruct 13 | 14 | RUN_NAME=${DATA_PATH##*/}_${CKPT_PATH##*/}_${timestamp} 15 | SAVE_PATH="./output/${pg_name}/${RUN_NAME}" 16 | 17 | export DEBUG_MODE="true" # Enable Debug if you want to see the rollout of model during RL 18 | export LOG_PATH="./${SAVE_PATH}/debug_log.txt" 19 | 20 | mkdir -p ${SAVE_PATH} 21 | 22 | torchrun --master_addr ${MASTER_ADDR} --master-port ${MASTER_PORT} \ 23 | --nnodes ${WORLD_SIZE} --node_rank ${RANK} --nproc-per-node=${GPUS} \ 24 | src/virft/src/open_r1/grpo_classification.py \ 25 | --output_dir ${SAVE_PATH} \ 26 | --model_name_or_path ${CKPT_PATH} \ 27 | --dataset_name ${DATA_PATH} \ 28 | --deepspeed src/virft/local_scripts/zero3.json \ 29 | --pg_name ${pg_name} \ 30 | --adjust_gd ${adjust_gd} \ 31 | --temperature 0.9 \ 32 | --max_prompt_length 1024 \ 33 | --per_device_train_batch_size 1 \ 34 | --gradient_accumulation_steps 2 \ 35 | --logging_steps 1 \ 36 | --bf16 \ 37 | --report_to wandb \ 38 | --gradient_checkpointing false \ 39 | --attn_implementation flash_attention_2 \ 40 | --max_pixels 401408 \ 41 | --num_train_epochs 24 \ 42 | --run_name "${RUN_NAME}" \ 43 | --save_steps 100 \ 44 | --save_only_model true \ 45 | --num_generations 8 \ 46 | 2>&1 | tee -a 
"./${SAVE_PATH}/training_log.log" 47 | -------------------------------------------------------------------------------- /Visual-RFT/src/virft/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: style quality 2 | 3 | # make sure to test the local checkout in scripts and not the pre-installed one (don't use quotes!) 4 | export PYTHONPATH = src 5 | 6 | check_dirs := src 7 | 8 | style: 9 | black --line-length 119 --target-version py310 $(check_dirs) setup.py 10 | isort $(check_dirs) setup.py 11 | 12 | quality: 13 | black --check --line-length 119 --target-version py310 $(check_dirs) setup.py 14 | isort --check-only $(check_dirs) setup.py 15 | flake8 --max-line-length 119 $(check_dirs) setup.py 16 | 17 | 18 | # Evaluation 19 | 20 | evaluate: 21 | -------------------------------------------------------------------------------- /Visual-RFT/src/virft/README.md: -------------------------------------------------------------------------------- 1 | # Visual-RFT 2 | -------------------------------------------------------------------------------- /Visual-RFT/src/virft/configs/ddp.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | distributed_type: MULTI_GPU 4 | downcast_bf16: 'no' 5 | gpu_ids: all 6 | machine_rank: 0 7 | main_training_function: main 8 | mixed_precision: bf16 9 | num_machines: 1 10 | num_processes: 8 11 | rdzv_backend: static 12 | same_network: true 13 | tpu_env: [] 14 | tpu_use_cluster: false 15 | tpu_use_sudo: false 16 | use_cpu: false 17 | -------------------------------------------------------------------------------- /Visual-RFT/src/virft/configs/zero2.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | deepspeed_multinode_launcher: standard 5 | offload_optimizer_device: none 6 | offload_param_device: none 7 | zero3_init_flag: false 8 | zero_stage: 2 9 | distributed_type: DEEPSPEED 10 | downcast_bf16: 'no' 11 | machine_rank: 0 12 | main_training_function: main 13 | mixed_precision: bf16 14 | num_machines: 1 15 | num_processes: 8 16 | rdzv_backend: static 17 | same_network: true 18 | tpu_env: [] 19 | tpu_use_cluster: false 20 | tpu_use_sudo: false 21 | use_cpu: false -------------------------------------------------------------------------------- /Visual-RFT/src/virft/configs/zero3.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | deepspeed_multinode_launcher: standard 5 | offload_optimizer_device: none 6 | offload_param_device: none 7 | zero3_init_flag: true 8 | zero3_save_16bit_model: true 9 | zero_stage: 3 10 | distributed_type: DEEPSPEED 11 | downcast_bf16: 'no' 12 | machine_rank: 0 13 | main_training_function: main 14 | mixed_precision: bf16 15 | num_machines: 1 16 | num_processes: 8 17 | rdzv_backend: static 18 | same_network: true 19 | tpu_env: [] 20 | tpu_use_cluster: false 21 | tpu_use_sudo: false 22 | use_cpu: false 23 | -------------------------------------------------------------------------------- /Visual-RFT/src/virft/local_scripts/train_qwen2_vl.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export NCCL_BLOCKING_WAIT=0 4 | export TOKENIZERS_PARALLELISM=false 5 | export OMP_NUM_THREADS=8 6 | export NCCL_IB_DISABLE=0 7 | export 
NCCL_IB_GID_INDEX=3 8 | export NCCL_SOCKET_IFNAME=eth0 9 | export NCCL_DEBUG=INFO 10 | 11 | GPUS="0,1,2,3,4,5,6,7" 12 | 13 | # Take the first port of worker0 14 | ports=($(echo $METIS_WORKER_0_PORT | tr ',' ' ')) 15 | port=${ports[0]} 16 | port_in_cmd="$(echo "${METIS_WORKER_0_PORT:-2000}" | awk -F',' '{print $1}')" 17 | 18 | echo "total workers: ${ARNOLD_WORKER_NUM}" 19 | echo "cur worker id: ${ARNOLD_ID}" 20 | echo "gpus per worker: ${ARNOLD_WORKER_GPU}" 21 | echo "master ip: ${METIS_WORKER_0_HOST}" 22 | echo "master port: ${port}" 23 | echo "master port in cmd: ${port_in_cmd}" 24 | 25 | # export WANDB_BASE_URL=https://api.wandb.ai 26 | # export WANDB_API_KEY="" 27 | # wandb login $WANDB_API_KEY 28 | 29 | export WANDB_BASE_URL=https://api.wandb.ai 30 | export WANDB_PROJECT=vision-reasoning 31 | export WANDB_API_KEY="" 32 | export WANDB_RUN_NAME=Qwen-VL-2B-GRPO-$(date +%Y-%m-%d-%H-%M-%S) 33 | wandb login $WANDB_API_KEY 34 | 35 | cd /home/tiger/multimodal-open-r1 36 | # pip3 install vllm==0.6.6.post1 37 | pip3 install -e ".[dev]" 38 | pip3 install wandb==0.18.3 39 | 40 | torchrun --nproc_per_node="${ARNOLD_WORKER_GPU}" \ 41 | --nnodes="${ARNOLD_WORKER_NUM}" \ 42 | --node_rank="${ARNOLD_ID}" \ 43 | --master_addr="${METIS_WORKER_0_HOST}" \ 44 | --master_port="${port_in_cmd}" \ 45 | src/open_r1/grpo.py \ 46 | --deepspeed scripts/zero3.json \ 47 | --output_dir checkpoints/${WANDB_RUN_NAME} \ 48 | --model_name_or_path Qwen/Qwen2-VL-2B-Instruct \ 49 | --dataset_name luodian/${DATASET_NAME} \ 50 | --max_prompt_length 8192 \ 51 | --per_device_train_batch_size 1 \ 52 | --gradient_accumulation_steps 1 \ 53 | --logging_steps 1 \ 54 | --bf16 \ 55 | --report_to wandb \ 56 | --gradient_checkpointing true \ 57 | --attn_implementation flash_attention_2 \ 58 | --max_pixels 2359296 \ 59 | --save_total_limit 8 \ 60 | --num_train_epochs 1 \ 61 | --run_name $WANDB_RUN_NAME 62 | -------------------------------------------------------------------------------- /Visual-RFT/src/virft/local_scripts/zero2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "optimizer": { 14 | "type": "AdamW", 15 | "params": { 16 | "lr": "auto", 17 | "betas": "auto", 18 | "eps": "auto", 19 | "weight_decay": "auto" 20 | } 21 | }, 22 | "zero_optimization": { 23 | "stage": 2, 24 | "offload_optimizer": { 25 | "device": "none", 26 | "pin_memory": true 27 | }, 28 | "allgather_partitions": true, 29 | "allgather_bucket_size": 2e8, 30 | "overlap_comm": false, 31 | "reduce_scatter": true, 32 | "reduce_bucket_size": 2e8, 33 | "contiguous_gradients": true 34 | }, 35 | "gradient_accumulation_steps": "auto", 36 | "gradient_clipping": "auto", 37 | "steps_per_print": 100, 38 | "train_batch_size": "auto", 39 | "train_micro_batch_size_per_gpu": "auto", 40 | "wall_clock_breakdown": false 41 | } -------------------------------------------------------------------------------- /Visual-RFT/src/virft/local_scripts/zero3.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | 14 | "zero_optimization": { 15 | "stage": 3, 16 | "offload_optimizer": { 17 |
"device": "none", 18 | "pin_memory": true 19 | }, 20 | "offload_param": { 21 | "device": "none", 22 | "pin_memory": true 23 | }, 24 | "overlap_comm": true, 25 | "contiguous_gradients": true, 26 | "sub_group_size": 1e9, 27 | "reduce_bucket_size": "auto", 28 | "stage3_prefetch_bucket_size": "auto", 29 | "stage3_param_persistence_threshold": "auto", 30 | "stage3_max_live_parameters": 1e9, 31 | "stage3_max_reuse_distance": 1e9, 32 | "stage3_gather_16bit_weights_on_model_save": true 33 | }, 34 | 35 | "gradient_accumulation_steps": "auto", 36 | "gradient_clipping": "auto", 37 | "steps_per_print": 100, 38 | "train_batch_size": "auto", 39 | "train_micro_batch_size_per_gpu": "auto", 40 | "wall_clock_breakdown": false 41 | } -------------------------------------------------------------------------------- /Visual-RFT/src/virft/local_scripts/zero3.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | deepspeed_multinode_launcher: standard 5 | offload_optimizer_device: none 6 | offload_param_device: none 7 | zero3_init_flag: true 8 | zero3_save_16bit_model: true 9 | zero_stage: 3 10 | distributed_type: DEEPSPEED 11 | downcast_bf16: 'no' 12 | machine_rank: 0 13 | main_training_function: main 14 | mixed_precision: bf16 15 | num_machines: 1 16 | num_processes: 8 17 | rdzv_backend: static 18 | same_network: true 19 | tpu_env: [] 20 | tpu_use_cluster: false 21 | tpu_use_sudo: false 22 | use_cpu: false 23 | -------------------------------------------------------------------------------- /Visual-RFT/src/virft/local_scripts/zero3_offload.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "optimizer": { 14 | "type": "AdamW", 15 | "params": { 16 | "lr": "auto", 17 | "betas": "auto", 18 | "eps": "auto", 19 | "weight_decay": "auto" 20 | } 21 | }, 22 | "zero_optimization": { 23 | "stage": 3, 24 | "offload_optimizer": { 25 | "device": "cpu", 26 | "pin_memory": true 27 | }, 28 | "offload_param": { 29 | "device": "cpu", 30 | "pin_memory": true 31 | }, 32 | "overlap_comm": true, 33 | "contiguous_gradients": true, 34 | "sub_group_size": 1e9, 35 | "reduce_bucket_size": "auto", 36 | "stage3_prefetch_bucket_size": "auto", 37 | "stage3_param_persistence_threshold": "auto", 38 | "stage3_max_live_parameters": 1e9, 39 | "stage3_max_reuse_distance": 1e9, 40 | "gather_16bit_weights_on_model_save": true 41 | }, 42 | "gradient_accumulation_steps": "auto", 43 | "gradient_clipping": "auto", 44 | "train_batch_size": "auto", 45 | "train_micro_batch_size_per_gpu": "auto", 46 | "steps_per_print": 1e5, 47 | "wall_clock_breakdown": false 48 | } -------------------------------------------------------------------------------- /Visual-RFT/src/virft/setup.cfg: -------------------------------------------------------------------------------- 1 | [isort] 2 | default_section = FIRSTPARTY 3 | ensure_newline_before_comments = True 4 | force_grid_wrap = 0 5 | include_trailing_comma = True 6 | known_first_party = open_r1 7 | known_third_party = 8 | transformers 9 | datasets 10 | fugashi 11 | git 12 | h5py 13 | matplotlib 14 | nltk 15 | numpy 16 | packaging 17 | pandas 18 | psutil 19 | pytest 20 | rouge_score 21 | sacrebleu 22 | seqeval 23 | sklearn 24 | streamlit 25 | torch 26 
| tqdm 27 | 28 | line_length = 119 29 | lines_after_imports = 2 30 | multi_line_output = 3 31 | use_parentheses = True 32 | 33 | [flake8] 34 | ignore = E203, E501, E741, W503, W605 35 | max-line-length = 119 36 | per-file-ignores = 37 | # imported but unused 38 | __init__.py: F401 39 | 40 | [tool:pytest] 41 | doctest_optionflags=NUMBER NORMALIZE_WHITESPACE ELLIPSIS -------------------------------------------------------------------------------- /Visual-RFT/src/virft/slurm/evaluate.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=open-r1-evaluate 3 | #SBATCH --nodes=1 4 | #SBATCH --ntasks-per-node=1 5 | #SBATCH --exclusive 6 | #SBATCH --gres=gpu:8 7 | #SBATCH --partition=hopper-prod 8 | #SBATCH --time=01:59:00 9 | #SBATCH --output=./logs/evaluate/%x-%j.out 10 | #SBATCH --err=./logs/evaluate/%x-%j.err 11 | 12 | # Usage: sbatch slurm/evaluate.slurm deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B aime24 13 | 14 | set -x -e 15 | 16 | source ~/.bashrc 17 | conda activate openr1 18 | module load cuda/12.1 19 | echo "START TIME: $(date)" 20 | echo "PYTHON ENV: $(which python)" 21 | 22 | 23 | NUM_GPUS=8 24 | MODEL=$1 25 | TASK=$2 26 | MODEL_ARGS="pretrained=$MODEL,dtype=float16,data_parallel_size=$NUM_GPUS,max_model_length=32768,gpu_memory_utilisation=0.8" 27 | OUTPUT_DIR=data/evals/$MODEL 28 | 29 | 30 | # force crashing on nccl issues like hanging broadcast 31 | export NCCL_ASYNC_ERROR_HANDLING=1 32 | # export NCCL_DEBUG=INFO 33 | # export NCCL_DEBUG_SUBSYS=COLL 34 | # export NCCL_SOCKET_NTHREADS=1 35 | # export NCCL_NSOCKS_PERTHREAD=1 36 | # export CUDA_LAUNCH_BLOCKING=1 37 | 38 | # Specific configuration optimized for the Hugging Face Compute Cluster 39 | # Be ye warned this may not work on other clusters! 40 | module load cuda/12.1 41 | 42 | lighteval vllm $MODEL_ARGS "custom|$TASK|0|0" \ 43 | --custom-tasks src/open_r1/evaluate.py \ 44 | --use-chat-template \ 45 | --system-prompt="Please reason step by step, and put your final answer within \boxed{}." 
\ 46 | --output-dir $OUTPUT_DIR 47 | 48 | 49 | echo "END TIME: $(date)" 50 | -------------------------------------------------------------------------------- /Visual-RFT/src/virft/src/open_r1/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AMAP-ML/GPG/e096da8ce527a1636272469cee383ba71ec4a511/Visual-RFT/src/virft/src/open_r1/__init__.py -------------------------------------------------------------------------------- /Visual-RFT/src/virft/src/open_r1/trainer/__init__.py: -------------------------------------------------------------------------------- 1 | from .grpo_trainer import Qwen2VLGRPOTrainer 2 | from .vllm_grpo_trainer import Qwen2VLGRPOVLLMTrainer 3 | 4 | __all__ = ["Qwen2VLGRPOTrainer", "Qwen2VLGRPOVLLMTrainer"] 5 | -------------------------------------------------------------------------------- /Visual-RFT/test.sh: -------------------------------------------------------------------------------- 1 | export MASTER_ADDR=127.0.0.1 2 | export MASTER_PORT=21231 3 | export WORLD_SIZE=1 4 | export RANK=0 5 | export GPUS=2 6 | 7 | timestamp=$(date "+%Y%m%d%H%M%S") 8 | 9 | # OMP_NUM_THREADS=4 bash ./src/scripts/2B_base65cate_6k.sh ${timestamp} 10 | 11 | # OMP_NUM_THREADS=4 bash ./src/scripts/2B_aircraft_4_shot.sh ${timestamp} 12 | 13 | OMP_NUM_THREADS=4 bash ./src/scripts/2B_lisa_grounding.sh ${timestamp} 14 | -------------------------------------------------------------------------------- /VisualThinker-R1-Zero/.gitignore: -------------------------------------------------------------------------------- 1 | *.log 2 | *.tmp 3 | *.csv 4 | *.json 5 | *.parquet 6 | *.png 7 | *.jpg 8 | 9 | # dependency directories 10 | src/open-r1-multimodal/src/open_r1/__pycache__/ 11 | 12 | # Python cache 13 | __pycache__/ 14 | 15 | # Egg info 16 | *.egg-info/ 17 | 18 | # wandb 19 | src/open-r1-multimodal/wandb/ 20 | 21 | # folder 22 | src/open-r1-multimodal/trajectories/ 23 | 24 | # outputs 25 | src/open-r1-multimodal/output/ 26 | output 27 | trajectories 28 | model -------------------------------------------------------------------------------- /VisualThinker-R1-Zero/requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate>=1.2.1 2 | bitsandbytes>=0.43.0 3 | black>=24.4.2 4 | datasets>=3.2.0 5 | deepspeed==0.15.4 6 | distilabel[vllm,ray,openai]>=1.5.2 7 | einops>=0.8.0 8 | flake8>=6.0.0 9 | hf_transfer>=0.1.4 10 | huggingface-hub[cli]>=0.19.2,<1.0 11 | isort>=5.12.0 12 | liger_kernel==0.5.2 13 | # lighteval @ git+https://githubfast.com/huggingface/lighteval.git@4f381b352c0e467b5870a97d41cb66b487a2c503#egg=lighteval[math] 14 | math-verify 15 | packaging>=23.0 16 | parameterized>=0.9.0 17 | pytest 18 | safetensors>=0.3.3 19 | sentencepiece>=0.1.99 20 | torch>=2.5.1 21 | # transformers @ git+https://githubfast.com/huggingface/transformers.git@main 22 | trl==0.14.0 23 | vllm==0.6.6.post1 24 | wandb>=0.19.1 25 | pillow 26 | timm -------------------------------------------------------------------------------- /VisualThinker-R1-Zero/setup.sh: -------------------------------------------------------------------------------- 1 | conda create -n visual_thinker python=3.11 2 | conda activate visual_thinker 3 | 4 | # Install the packages in open-r1-multimodal . 
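# A minimal pre-check one might run before the flash-attn step below (a hedged
# sketch, not one of the original setup commands): flash-attn is installed with
# --no-build-isolation, which assumes torch is already importable in the active
# visual_thinker env, e.g.
#   python -c "import torch; print(torch.__version__, torch.cuda.is_available())"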
5 | cd src/open-r1-multimodal 6 | pip install -e ".[dev]" 7 | 8 | # Additional modules 9 | pip install wandb==0.18.3 10 | pip install tensorboardx tensorboard 11 | pip install qwen_vl_utils torchvision 12 | pip install flash-attn --no-build-isolation 13 | 14 | pip install transformers==4.49.0 # for correct DeepSpeed support 15 | pip install duckdb 16 | pip install opencv-python 17 | pip install pandas 18 | pip install math_verify==0.5.2 19 | pip install datasets 20 | pip install accelerate 21 | pip install deepspeed 22 | -------------------------------------------------------------------------------- /VisualThinker-R1-Zero/src/data/SAT/prepare_dataset.sh: -------------------------------------------------------------------------------- 1 | # Download the dataset parquet and rename it 2 | wget -O SAT_train.parquet "https://hf-mirror.com/datasets/array/SAT/resolve/main/SAT_train.parquet?download=true" 3 | 4 | # Create the dataset directory 5 | mkdir -p SAT_images_train 6 | 7 | # Process the dataset 8 | python process_dataset.py -------------------------------------------------------------------------------- /VisualThinker-R1-Zero/src/open-r1-multimodal/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: style quality 2 | 3 | # make sure to test the local checkout in scripts and not the pre-installed one (don't use quotes!) 4 | export PYTHONPATH = src 5 | 6 | check_dirs := src 7 | 8 | style: 9 | black --line-length 119 --target-version py310 $(check_dirs) setup.py 10 | isort $(check_dirs) setup.py 11 | 12 | quality: 13 | black --check --line-length 119 --target-version py310 $(check_dirs) setup.py 14 | isort --check-only $(check_dirs) setup.py 15 | flake8 --max-line-length 119 $(check_dirs) setup.py 16 | 17 | 18 | # Evaluation 19 | 20 | evaluate: 21 | -------------------------------------------------------------------------------- /VisualThinker-R1-Zero/src/open-r1-multimodal/configs/ddp.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | distributed_type: MULTI_GPU 4 | downcast_bf16: 'no' 5 | gpu_ids: all 6 | machine_rank: 0 7 | main_training_function: main 8 | mixed_precision: bf16 9 | num_machines: 1 10 | num_processes: 8 11 | rdzv_backend: static 12 | same_network: true 13 | tpu_env: [] 14 | tpu_use_cluster: false 15 | tpu_use_sudo: false 16 | use_cpu: false 17 | -------------------------------------------------------------------------------- /VisualThinker-R1-Zero/src/open-r1-multimodal/configs/zero2.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | deepspeed_multinode_launcher: standard 5 | offload_optimizer_device: none 6 | offload_param_device: none 7 | zero3_init_flag: false 8 | zero_stage: 2 9 | distributed_type: DEEPSPEED 10 | downcast_bf16: 'no' 11 | # machine_rank: 0 12 | main_training_function: main 13 | mixed_precision: bf16 14 | # num_machines: 1 15 | # num_processes: 2 16 | # main_process_port: 44326 17 | rdzv_backend: static 18 | same_network: true 19 | tpu_env: [] 20 | tpu_use_cluster: false 21 | tpu_use_sudo: false 22 | use_cpu: false -------------------------------------------------------------------------------- /VisualThinker-R1-Zero/src/open-r1-multimodal/configs/zero3.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 |
deepspeed_config: 4 | deepspeed_multinode_launcher: standard 5 | offload_optimizer_device: none 6 | offload_param_device: none 7 | zero3_init_flag: true 8 | zero3_save_16bit_model: true 9 | zero_stage: 3 10 | distributed_type: DEEPSPEED 11 | downcast_bf16: 'no' 12 | machine_rank: 0 13 | main_training_function: main 14 | mixed_precision: bf16 15 | num_machines: 1 16 | num_processes: 4 17 | main_process_port: 22316 18 | rdzv_backend: static 19 | same_network: true 20 | tpu_env: [] 21 | tpu_use_cluster: false 22 | tpu_use_sudo: false 23 | use_cpu: false 24 | -------------------------------------------------------------------------------- /VisualThinker-R1-Zero/src/open-r1-multimodal/prepare_2B_base.sh: -------------------------------------------------------------------------------- 1 | # Prepare base model with chat template for SFT training 2 | git lfs install 3 | git clone https://huggingface.co/Qwen/Qwen2-VL-2B 4 | mv Qwen2-VL-2B Qwen2-VL-2B-Base 5 | 6 | huggingface-cli download Qwen/Qwen2-VL-2B-Instruct chat_template.json tokenizer_config.json --local-dir ./Qwen2-VL-2B-Base 7 | 8 | -------------------------------------------------------------------------------- /VisualThinker-R1-Zero/src/open-r1-multimodal/run_grpo.sh: -------------------------------------------------------------------------------- 1 | export DEBUG_MODE="true" 2 | export LOG_PATH="./debug_log_2b.txt" 3 | 4 | 5 | 6 | torchrun --nproc_per_node="8" \ 7 | --nnodes="1" \ 8 | --node_rank="0" \ 9 | --master_addr="127.0.0.1" \ 10 | --master_port="12345" \ 11 | src/open_r1/grpo.py \ 12 | --output_dir \ 13 | --model_name_or_path \ 14 | --dataset_name \ 15 | --max_prompt_length 1024 \ 16 | --per_device_train_batch_size 1 \ 17 | --gradient_accumulation_steps 2 \ 18 | --logging_steps 1 \ 19 | --bf16 \ 20 | --report_to wandb \ 21 | --gradient_checkpointing false \ 22 | --attn_implementation flash_attention_2 \ 23 | --max_pixels 401408 \ 24 | --num_train_epochs 2 \ 25 | --run_name Qwen2-VL-2B-GRPO-CLEVR-70k \ 26 | --save_steps 100 \ 27 | --save_only_model true -------------------------------------------------------------------------------- /VisualThinker-R1-Zero/src/open-r1-multimodal/run_grpo_SAT.sh: -------------------------------------------------------------------------------- 1 | timestamp=$1 2 | echo "timestamp: ${timestamp}" 3 | pg_name="gpg" 4 | adjust_gd="true" 5 | min_inverse_alpha="0.4" 6 | 7 | # Wandb 8 | export WANDB_PROJECT="VisualThinker-R1-Zero" 9 | 10 | DATA_PATH=SAT 11 | CKPT_PATH=Qwen2-VL-2B 12 | 13 | RUN_NAME=${DATA_PATH##*/}_${CKPT_PATH##*/}_${timestamp} 14 | SAVE_PATH="./output/${pg_name}/${RUN_NAME}" 15 | mkdir -p ${SAVE_PATH} 16 | export DEBUG_MODE="true" # Enable Debug if you want to see the rollout of model during RL 17 | export LOG_PATH="./${SAVE_PATH}/debug_log.txt" 18 | # export CUDA_VISIBLE_DEVICES=$(seq -s, 0 $((GPUS-1)) | sed 's/,$//g') 19 | 20 | accelerate launch --config_file=src/open-r1-multimodal/configs/zero2.yaml \ 21 | --main_process_ip ${MASTER_ADDR} --main_process_port ${MASTER_PORT} \ 22 | --num_machines ${WORLD_SIZE} --machine_rank ${RANK} --num_processes ${GPUS} \ 23 | src/open-r1-multimodal/src/open_r1/grpo.py \ 24 | --pg_name ${pg_name} \ 25 | --adjust_gd ${adjust_gd} \ 26 | --min_inverse_alpha ${min_inverse_alpha} \ 27 | --output_dir ${SAVE_PATH} \ 28 | --model_name_or_path ${CKPT_PATH} \ 29 | --dataset_name ${DATA_PATH} \ 30 | --max_prompt_length 1024 \ 31 | --max_completion_length 700 \ 32 | --per_device_train_batch_size 1 \ 33 | --gradient_accumulation_steps 1 \ 34 | --logging_steps 1 \ 
35 | --bf16 \ 36 | --gradient_checkpointing 1 \ 37 | --attn_implementation flash_attention_2 \ 38 | --max_pixels 401408 \ 39 | --num_train_epochs 2 \ 40 | --run_name ${RUN_NAME} \ 41 | --save_steps 100 \ 42 | --save_only_model true \ 43 | --report_to wandb \ 44 | 2>&1 | tee -a "./${SAVE_PATH}/training_log.log" 45 | -------------------------------------------------------------------------------- /VisualThinker-R1-Zero/src/open-r1-multimodal/run_sft.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0,1,2,3 2 | 3 | accelerate launch --config_file=configs/zero3.yaml src/open_r1/sft.py \ 4 | --model_name_or_path \ 5 | --dataset_name \ 6 | --learning_rate 2.0e-5 \ 7 | --num_train_epochs 2 \ 8 | --packing True \ 9 | --max_seq_length 1024 \ 10 | --per_device_train_batch_size 1 \ 11 | --per_device_eval_batch_size 4 \ 12 | --gradient_accumulation_steps 2 \ 13 | --gradient_checkpointing True \ 14 | --report_to wandb \ 15 | --bf16 True \ 16 | --logging_steps 5 \ 17 | --eval_strategy no \ 18 | --output_dir \ 19 | --run_name -------------------------------------------------------------------------------- /VisualThinker-R1-Zero/src/open-r1-multimodal/run_sft_SAT.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0,1,2,3 2 | 3 | accelerate launch --config_file=configs/zero3.yaml src/open_r1/sft.py \ 4 | --model_name_or_path Qwen2-VL-2B-Base \ 5 | --dataset_name SAT \ 6 | --learning_rate 2.0e-5 \ 7 | --num_train_epochs 2 \ 8 | --packing True \ 9 | --max_seq_length 1024 \ 10 | --per_device_train_batch_size 1 \ 11 | --per_device_eval_batch_size 4 \ 12 | --gradient_accumulation_steps 2 \ 13 | --gradient_checkpointing True \ 14 | --report_to wandb \ 15 | --bf16 True \ 16 | --logging_steps 5 \ 17 | --eval_strategy no \ 18 | --save_steps 300 \ 19 | --output_dir outputs/Qwen2_VL-2B-SFT \ 20 | --run_name Qwen2_VL-2B-SFT-SAT -------------------------------------------------------------------------------- /VisualThinker-R1-Zero/src/open-r1-multimodal/setup.cfg: -------------------------------------------------------------------------------- 1 | [isort] 2 | default_section = FIRSTPARTY 3 | ensure_newline_before_comments = True 4 | force_grid_wrap = 0 5 | include_trailing_comma = True 6 | known_first_party = open_r1 7 | known_third_party = 8 | transformers 9 | datasets 10 | fugashi 11 | git 12 | h5py 13 | matplotlib 14 | nltk 15 | numpy 16 | packaging 17 | pandas 18 | psutil 19 | pytest 20 | rouge_score 21 | sacrebleu 22 | seqeval 23 | sklearn 24 | streamlit 25 | torch 26 | tqdm 27 | 28 | line_length = 119 29 | lines_after_imports = 2 30 | multi_line_output = 3 31 | use_parentheses = True 32 | 33 | [flake8] 34 | ignore = E203, E501, E741, W503, W605 35 | max-line-length = 119 36 | per-file-ignores = 37 | # imported but unused 38 | __init__.py: F401 39 | 40 | [tool:pytest] 41 | doctest_optionflags=NUMBER NORMALIZE_WHITESPACE ELLIPSIS -------------------------------------------------------------------------------- /VisualThinker-R1-Zero/src/open-r1-multimodal/src/open_r1/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AMAP-ML/GPG/e096da8ce527a1636272469cee383ba71ec4a511/VisualThinker-R1-Zero/src/open-r1-multimodal/src/open_r1/__init__.py -------------------------------------------------------------------------------- /VisualThinker-R1-Zero/src/open-r1-multimodal/src/open_r1/trainer/__init__.py: 
-------------------------------------------------------------------------------- 1 | from .grpo_trainer import Qwen2VLGRPOTrainer 2 | 3 | 4 | __all__ = ["Qwen2VLGRPOTrainer"] 5 | -------------------------------------------------------------------------------- /VisualThinker-R1-Zero/test.sh: -------------------------------------------------------------------------------- 1 | export MASTER_ADDR=127.0.0.1 2 | export MASTER_PORT=21232 3 | export WORLD_SIZE=1 4 | export RANK=0 5 | export GPUS=2 6 | 7 | timestamp=$(date "+%Y%m%d%H%M%S") 8 | 9 | # OMP_NUM_THREADS=4 bash ./src/scripts/2B_base65cate_6k.sh ${timestamp} 10 | 11 | # OMP_NUM_THREADS=4 bash ./src/scripts/2B_aircraft_4_shot.sh ${timestamp} 12 | 13 | # OMP_NUM_THREADS=4 bash ./src/scripts/2B_lisa_grounding.sh ${timestamp} 14 | OMP_NUM_THREADS=4 bash ./src/open-r1-multimodal/run_grpo_SAT.sh ${timestamp} 15 | -------------------------------------------------------------------------------- /docs/images/GPG.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AMAP-ML/GPG/e096da8ce527a1636272469cee383ba71ec4a511/docs/images/GPG.png -------------------------------------------------------------------------------- /open-r1/.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "pip" 4 | directory: "/" 5 | schedule: 6 | interval: "weekly" 7 | - package-ecosystem: "github-actions" 8 | directory: "/" 9 | schedule: 10 | interval: "weekly" 11 | -------------------------------------------------------------------------------- /open-r1/.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - v*-release 8 | pull_request: 9 | branches: 10 | - main 11 | 12 | jobs: 13 | 14 | tests: 15 | name: Run tests and quality checks 16 | runs-on: ubuntu-latest 17 | steps: 18 | - name: Checkout code 19 | uses: actions/checkout@v4 20 | - name: Setup Python environment 21 | uses: actions/setup-python@v5 22 | with: 23 | python-version: 3.10.10 24 | - name: Install dependencies 25 | run: | 26 | python -m pip install --upgrade pip 27 | python -m pip install ".[quality,tests]" 28 | - name: Code quality 29 | run: | 30 | make quality 31 | - name: Run tests 32 | run: | 33 | make test 34 | 35 | -------------------------------------------------------------------------------- /open-r1/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: style quality 2 | 3 | # make sure to test the local checkout in scripts and not the pre-installed one (don't use quotes!) 4 | export PYTHONPATH = src 5 | 6 | check_dirs := src tests 7 | 8 | 9 | # dev dependencies 10 | install: 11 | uv venv openr1 --python 3.11 && . 
openr1/bin/activate && uv pip install --upgrade pip 12 | uv pip install vllm==0.7.2 13 | uv pip install setuptools 14 | uv pip install flash-attn --no-build-isolation 15 | GIT_LFS_SKIP_SMUDGE=1 uv pip install -e ".[dev]" 16 | 17 | style: 18 | ruff format --line-length 119 --target-version py310 $(check_dirs) setup.py 19 | isort $(check_dirs) setup.py 20 | 21 | quality: 22 | ruff check --line-length 119 --target-version py310 $(check_dirs) setup.py 23 | isort --check-only $(check_dirs) setup.py 24 | flake8 --max-line-length 119 $(check_dirs) setup.py 25 | 26 | test: 27 | pytest -sv --ignore=tests/slow/ tests/ 28 | 29 | slow_test: 30 | pytest -sv -vv tests/slow/ 31 | 32 | # Evaluation 33 | 34 | evaluate: 35 | $(eval PARALLEL_ARGS := $(if $(PARALLEL),$(shell \ 36 | if [ "$(PARALLEL)" = "data" ]; then \ 37 | echo "data_parallel_size=$(NUM_GPUS)"; \ 38 | elif [ "$(PARALLEL)" = "tensor" ]; then \ 39 | echo "tensor_parallel_size=$(NUM_GPUS)"; \ 40 | fi \ 41 | ),)) 42 | $(if $(filter tensor,$(PARALLEL)),export VLLM_WORKER_MULTIPROC_METHOD=spawn &&,) \ 43 | MODEL_ARGS="pretrained=$(MODEL),dtype=bfloat16,$(PARALLEL_ARGS),max_model_length=32768,gpu_memory_utilization=0.8,generation_parameters={max_new_tokens:32768,temperature:0.6,top_p:0.95}" && \ 44 | if [ "$(TASK)" = "lcb" ]; then \ 45 | lighteval vllm $$MODEL_ARGS "extended|lcb:codegeneration|0|0" \ 46 | --use-chat-template \ 47 | --output-dir data/evals/$(MODEL); \ 48 | else \ 49 | lighteval vllm $$MODEL_ARGS "custom|$(TASK)|0|0" \ 50 | --custom-tasks src/open_r1/evaluate.py \ 51 | --use-chat-template \ 52 | --output-dir data/evals/$(MODEL); \ 53 | fi 54 | -------------------------------------------------------------------------------- /open-r1/assets/plan-of-attack.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AMAP-ML/GPG/e096da8ce527a1636272469cee383ba71ec4a511/open-r1/assets/plan-of-attack.png -------------------------------------------------------------------------------- /open-r1/recipes/Mistral-Small-24B-Instruct-2501/sft/config_openr1_math.yaml: -------------------------------------------------------------------------------- 1 | # To start the training, run the following command: 2 | # sbatch -N 4 --job-name=mistral_sft slurm/train.slurm Mistral-Small-24B-Instruct-2501 sft numina zero3 3 | 4 | model_name_or_path: mistralai/Mistral-Small-24B-Instruct-2501 5 | model_revision: main 6 | torch_dtype: bfloat16 7 | attn_implementation: flash_attention_2 8 | 9 | # Data training arguments 10 | # dataset_name: yentinglin/s1K-1.1-trl-format 11 | dataset_name: yentinglin/OpenR1-Math-220k-trl-format 12 | preprocessing_num_workers: 8 13 | 14 | # SFT trainer config 15 | bf16: true 16 | do_eval: true 17 | eval_strategy: no 18 | gradient_accumulation_steps: 4 19 | gradient_checkpointing: true 20 | gradient_checkpointing_kwargs: 21 | use_reentrant: false 22 | hub_model_id: Mistral-Small-24B-Instruct-2501-Open-R1-Distill 23 | hub_strategy: every_save 24 | learning_rate: 2.0e-05 25 | log_level: info 26 | logging_steps: 1 27 | logging_strategy: steps 28 | lr_scheduler_type: cosine 29 | packing: true 30 | max_length: 32768 31 | max_steps: -1 32 | num_train_epochs: 5 33 | output_dir: data/Mistral-Small-24B-Instruct-2501-Open-R1-Distill 34 | overwrite_output_dir: true 35 | per_device_eval_batch_size: 1 36 | per_device_train_batch_size: 1 37 | push_to_hub: true 38 | report_to: 39 | - wandb 40 | save_strategy: epoch 41 | seed: 42 42 | warmup_ratio: 0.1 43 | 
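The recipe YAMLs in this section (the Mistral SFT config above and the OlympicCoder / OpenR1-Qwen / Qwen2.5 configs that follow) are plain config files consumed by open_r1's sft.py or grpo.py through TRL's config parser. A minimal single-node launch sketch, hedged: the accelerate config path below follows the usual open-r1 layout and is an assumption here, and the recipe's own header points at slurm/train.slurm for the actual 4-node run:

ACCELERATE_LOG_LEVEL=info accelerate launch \
    --config_file recipes/accelerate_configs/zero3.yaml \
    src/open_r1/sft.py \
    --config recipes/Mistral-Small-24B-Instruct-2501/sft/config_openr1_math.yaml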
-------------------------------------------------------------------------------- /open-r1/recipes/OlympicCoder-32B/sft/config_v00.00.yaml: -------------------------------------------------------------------------------- 1 | # Config for 16 nodes of 8 H100s with FSDP1 2 | # Model arguments 3 | model_name_or_path: Qwen/Qwen2.5-Coder-32B-Instruct 4 | model_revision: main 5 | torch_dtype: bfloat16 6 | attn_implementation: flash_attention_2 7 | 8 | # Data training arguments 9 | dataset_name: open-r1/codeforces-cots 10 | dataset_config: solutions_decontaminated 11 | dataset_num_proc: 12 12 | 13 | # SFT trainer config 14 | bf16: true 15 | do_eval: false 16 | eval_strategy: 'no' 17 | gradient_accumulation_steps: 1 18 | gradient_checkpointing: true 19 | gradient_checkpointing_kwargs: 20 | use_reentrant: false 21 | hub_always_push: true 22 | hub_model_id: OlympicCoder-32B 23 | hub_strategy: every_save 24 | learning_rate: 4.0e-05 25 | log_level: info 26 | logging_steps: 1 27 | logging_strategy: steps 28 | lr_scheduler_type: cosine_with_min_lr 29 | lr_scheduler_kwargs: 30 | min_lr_rate: 0.1 31 | packing: false 32 | max_grad_norm: 0.2 33 | max_length: 22528 # we were unable to train at 32k due to OOM. See https://github.com/huggingface/transformers/issues/35983 for context parallelism support. 34 | max_steps: -1 35 | num_train_epochs: 10 36 | optim: paged_adamw_8bit 37 | output_dir: data/OlympicCoder-32B 38 | overwrite_output_dir: true 39 | per_device_eval_batch_size: 1 40 | per_device_train_batch_size: 1 41 | push_to_hub: true 42 | report_to: 43 | - wandb 44 | save_only_model: true # needed to bypass FSDP errors with saving paged optimizers 45 | save_strategy: epoch 46 | save_total_limit: 1 47 | seed: 42 48 | use_liger: false # fails on multi-node 49 | warmup_ratio: 0.03 -------------------------------------------------------------------------------- /open-r1/recipes/OlympicCoder-7B/sft/config_v00.00.yaml: -------------------------------------------------------------------------------- 1 | # Config for 1 node of 8 H100s with DeepSpeed ZeRO-3 2 | # Model arguments 3 | model_name_or_path: Qwen/Qwen2.5-Coder-7B-Instruct 4 | model_revision: main 5 | torch_dtype: bfloat16 6 | attn_implementation: flash_attention_2 7 | 8 | # Data training arguments 9 | dataset_name: open-r1/codeforces-cots 10 | dataset_config: solutions_decontaminated 11 | dataset_num_proc: 48 12 | 13 | # SFT trainer config 14 | bf16: true 15 | do_eval: false 16 | eval_strategy: 'no' 17 | gradient_accumulation_steps: 8 18 | gradient_checkpointing: true 19 | gradient_checkpointing_kwargs: 20 | use_reentrant: false 21 | hub_model_id: open-r1/OlympicCoder-7B 22 | hub_strategy: every_save 23 | learning_rate: 1.0e-05 24 | log_level: info 25 | logging_steps: 1 26 | logging_strategy: steps 27 | lr_scheduler_type: cosine_with_min_lr 28 | lr_scheduler_kwargs: 29 | min_lr_rate: 0.1 30 | packing: false 31 | max_grad_norm: 0.2 32 | max_length: 32768 33 | max_steps: -1 34 | num_train_epochs: 10 35 | output_dir: data/OlympicCoder-7B 36 | overwrite_output_dir: true 37 | per_device_eval_batch_size: 1 38 | per_device_train_batch_size: 2 39 | push_to_hub: true 40 | report_to: 41 | - wandb 42 | save_strategy: epoch 43 | save_total_limit: 1 44 | seed: 42 45 | use_liger: true 46 | warmup_ratio: 0.03 -------------------------------------------------------------------------------- /open-r1/recipes/OpenR1-Qwen-7B/gpg/config_v0.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: 
data/OpenR1-Qwen-7B-SFT-2nodes/checkpoint-1611 #deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: datas/open-rs/ #datas/OpenR1-Math-220k/ #knoveleng/open-rs 9 | system_prompt: "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer, and put your final answer within \\boxed{{}} . The reasoning process and answer are enclosed within and tags, respectively, i.e., reasoning process here answer here . Note that respond by English, NOT use other languages." 10 | 11 | # GRPO trainer config 12 | bf16: true 13 | use_vllm: true 14 | vllm_device: auto 15 | vllm_enforce_eager: true 16 | vllm_gpu_memory_utilization: 0.7 17 | vllm_max_model_len: 4608 18 | do_eval: false 19 | gradient_accumulation_steps: 2 20 | gradient_checkpointing: true 21 | gradient_checkpointing_kwargs: 22 | use_reentrant: false 23 | hub_model_id: OpenRS-GRPO 24 | hub_strategy: every_save 25 | learning_rate: 1.0e-06 26 | log_completions: false 27 | log_level: info 28 | logging_first_step: true 29 | logging_steps: 1 30 | logging_strategy: steps 31 | lr_scheduler_type: cosine_with_min_lr 32 | lr_scheduler_kwargs: 33 | min_lr_rate: 0.1 34 | max_prompt_length: 512 35 | max_completion_length: 3584 36 | max_steps: 500 37 | num_generations: 7 38 | num_train_epochs: 1 39 | output_dir: data/OpenRS-GPG-7B 40 | overwrite_output_dir: true 41 | per_device_eval_batch_size: 6 42 | per_device_train_batch_size: 2 43 | push_to_hub: false 44 | report_to: 45 | - tensorboard 46 | reward_funcs: 47 | - format 48 | - cosine 49 | reward_weights: 50 | - 1.0 51 | - 2.0 52 | save_strategy: "steps" 53 | save_steps: 50 54 | seed: 42 55 | temperature: 0.7 56 | warmup_ratio: 0.1 -------------------------------------------------------------------------------- /open-r1/recipes/OpenR1-Qwen-7B/gpg/config_v0_ds.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: models/DeepSeek-R1-Distill-Qwen-7B #deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: datas/open-rs/ #datas/OpenR1-Math-220k/ #knoveleng/open-rs 9 | system_prompt: "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer, and put your final answer within \\boxed{{}} . The reasoning process and answer are enclosed within and tags, respectively, i.e., reasoning process here answer here . Note that respond by English, NOT use other languages." 
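# Hedged note on the generation batch arithmetic assumed by this recipe, for a single
# 8-GPU node (not part of the original file): with use_vllm: true, TRL's GRPO/GPG
# trainer typically dedicates one GPU to vLLM, leaving 7 training processes, and
#   7 processes * per_device_train_batch_size (2) = 14 prompts per step,
# which must be evenly divisible by num_generations -- hence 7 generations per prompt
# below rather than 8.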
10 | 11 | # GRPO trainer config 12 | bf16: true 13 | use_vllm: true 14 | vllm_device: auto 15 | vllm_enforce_eager: true 16 | vllm_gpu_memory_utilization: 0.7 17 | vllm_max_model_len: 4608 18 | do_eval: false 19 | gradient_accumulation_steps: 2 20 | gradient_checkpointing: true 21 | gradient_checkpointing_kwargs: 22 | use_reentrant: false 23 | hub_model_id: OpenRS-GRPO 24 | hub_strategy: every_save 25 | learning_rate: 1.0e-06 26 | log_completions: false 27 | log_level: info 28 | logging_first_step: true 29 | logging_steps: 1 30 | logging_strategy: steps 31 | lr_scheduler_type: cosine_with_min_lr 32 | lr_scheduler_kwargs: 33 | min_lr_rate: 0.1 34 | max_prompt_length: 512 35 | max_completion_length: 3584 36 | max_steps: 500 37 | num_generations: 7 38 | num_train_epochs: 1 39 | output_dir: data/OpenRS-GPG-Qwen-7B-DS 40 | overwrite_output_dir: true 41 | per_device_eval_batch_size: 6 42 | per_device_train_batch_size: 2 43 | push_to_hub: false 44 | report_to: 45 | - tensorboard 46 | reward_funcs: 47 | - format 48 | - cosine 49 | reward_weights: 50 | - 1.0 51 | - 2.0 52 | save_strategy: "steps" 53 | save_steps: 50 54 | seed: 42 55 | temperature: 0.7 56 | warmup_ratio: 0.1 -------------------------------------------------------------------------------- /open-r1/recipes/OpenR1-Qwen-7B/gpg/config_v1.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: data/OpenR1-Qwen-7B-SFT-2nodes/checkpoint-1611 #deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: datas/open-rs/ #datas/OpenR1-Math-220k/ #knoveleng/open-rs 9 | system_prompt: "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer, and put your final answer within \\boxed{{}} . The reasoning process and answer are enclosed within and tags, respectively, i.e., reasoning process here answer here . Note that respond by English, NOT use other languages." 
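# Hedged note (not part of the original file): this "wo-std" variant appears to differ
# from config_v0 mainly via scale_rewards: false at the end of the file. With reward
# scaling off, the per-completion advantage is roughly
#   A_i = r_i - mean(r)   over the group,
# instead of (r_i - mean(r)) / std(r); group rewards are centred but not divided by
# their standard deviation.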
10 | 11 | # GRPO trainer config 12 | bf16: true 13 | use_vllm: true 14 | vllm_device: auto 15 | vllm_enforce_eager: true 16 | vllm_gpu_memory_utilization: 0.7 17 | vllm_max_model_len: 4608 18 | do_eval: false 19 | gradient_accumulation_steps: 2 20 | gradient_checkpointing: true 21 | gradient_checkpointing_kwargs: 22 | use_reentrant: false 23 | hub_model_id: OpenRS-GRPO 24 | hub_strategy: every_save 25 | learning_rate: 1.0e-06 26 | log_completions: false 27 | log_level: info 28 | logging_first_step: true 29 | logging_steps: 1 30 | logging_strategy: steps 31 | lr_scheduler_type: cosine_with_min_lr 32 | lr_scheduler_kwargs: 33 | min_lr_rate: 0.1 34 | max_prompt_length: 512 35 | max_completion_length: 3584 36 | max_steps: 500 37 | num_generations: 7 38 | num_train_epochs: 1 39 | output_dir: data/OpenRS-GPG-7B-wo-std 40 | overwrite_output_dir: true 41 | per_device_eval_batch_size: 6 42 | per_device_train_batch_size: 2 43 | push_to_hub: false 44 | report_to: 45 | - tensorboard 46 | reward_funcs: 47 | - format 48 | - cosine 49 | reward_weights: 50 | - 1.0 51 | - 2.0 52 | save_strategy: "steps" 53 | save_steps: 50 54 | seed: 42 55 | temperature: 0.7 56 | warmup_ratio: 0.1 57 | scale_rewards: false -------------------------------------------------------------------------------- /open-r1/recipes/OpenR1-Qwen-7B/gpg/config_v1_ds.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: models/DeepSeek-R1-Distill-Qwen-7B #deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: datas/open-rs/ #datas/OpenR1-Math-220k/ #knoveleng/open-rs 9 | system_prompt: "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer, and put your final answer within \\boxed{{}} . The reasoning process and answer are enclosed within and tags, respectively, i.e., reasoning process here answer here . Note that respond by English, NOT use other languages." 
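# Hedged note (not part of the original file): "cosine" among the reward_funcs below
# refers to open-r1's length-aware cosine-scaled reward, which (as assumed here)
# rewards correct completions more when they are shorter and penalises incorrect ones
# less when they are longer, interpolating with a cosine schedule up to the maximum
# completion length.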
10 | 11 | # GRPO trainer config 12 | bf16: true 13 | use_vllm: true 14 | vllm_device: auto 15 | vllm_enforce_eager: true 16 | vllm_gpu_memory_utilization: 0.7 17 | vllm_max_model_len: 4608 18 | do_eval: false 19 | gradient_accumulation_steps: 2 20 | gradient_checkpointing: true 21 | gradient_checkpointing_kwargs: 22 | use_reentrant: false 23 | hub_model_id: OpenRS-GRPO 24 | hub_strategy: every_save 25 | learning_rate: 1.0e-06 26 | log_completions: false 27 | log_level: info 28 | logging_first_step: true 29 | logging_steps: 1 30 | logging_strategy: steps 31 | lr_scheduler_type: constant_with_warmup 32 | max_prompt_length: 512 33 | max_completion_length: 4096 34 | num_generations: 6 35 | num_train_epochs: 1 36 | output_dir: data/OpenRS-GPG-Qwen-7B-DS-lr 37 | overwrite_output_dir: true 38 | per_device_eval_batch_size: 6 39 | per_device_train_batch_size: 2 40 | push_to_hub: false 41 | report_to: 42 | - tensorboard 43 | reward_funcs: 44 | - format_v2 45 | - cosine 46 | reward_weights: 47 | - 1.0 48 | - 2.0 49 | save_strategy: "steps" 50 | save_steps: 50 51 | seed: 42 52 | temperature: 0.7 53 | warmup_ratio: 0.1 -------------------------------------------------------------------------------- /open-r1/recipes/OpenR1-Qwen-7B/gpg/config_v2_ds.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: models/DeepSeek-R1-Distill-Qwen-7B #deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: datas/open-rs/ #datas/OpenR1-Math-220k/ #knoveleng/open-rs 9 | system_prompt: "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer, and put your final answer within \\boxed{{}} . The reasoning process and answer are enclosed within and tags, respectively, i.e., reasoning process here answer here . Note that respond by English, NOT use other languages." 
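# Hedged note (not part of the original file): with the reward_funcs and reward_weights
# listed further down, the scalar reward for each completion is the weighted sum of the
# individual reward functions, i.e. roughly
#   reward = 0.2 * format_v2 + 1.0 * accuracy
# so formatting acts as a small shaping term next to answer correctness.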
10 | 11 | # GRPO trainer config 12 | bf16: true 13 | use_vllm: false 14 | vllm_device: auto 15 | vllm_enforce_eager: true 16 | vllm_gpu_memory_utilization: 0.7 17 | vllm_max_model_len: 4608 18 | do_eval: false 19 | gradient_accumulation_steps: 1 20 | gradient_checkpointing: true 21 | gradient_checkpointing_kwargs: 22 | use_reentrant: false 23 | hub_model_id: OpenRS-GRPO 24 | hub_strategy: every_save 25 | learning_rate: 1.0e-06 26 | log_completions: false 27 | log_level: info 28 | logging_first_step: true 29 | logging_steps: 1 30 | logging_strategy: steps 31 | lr_scheduler_type: constant_with_warmup 32 | max_prompt_length: 512 33 | max_completion_length: 3584 34 | num_generations: 8 35 | num_train_epochs: 1 36 | output_dir: data/OpenRS-GPG-Qwen-7B-DS-v2 37 | overwrite_output_dir: true 38 | per_device_eval_batch_size: 6 39 | per_device_train_batch_size: 2 40 | push_to_hub: false 41 | report_to: 42 | - tensorboard 43 | reward_funcs: 44 | - format_v2 45 | - accuracy 46 | reward_weights: 47 | - 0.2 48 | - 1.0 49 | save_strategy: "steps" 50 | save_steps: 50 51 | seed: 42 52 | temperature: 0.7 53 | warmup_ratio: 0.1 -------------------------------------------------------------------------------- /open-r1/recipes/OpenR1-Qwen-7B/sft/config.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | # You need to download the model and manually change the rope to 300k and max_position_embeddings to 32768 3 | # the config file should match https://huggingface.co/open-r1/OpenR1-Qwen-7B/blob/main/config.json 4 | model_name_or_path: Qwen/Qwen2.5-Math-7B-Instruct 5 | model_revision: main 6 | torch_dtype: bfloat16 7 | attn_implementation: sdpa 8 | 9 | # Data training arguments 10 | dataset_name: open-r1/OpenR1-Math-220k 11 | dataset_num_proc: 48 12 | 13 | #SFT hyperparam 14 | max_length: 32768 15 | weight_decay: 0.0001 16 | optim: adamw_torch 17 | lr_scheduler_type: linear 18 | warmup_ratio: 0.1 19 | learning_rate: 5.0e-05 20 | gradient_accumulation_steps: 2 21 | per_device_eval_batch_size: 1 22 | per_device_train_batch_size: 1 23 | 24 | # SFT trainer config 25 | max_steps: -1 26 | num_train_epochs: 3 27 | bf16: true 28 | do_eval: false 29 | use_liger_kernel: true 30 | eval_strategy: 'no' 31 | gradient_checkpointing: true 32 | gradient_checkpointing_kwargs: 33 | use_reentrant: false 34 | hub_model_id: OpenR1-Qwen-7B-SFT 35 | hub_strategy: every_save 36 | log_level: info 37 | logging_steps: 5 38 | logging_strategy: steps 39 | packing: true 40 | output_dir: data/OpenR1-Qwen-7B-SFT 41 | overwrite_output_dir: true 42 | push_to_hub: true 43 | report_to: 44 | - wandb 45 | save_strategy: "steps" 46 | save_steps: 500 47 | save_total_limit: 1 48 | seed: 42 -------------------------------------------------------------------------------- /open-r1/recipes/OpenR1-Qwen-7B/sft/config_v0.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | # You need to download the model and manually change the rope to 300k and max_position_embeddings to 32768 3 | # the config file should match https://huggingface.co/open-r1/OpenR1-Qwen-7B/blob/main/config.json 4 | model_name_or_path: models/Qwen2.5-Math-7B-Instruct #Qwen/Qwen2.5-Math-7B-Instruct 5 | model_revision: main 6 | torch_dtype: bfloat16 7 | attn_implementation: sdpa 8 | 9 | # Data training arguments 10 | dataset_name: datas/OpenR1-Math-220k/ #open-r1/OpenR1-Math-220k 11 | dataset_num_proc: 48 12 | 13 | #SFT hyperparam 14 | max_length: 32768 15 | weight_decay: 0.0001 
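# Hedged sketch (not part of the original file): the "rope to 300k" edit mentioned in
# the header above amounts to changing two fields in the downloaded model's
# config.json before training, roughly
#   "rope_theta": 300000.0,
#   "max_position_embeddings": 32768
# so that the packed 32768-token sequences (max_length above) fit in the context window.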
16 | optim: adamw_torch 17 | lr_scheduler_type: linear 18 | warmup_ratio: 0.1 19 | learning_rate: 5.0e-05 20 | gradient_accumulation_steps: 1 21 | per_device_eval_batch_size: 1 22 | per_device_train_batch_size: 1 23 | 24 | # SFT trainer config 25 | max_steps: -1 26 | num_train_epochs: 3 27 | bf16: true 28 | do_eval: false 29 | use_liger: false 30 | use_liger_kernel: false 31 | eval_strategy: 'no' 32 | gradient_checkpointing: true 33 | gradient_checkpointing_kwargs: 34 | use_reentrant: false 35 | hub_model_id: OpenR1-Qwen-7B-SFT 36 | hub_strategy: every_save 37 | log_level: info 38 | logging_steps: 5 39 | logging_strategy: steps 40 | packing: true 41 | output_dir: data/OpenR1-Qwen-7B-SFT 42 | overwrite_output_dir: true 43 | push_to_hub: false 44 | report_to: 45 | - tensorboard 46 | save_strategy: "steps" 47 | save_steps: 500 48 | save_total_limit: 3 49 | seed: 42 -------------------------------------------------------------------------------- /open-r1/recipes/OpenR1-Qwen-7B/sft/config_v1.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | # You need to download the model and manually change the rope to 300k and max_position_embeddings to 32768 3 | # the config file should match https://huggingface.co/open-r1/OpenR1-Qwen-7B/blob/main/config.json 4 | model_name_or_path: models/Qwen2.5-Math-7B-Instruct #Qwen/Qwen2.5-Math-7B-Instruct 5 | model_revision: main 6 | torch_dtype: bfloat16 7 | attn_implementation: sdpa 8 | 9 | # Data training arguments 10 | dataset_name: datas/OpenR1-Math-220k/ #open-r1/OpenR1-Math-220k 11 | dataset_num_proc: 48 12 | 13 | #SFT hyperparam 14 | max_length: 32768 15 | weight_decay: 0.0001 16 | optim: adamw_torch 17 | lr_scheduler_type: linear 18 | warmup_ratio: 0.1 19 | learning_rate: 1e-04 20 | gradient_accumulation_steps: 1 21 | per_device_eval_batch_size: 1 22 | per_device_train_batch_size: 1 23 | 24 | # SFT trainer config 25 | max_steps: -1 26 | num_train_epochs: 3 27 | bf16: true 28 | do_eval: false 29 | use_liger: false 30 | use_liger_kernel: false 31 | eval_strategy: 'no' 32 | gradient_checkpointing: true 33 | gradient_checkpointing_kwargs: 34 | use_reentrant: false 35 | hub_model_id: OpenR1-Qwen-7B-SFT 36 | hub_strategy: every_save 37 | log_level: info 38 | logging_steps: 5 39 | logging_strategy: steps 40 | packing: true 41 | output_dir: data/OpenR1-Qwen-7B-SFT-2nodes 42 | overwrite_output_dir: true 43 | push_to_hub: false 44 | report_to: 45 | - tensorboard 46 | save_strategy: "steps" 47 | save_steps: 500 48 | save_total_limit: 3 49 | seed: 42 -------------------------------------------------------------------------------- /open-r1/recipes/OpenR1-Qwen-7B/sft/config_v2.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | # You need to download the model and manually change the rope to 300k and max_position_embeddings to 32768 3 | # the config file should match https://huggingface.co/open-r1/OpenR1-Qwen-7B/blob/main/config.json 4 | model_name_or_path: models/Qwen2.5-Math-7B-Instruct #Qwen/Qwen2.5-Math-7B-Instruct 5 | model_revision: main 6 | torch_dtype: bfloat16 7 | attn_implementation: sdpa 8 | 9 | # Data training arguments 10 | dataset_name: datas/OpenR1-Math-220k/ #open-r1/OpenR1-Math-220k 11 | dataset_num_proc: 48 12 | 13 | #SFT hyperparam 14 | max_length: 32768 15 | weight_decay: 0.0001 16 | optim: adamw_torch 17 | lr_scheduler_type: linear 18 | warmup_ratio: 0.1 19 | learning_rate: 5e-05 20 | gradient_accumulation_steps: 1 21 | 
per_device_eval_batch_size: 1 22 | per_device_train_batch_size: 1 23 | 24 | # SFT trainer config 25 | max_steps: -1 26 | num_train_epochs: 3 27 | bf16: true 28 | do_eval: false 29 | use_liger: false 30 | use_liger_kernel: false 31 | eval_strategy: 'no' 32 | gradient_checkpointing: true 33 | gradient_checkpointing_kwargs: 34 | use_reentrant: false 35 | hub_model_id: OpenR1-Qwen-7B-SFT 36 | hub_strategy: every_save 37 | log_level: info 38 | logging_steps: 5 39 | logging_strategy: steps 40 | packing: true 41 | output_dir: data/OpenR1-Qwen-7B-SFT-1nodes 42 | overwrite_output_dir: true 43 | push_to_hub: false 44 | report_to: 45 | - tensorboard 46 | save_strategy: "steps" 47 | save_steps: 500 48 | save_total_limit: 6 49 | seed: 42 -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-1.5B-Instruct/gpg/config_demo_v1.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: models/Qwen2.5-1.5B-Instruct 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: datas/OpenR1-Math-220k/ 9 | system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: \n...\n\n\n...\n" 10 | 11 | # GRPO trainer config 12 | bf16: true 13 | use_vllm: false 14 | vllm_device: auto 15 | vllm_gpu_memory_utilization: 0.7 16 | do_eval: false 17 | gradient_accumulation_steps: 1 18 | gradient_checkpointing: true 19 | gradient_checkpointing_kwargs: 20 | use_reentrant: false 21 | hub_model_id: Qwen2.5-1.5B-Open-R1-GRPO 22 | hub_strategy: every_save 23 | learning_rate: 2.0e-05 24 | log_completions: false 25 | log_level: info 26 | logging_first_step: true 27 | logging_steps: 1 28 | logging_strategy: steps 29 | lr_scheduler_type: cosine 30 | max_prompt_length: 512 31 | max_completion_length: 1024 32 | max_steps: -1 33 | # change gen to 8, so that we can train faster. 34 | num_generations: 8 35 | num_train_epochs: 1 36 | output_dir: data/Qwen2.5-1.5B-Open-R1-GPG-WSTD 37 | overwrite_output_dir: true 38 | per_device_eval_batch_size: 16 39 | per_device_train_batch_size: 16 40 | push_to_hub: false 41 | report_to: 42 | - tensorboard 43 | reward_funcs: 44 | - accuracy 45 | - format 46 | - tag_count 47 | reward_weights: 48 | - 1.0 49 | - 1.0 50 | - 1.0 51 | #save_strategy: "epoch" 52 | #save_total_limit: 1 53 | save_strategy: "steps" 54 | save_steps: 200 55 | save_total_limit: 7 56 | seed: 42 57 | warmup_ratio: 0.1 58 | -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-1.5B-Instruct/gpg/config_demo_v2.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: models/Qwen2.5-1.5B-Instruct 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: datas/OpenR1-Math-220k/ 9 | system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" 10 | 11 | # GRPO trainer config 12 | bf16: true 13 | use_vllm: false 14 | vllm_device: auto 15 | vllm_gpu_memory_utilization: 0.7 16 | do_eval: false 17 | gradient_accumulation_steps: 1 18 | gradient_checkpointing: true 19 | gradient_checkpointing_kwargs: 20 | use_reentrant: false 21 | hub_model_id: Qwen2.5-1.5B-Open-R1-GRPO 22 | hub_strategy: every_save 23 | learning_rate: 2.0e-05 24 | log_completions: false 25 | log_level: info 26 | logging_first_step: true 27 | logging_steps: 1 28 | logging_strategy: steps 29 | lr_scheduler_type: cosine 30 | max_prompt_length: 512 31 | max_completion_length: 1024 32 | max_steps: -1 33 | # change gen to 8, so that we can train faster. 34 | num_generations: 8 35 | num_train_epochs: 1 36 | output_dir: data/Qwen2.5-1.5B-Open-R1-GPG-a1f1 37 | overwrite_output_dir: true 38 | per_device_eval_batch_size: 16 39 | per_device_train_batch_size: 16 40 | push_to_hub: false 41 | report_to: 42 | - tensorboard 43 | reward_funcs: 44 | - accuracy 45 | - format 46 | reward_weights: 47 | - 1.0 48 | - 1.0 49 | #save_strategy: "epoch" 50 | #save_total_limit: 1 51 | save_strategy: "steps" 52 | save_steps: 200 53 | save_total_limit: 7 54 | seed: 42 55 | warmup_ratio: 0.1 56 | -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-1.5B-Instruct/gpg/config_demo_v3.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: models/Qwen2.5-1.5B-Instruct 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: datas/OpenR1-Math-220k/ 9 | system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: \n...\n\n\n...\n" 10 | 11 | # GRPO trainer config 12 | bf16: true 13 | use_vllm: false 14 | vllm_device: auto 15 | vllm_gpu_memory_utilization: 0.7 16 | do_eval: false 17 | gradient_accumulation_steps: 1 18 | gradient_checkpointing: true 19 | gradient_checkpointing_kwargs: 20 | use_reentrant: false 21 | hub_model_id: Qwen2.5-1.5B-Open-R1-GRPO 22 | hub_strategy: every_save 23 | learning_rate: 2.0e-05 24 | log_completions: false 25 | log_level: info 26 | logging_first_step: true 27 | logging_steps: 1 28 | logging_strategy: steps 29 | lr_scheduler_type: cosine 30 | max_prompt_length: 512 31 | max_completion_length: 1024 32 | max_steps: -1 33 | # change gen to 8, so that we can train faster. 34 | num_generations: 8 35 | num_train_epochs: 1 36 | output_dir: data/Qwen2.5-1.5B-Open-R1-GPG-v3 37 | overwrite_output_dir: true 38 | remove_unused_columns: False # avoid failure of parsing gold. 
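# Hedged note (not part of the original file): remove_unused_columns: False above keeps
# extra dataset columns (e.g. the gold solution) in the batch so they reach the reward
# functions as keyword arguments; with the default True, the Trainer would drop them and
# the accuracy reward could not parse the gold answer. The assumed reward signature is
# along the lines of
#   def accuracy_reward(completions, solution, **kwargs): ...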
39 | per_device_eval_batch_size: 16 40 | per_device_train_batch_size: 16 41 | push_to_hub: false 42 | report_to: 43 | - tensorboard 44 | reward_funcs: 45 | - accuracy 46 | - format 47 | reward_weights: 48 | - 1.0 49 | - 0.2 50 | #save_strategy: "epoch" 51 | #save_total_limit: 1 52 | save_strategy: "steps" 53 | save_steps: 200 54 | save_total_limit: 7 55 | seed: 42 56 | warmup_ratio: 0.1 57 | -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-1.5B-Instruct/gpg/config_demo_v4.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: models/Qwen2.5-1.5B-Instruct 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: datas/OpenR1-Math-220k/ 9 | system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: \n...\n\n\n...\n" 10 | 11 | # GRPO trainer config 12 | bf16: true 13 | use_vllm: false 14 | vllm_device: auto 15 | vllm_gpu_memory_utilization: 0.7 16 | do_eval: false 17 | gradient_accumulation_steps: 2 18 | gradient_checkpointing: true 19 | gradient_checkpointing_kwargs: 20 | use_reentrant: false 21 | hub_model_id: Qwen2.5-1.5B-Open-R1-GRPO 22 | hub_strategy: every_save 23 | learning_rate: 1.0e-06 24 | log_completions: false 25 | log_level: info 26 | logging_first_step: true 27 | logging_steps: 1 28 | logging_strategy: steps 29 | lr_scheduler_type: constant_with_warmup 30 | max_grad_norm: 0.2 31 | max_prompt_length: 512 32 | max_completion_length: 2048 #1024 33 | max_steps: -1 34 | # change gen to 8, so that we can train faster. 35 | num_generations: 8 36 | num_train_epochs: 1 37 | output_dir: data/Qwen2.5-1.5B-Open-R1-GPG-v4-increase-2048 38 | overwrite_output_dir: true 39 | remove_unused_columns: False # avoid failure of parsing gold. 40 | per_device_eval_batch_size: 16 41 | per_device_train_batch_size: 8 42 | push_to_hub: false 43 | report_to: 44 | - tensorboard 45 | reward_funcs: 46 | - accuracy 47 | - format 48 | reward_weights: 49 | - 1.0 50 | - 0.2 51 | #save_strategy: "epoch" 52 | #save_total_limit: 1 53 | save_strategy: "steps" 54 | save_steps: 100 55 | save_total_limit: 10 56 | seed: 42 57 | warmup_ratio: 0.1 58 | -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-1.5B-Instruct/gpg/config_demo_v5.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: models/Qwen2.5-1.5B-Instruct 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: datas/OpenR1-Math-220k/ 9 | system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" 10 | 11 | # GRPO trainer config 12 | bf16: true 13 | use_vllm: false 14 | vllm_device: auto 15 | vllm_gpu_memory_utilization: 0.7 16 | do_eval: false 17 | gradient_accumulation_steps: 1 18 | gradient_checkpointing: true 19 | gradient_checkpointing_kwargs: 20 | use_reentrant: false 21 | hub_model_id: Qwen2.5-1.5B-Open-R1-GRPO 22 | hub_strategy: every_save 23 | learning_rate: 1.0e-06 24 | log_completions: false 25 | log_level: info 26 | logging_first_step: true 27 | logging_steps: 1 28 | logging_strategy: steps 29 | lr_scheduler_type: constant_with_warmup 30 | max_grad_norm: 0.2 31 | max_prompt_length: 512 32 | max_completion_length: 1024 33 | max_steps: -1 34 | # change gen to 8, so that we can train faster. 35 | num_generations: 8 36 | num_train_epochs: 1 37 | output_dir: data/Qwen2.5-1.5B-Open-R1-GPG-v5 38 | overwrite_output_dir: true 39 | remove_unused_columns: False # avoid failure of parsing gold. 40 | per_device_eval_batch_size: 16 41 | per_device_train_batch_size: 16 42 | push_to_hub: false 43 | report_to: 44 | - tensorboard 45 | reward_funcs: 46 | - accuracy 47 | - format 48 | reward_weights: 49 | - 1.0 50 | - 0.1 51 | #save_strategy: "epoch" 52 | #save_total_limit: 1 53 | save_strategy: "steps" 54 | save_steps: 100 55 | save_total_limit: 10 56 | seed: 42 57 | warmup_ratio: 0.1 58 | -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-1.5B-Instruct/gpg/config_demo_woSTD.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: models/Qwen2.5-1.5B-Instruct 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: datas/OpenR1-Math-220k/ 9 | system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: \n...\n\n\n...\n" 10 | 11 | # GRPO trainer config 12 | bf16: true 13 | use_vllm: false 14 | vllm_device: auto 15 | vllm_gpu_memory_utilization: 0.7 16 | do_eval: false 17 | gradient_accumulation_steps: 1 18 | gradient_checkpointing: true 19 | gradient_checkpointing_kwargs: 20 | use_reentrant: false 21 | hub_model_id: Qwen2.5-1.5B-Open-R1-GRPO 22 | hub_strategy: every_save 23 | learning_rate: 1.0e-06 24 | log_completions: false 25 | log_level: info 26 | logging_first_step: true 27 | logging_steps: 1 28 | logging_strategy: steps 29 | lr_scheduler_type: constant_with_warmup 30 | max_grad_norm: 0.2 31 | max_prompt_length: 512 32 | max_completion_length: 1024 33 | max_steps: -1 34 | # change gen to 8, so that we can train faster. 35 | num_generations: 8 36 | num_train_epochs: 1 37 | output_dir: data/Qwen2.5-1.5B-Open-R1-GPG-v5-WoSTD 38 | overwrite_output_dir: true 39 | remove_unused_columns: False # avoid failure of parsing gold. 
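The -WoSTD suffix in the output directory above marks the variant that drops the per-group standard-deviation scaling of rewards; other recipes in this collection express the same choice with scale_rewards: false. A minimal sketch, not the trainer's actual code, of what that toggle changes when group advantages are formed:

import torch

def group_advantages(rewards, num_generations, scale_by_std=True):
    # rewards: flat tensor with num_prompts * num_generations entries
    grouped = rewards.view(-1, num_generations)
    advantages = grouped - grouped.mean(dim=1, keepdim=True)
    if scale_by_std:  # standard GRPO-style normalization
        advantages = advantages / (grouped.std(dim=1, keepdim=True) + 1e-4)
    return advantages.view(-1)  # the "woSTD" runs skip the division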
40 | per_device_eval_batch_size: 16 41 | per_device_train_batch_size: 16 42 | push_to_hub: false 43 | report_to: 44 | - tensorboard 45 | reward_funcs: 46 | - accuracy 47 | - format 48 | reward_weights: 49 | - 1.0 50 | - 0.1 51 | #save_strategy: "epoch" 52 | #save_total_limit: 1 53 | save_strategy: "steps" 54 | save_steps: 100 55 | save_total_limit: 10 56 | seed: 42 57 | warmup_ratio: 0.1 58 | -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-1.5B-Instruct/gpg/config_v0.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: models/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B #deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: datas/open-rs/ #datas/OpenR1-Math-220k/ #knoveleng/open-rs 9 | system_prompt: "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer, and put your final answer within \\boxed{{}} . The reasoning process and answer are enclosed within and tags, respectively, i.e., reasoning process here answer here . Note that respond by English, NOT use other languages." 10 | 11 | # GRPO trainer config 12 | bf16: true 13 | use_vllm: false 14 | vllm_device: auto 15 | vllm_enforce_eager: true 16 | vllm_gpu_memory_utilization: 0.7 17 | vllm_max_model_len: 4608 18 | do_eval: false 19 | gradient_accumulation_steps: 1 20 | gradient_checkpointing: true 21 | gradient_checkpointing_kwargs: 22 | use_reentrant: false 23 | hub_model_id: OpenRS-GRPO 24 | hub_strategy: every_save 25 | learning_rate: 1.0e-06 26 | log_completions: true 27 | log_level: info 28 | logging_first_step: true 29 | logging_steps: 1 30 | logging_strategy: steps 31 | lr_scheduler_type: cosine_with_min_lr 32 | lr_scheduler_kwargs: 33 | min_lr_rate: 0.1 34 | max_prompt_length: 512 35 | max_completion_length: 3584 36 | max_steps: 500 37 | num_generations: 6 38 | num_train_epochs: 1 39 | output_dir: data/Qwen2.5-1.5B-OpenRS-GPG-v2 40 | overwrite_output_dir: true 41 | per_device_eval_batch_size: 6 42 | per_device_train_batch_size: 6 43 | push_to_hub: false 44 | report_to: 45 | - tensorboard 46 | reward_funcs: 47 | - format_v2 48 | - cosine 49 | reward_weights: 50 | - 1.0 51 | - 2.0 52 | save_strategy: "steps" 53 | save_steps: 50 54 | seed: 42 55 | temperature: 0.7 56 | warmup_ratio: 0.1 -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-1.5B-Instruct/gpg/config_v0_nostd.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: models/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B #deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: datas/open-rs/ #datas/OpenR1-Math-220k/ #knoveleng/open-rs 9 | system_prompt: "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer, and put your final answer within \\boxed{{}} . 
The reasoning process and answer are enclosed within and tags, respectively, i.e., reasoning process here answer here . Note that respond by English, NOT use other languages." 10 | 11 | # GRPO trainer config 12 | bf16: true 13 | use_vllm: false 14 | vllm_device: auto 15 | vllm_enforce_eager: true 16 | vllm_gpu_memory_utilization: 0.7 17 | vllm_max_model_len: 4608 18 | do_eval: false 19 | gradient_accumulation_steps: 1 20 | gradient_checkpointing: true 21 | gradient_checkpointing_kwargs: 22 | use_reentrant: false 23 | hub_model_id: OpenRS-GRPO 24 | hub_strategy: every_save 25 | learning_rate: 1.0e-06 26 | log_completions: false 27 | log_level: info 28 | logging_first_step: true 29 | logging_steps: 1 30 | logging_strategy: steps 31 | lr_scheduler_type: cosine_with_min_lr 32 | lr_scheduler_kwargs: 33 | min_lr_rate: 0.1 34 | max_prompt_length: 512 35 | max_completion_length: 3584 36 | max_steps: 500 37 | num_generations: 6 38 | num_train_epochs: 1 39 | output_dir: data/Qwen2.5-1.5B-OpenRS-GPG-wostd 40 | overwrite_output_dir: true 41 | per_device_eval_batch_size: 6 42 | per_device_train_batch_size: 6 43 | push_to_hub: false 44 | report_to: 45 | - tensorboard 46 | reward_funcs: 47 | - format 48 | - cosine 49 | reward_weights: 50 | - 1.0 51 | - 2.0 52 | save_strategy: "steps" 53 | save_steps: 50 54 | seed: 42 55 | temperature: 0.7 56 | warmup_ratio: 0.1 57 | scale_rewards: false -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-1.5B-Instruct/gpg/config_v0_open22k.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: models/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B #deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: datas/OpenR1-Math-220k/ #knoveleng/open-rs 9 | system_prompt: "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer, and put your final answer within \\boxed{{}} . The reasoning process and answer are enclosed within and tags, respectively, i.e., reasoning process here answer here . Note that respond by English, NOT use other languages." 
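Several recipes in this collection list a format reward alongside accuracy. A rough sketch of such a check; the <think>/<answer> tag layout is an assumption here, and the repository's exact pattern may differ:

import re

FORMAT_RE = re.compile(r"^<think>.*?</think>\s*<answer>.*?</answer>$", re.DOTALL)

def format_reward(completions, **kwargs):
    texts = [c[0]["content"] if isinstance(c, list) else c for c in completions]
    return [1.0 if FORMAT_RE.match(t.strip()) else 0.0 for t in texts]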
10 | 11 | # GRPO trainer config 12 | bf16: true 13 | use_vllm: false 14 | vllm_device: auto 15 | vllm_enforce_eager: true 16 | vllm_gpu_memory_utilization: 0.7 17 | vllm_max_model_len: 4608 18 | do_eval: false 19 | gradient_accumulation_steps: 1 20 | gradient_checkpointing: true 21 | gradient_checkpointing_kwargs: 22 | use_reentrant: false 23 | hub_model_id: OpenRS-GRPO 24 | hub_strategy: every_save 25 | learning_rate: 1.0e-06 26 | log_completions: false 27 | log_level: info 28 | logging_first_step: true 29 | logging_steps: 1 30 | logging_strategy: steps 31 | lr_scheduler_type: cosine_with_min_lr 32 | lr_scheduler_kwargs: 33 | min_lr_rate: 0.1 34 | max_prompt_length: 512 35 | max_completion_length: 3584 36 | max_steps: 500 37 | num_generations: 6 38 | num_train_epochs: 1 39 | output_dir: data/Qwen2.5-1.5B-OpenRS-GPG-open220k 40 | overwrite_output_dir: true 41 | per_device_eval_batch_size: 6 42 | per_device_train_batch_size: 6 43 | push_to_hub: false 44 | report_to: 45 | - tensorboard 46 | reward_funcs: 47 | - format 48 | - cosine 49 | reward_weights: 50 | - 1.0 51 | - 2.0 52 | save_strategy: "steps" 53 | save_steps: 50 54 | seed: 42 55 | temperature: 0.7 56 | warmup_ratio: 0.1 -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: Qwen/Qwen2.5-1.5B-Instruct 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: open-r1/OpenR1-Math-220k 9 | system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" 10 | 11 | # GRPO trainer config 12 | bf16: true 13 | use_vllm: true 14 | do_eval: false 15 | gradient_accumulation_steps: 4 16 | gradient_checkpointing: true 17 | gradient_checkpointing_kwargs: 18 | use_reentrant: false 19 | hub_model_id: Qwen2.5-1.5B-Open-R1-GRPO 20 | hub_strategy: every_save 21 | learning_rate: 2.0e-05 22 | log_completions: true 23 | log_level: info 24 | logging_first_step: true 25 | logging_steps: 1 26 | logging_strategy: steps 27 | lr_scheduler_type: cosine 28 | max_prompt_length: 512 29 | max_completion_length: 1024 30 | max_steps: -1 31 | num_generations: 16 32 | num_train_epochs: 1 33 | output_dir: data/Qwen2.5-1.5B-Open-R1-GRPO 34 | overwrite_output_dir: true 35 | per_device_eval_batch_size: 16 36 | per_device_train_batch_size: 16 37 | push_to_hub: true 38 | report_to: 39 | - wandb 40 | reward_funcs: 41 | - accuracy 42 | - format 43 | - tag_count 44 | reward_weights: 45 | - 1.0 46 | - 1.0 47 | - 1.0 48 | save_strategy: "epoch" 49 | save_total_limit: 1 50 | seed: 42 51 | warmup_ratio: 0.1 52 | -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: Qwen/Qwen2.5-1.5B-Instruct 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: open-r1/verifiable-coding-problems-python 9 | system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: \n...\n\n\n...\n" 10 | 11 | # GRPO trainer config 12 | beta: 0.01 13 | bf16: true 14 | use_vllm: true 15 | do_eval: false 16 | gradient_accumulation_steps: 4 17 | gradient_checkpointing: true 18 | gradient_checkpointing_kwargs: 19 | use_reentrant: false 20 | hub_model_id: Qwen2.5-1.5B-Open-R1-Code-GRPO 21 | hub_strategy: every_save 22 | learning_rate: 5.0e-06 23 | log_completions: true 24 | log_level: info 25 | logging_first_step: true 26 | logging_steps: 1 27 | logging_strategy: steps 28 | lr_scheduler_type: cosine_with_min_lr 29 | lr_scheduler_kwargs: 30 | min_lr_rate: 0.1 31 | max_prompt_length: 1024 32 | max_completion_length: 2048 33 | max_steps: 500 34 | num_generations: 14 35 | num_train_epochs: 1 36 | output_dir: data/Qwen2.5-1.5B-Open-R1-Code-GRPO 37 | overwrite_output_dir: true 38 | per_device_train_batch_size: 16 39 | push_to_hub: true 40 | report_to: 41 | - wandb 42 | reward_funcs: 43 | - code 44 | - format 45 | reward_weights: 46 | - 1.0 47 | - 0.1 48 | save_strategy: "steps" 49 | save_steps: 50 50 | save_total_limit: 1 51 | seed: 42 52 | temperature: 1.0 53 | warmup_ratio: 0.03 -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code_ioi.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: Qwen/Qwen2.5-1.5B-Instruct 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: open-r1/ioi 9 | system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. 
You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: \n...\n\n\n...\n" 10 | 11 | # GRPO trainer config 12 | beta: 0.01 13 | bf16: true 14 | use_vllm: true 15 | do_eval: false 16 | gradient_accumulation_steps: 4 17 | gradient_checkpointing: true 18 | gradient_checkpointing_kwargs: 19 | use_reentrant: false 20 | hub_model_id: Qwen2.5-1.5B-Open-R1-Code-GRPO 21 | hub_strategy: every_save 22 | learning_rate: 5.0e-06 23 | log_completions: true 24 | log_level: info 25 | logging_first_step: true 26 | logging_steps: 1 27 | logging_strategy: steps 28 | lr_scheduler_type: cosine_with_min_lr 29 | lr_scheduler_kwargs: 30 | min_lr_rate: 0.1 31 | max_prompt_length: 1024 32 | max_completion_length: 2048 33 | max_steps: 500 34 | num_generations: 14 35 | num_train_epochs: 1 36 | output_dir: data/Qwen2.5-1.5B-Open-R1-Code-GRPO 37 | overwrite_output_dir: true 38 | per_device_train_batch_size: 16 39 | push_to_hub: true 40 | report_to: 41 | - wandb 42 | save_strategy: "steps" 43 | save_steps: 50 44 | save_total_limit: 1 45 | seed: 42 46 | temperature: 1.0 47 | warmup_ratio: 0.03 48 | # ioi specific config 49 | code_language: cpp 50 | reward_funcs: 51 | - ioi_code 52 | - code_format 53 | - format 54 | reward_weights: 55 | - 1.0 56 | - 0.1 57 | - 0.1 58 | # for each generation, evaluate these many test cases in parallel, then check if any of them failed (0 score): if so stop evaluating 59 | # otherwise continue with the next batch of test cases. Useful to avoid overloading the eval server + save time on wrong solutions 60 | code_eval_test_batch_size: 3 -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_v1.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: models/Qwen2.5-1.5B-Instruct 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: datas/OpenR1-Math-220k/ 9 | system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: \n...\n\n\n...\n" 10 | 11 | # GRPO trainer config 12 | bf16: true 13 | use_vllm: false 14 | vllm_device: auto 15 | vllm_gpu_memory_utilization: 0.7 16 | do_eval: false 17 | gradient_accumulation_steps: 1 18 | gradient_checkpointing: true 19 | gradient_checkpointing_kwargs: 20 | use_reentrant: false 21 | hub_model_id: Qwen2.5-1.5B-Open-R1-GRPO 22 | hub_strategy: every_save 23 | learning_rate: 2.0e-05 24 | #disable log complete 25 | log_completions: false 26 | log_level: info 27 | logging_first_step: true 28 | logging_steps: 1 29 | logging_strategy: steps 30 | lr_scheduler_type: cosine 31 | max_prompt_length: 512 32 | max_completion_length: 1024 33 | max_steps: -1 34 | # change gen to 8, so that we can train faster. 
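The comment above trades samples per prompt for speed. Back-of-the-envelope arithmetic, assuming a single 8-GPU node (the recipe itself does not state the world size): each group of num_generations completions in the global batch shares one prompt, so halving num_generations roughly doubles the number of distinct prompts seen per optimizer step.

# Assumed launch: 8 processes, other values taken from this recipe.
per_device_train_batch_size = 16
gradient_accumulation_steps = 1
world_size = 8
num_generations = 8

global_batch = per_device_train_batch_size * gradient_accumulation_steps * world_size  # 128 completions
assert global_batch % num_generations == 0
prompts_per_step = global_batch // num_generations
print(prompts_per_step)  # 16 distinct prompts, 8 sampled completions each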
35 | num_generations: 8 36 | num_train_epochs: 1 37 | output_dir: data/Qwen2.5-1.5B-Open-R1-GRPO 38 | overwrite_output_dir: true 39 | per_device_eval_batch_size: 16 40 | per_device_train_batch_size: 16 41 | push_to_hub: false 42 | report_to: 43 | - tensorboard 44 | reward_funcs: 45 | - accuracy 46 | - format 47 | - tag_count 48 | reward_weights: 49 | - 1.0 50 | - 1.0 51 | - 1.0 52 | #save_strategy: "epoch" 53 | #save_total_limit: 1 54 | save_strategy: "steps" 55 | save_steps: 200 56 | save_total_limit: 7 57 | seed: 42 58 | warmup_ratio: 0.1 59 | -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_v3.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: models/Qwen2.5-1.5B-Instruct 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: datas/OpenR1-Math-220k/ 9 | system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: \n...\n\n\n...\n" 10 | 11 | # GRPO trainer config 12 | bf16: true 13 | use_vllm: false 14 | vllm_device: auto 15 | vllm_gpu_memory_utilization: 0.7 16 | do_eval: false 17 | gradient_accumulation_steps: 1 18 | gradient_checkpointing: true 19 | gradient_checkpointing_kwargs: 20 | use_reentrant: false 21 | hub_model_id: Qwen2.5-1.5B-Open-R1-GRPO 22 | hub_strategy: every_save 23 | learning_rate: 2.0e-05 24 | #disable log complete 25 | log_completions: false 26 | log_level: info 27 | logging_first_step: true 28 | logging_steps: 1 29 | logging_strategy: steps 30 | lr_scheduler_type: cosine 31 | max_prompt_length: 512 32 | max_completion_length: 1024 33 | max_steps: -1 34 | # change gen to 8, so that we can train faster. 35 | num_generations: 8 36 | num_train_epochs: 1 37 | output_dir: data/Qwen2.5-1.5B-Open-R1-GRPO-v3 38 | overwrite_output_dir: true 39 | remove_unused_columns: False # avoid failure of parsing gold. 
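As far as these configs show, the scalar reward each completion receives is a weighted sum of the listed reward functions; this v3 recipe puts weight 1.0 on accuracy and 0.2 on format (see the weights below). A small sketch of that combination:

def total_reward(per_func_rewards, weights):
    # per_func_rewards: one list of rewards per reward function, aligned by completion
    return [sum(w * r for w, r in zip(weights, rewards_at_i))
            for rewards_at_i in zip(*per_func_rewards)]

accuracy = [1.0, 0.0, 1.0]
fmt = [1.0, 1.0, 0.0]
print(total_reward([accuracy, fmt], weights=[1.0, 0.2]))  # [1.2, 0.2, 1.0]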
40 | per_device_eval_batch_size: 16 41 | per_device_train_batch_size: 16 42 | push_to_hub: false 43 | report_to: 44 | - tensorboard 45 | reward_funcs: 46 | - accuracy 47 | - format 48 | reward_weights: 49 | - 1.0 50 | - 0.2 51 | #save_strategy: "epoch" 52 | #save_total_limit: 1 53 | save_strategy: "steps" 54 | save_steps: 200 55 | save_total_limit: 7 56 | seed: 42 57 | warmup_ratio: 0.1 58 | -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-1.5B-Instruct/sft/config_demo.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: Qwen/Qwen2.5-1.5B-Instruct 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: open-r1/OpenR1-Math-220k 9 | dataset_num_proc: 48 10 | 11 | # SFT trainer config 12 | bf16: true 13 | do_eval: false 14 | eval_strategy: 'no' 15 | gradient_accumulation_steps: 1 16 | gradient_checkpointing: true 17 | gradient_checkpointing_kwargs: 18 | use_reentrant: false 19 | hub_model_id: Qwen2.5-1.5B-Open-R1-Distill 20 | hub_strategy: every_save 21 | learning_rate: 5.0e-05 22 | log_level: info 23 | logging_steps: 5 24 | logging_strategy: steps 25 | lr_scheduler_type: cosine_with_min_lr 26 | lr_scheduler_kwargs: 27 | min_lr_rate: 0.1 28 | packing: true 29 | max_length: 16384 30 | max_steps: -1 31 | num_train_epochs: 1 32 | output_dir: data/Qwen2.5-1.5B-Open-R1-Distill 33 | overwrite_output_dir: true 34 | per_device_eval_batch_size: 16 35 | per_device_train_batch_size: 16 36 | push_to_hub: true 37 | report_to: 38 | - wandb 39 | save_strategy: "steps" 40 | save_steps: 100 41 | save_total_limit: 1 42 | seed: 42 43 | use_liger: true 44 | warmup_ratio: 0.05 -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-1.5B-Instruct/sft/config_demo_v1.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: models/Qwen2.5-1.5B-Instruct 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: datas/OpenR1-Math-220k/ 9 | dataset_num_proc: 48 10 | 11 | # SFT trainer config 12 | bf16: true 13 | do_eval: false 14 | eval_strategy: 'no' 15 | #eval_steps: 200 16 | 17 | gradient_accumulation_steps: 2 # 1 18 | gradient_checkpointing: true 19 | gradient_checkpointing_kwargs: 20 | use_reentrant: false 21 | hub_model_id: Qwen2.5-1.5B-Open-R1-Distill 22 | hub_strategy: every_save 23 | learning_rate: 5.0e-05 24 | log_level: info 25 | logging_steps: 5 26 | logging_strategy: steps 27 | lr_scheduler_type: cosine_with_min_lr 28 | lr_scheduler_kwargs: 29 | min_lr_rate: 0.1 30 | packing: true 31 | max_length: 16384 32 | max_steps: -1 33 | num_train_epochs: 1 34 | output_dir: data/Qwen2.5-1.5B-Open-SFT 35 | overwrite_output_dir: true 36 | per_device_eval_batch_size: 8 #16 37 | per_device_train_batch_size: 2 # default 16 38 | push_to_hub: false 39 | report_to: 40 | - tensorboard 41 | save_strategy: "steps" 42 | save_steps: 100 43 | save_total_limit: 5 44 | seed: 42 45 | use_liger: false #true 46 | warmup_ratio: 0.05 -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-1.5B-Instruct/sft/config_v1.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | # You need to download the 
model and manually change the rope to 300k and max_position_embeddings to 32768 3 | # the config file should match https://huggingface.co/open-r1/OpenR1-Qwen-7B/blob/main/config.json 4 | model_name_or_path: models/Qwen2.5-1.5B-Instruct #models/Qwen2.5-Math-7B-Instruct #Qwen/Qwen2.5-Math-7B-Instruct 5 | model_revision: main 6 | torch_dtype: bfloat16 7 | attn_implementation: flash_attention_2 8 | 9 | # Data training arguments 10 | dataset_name: datas/OpenR1-Math-220k/ #open-r1/OpenR1-Math-220k 11 | dataset_num_proc: 48 12 | 13 | #SFT hyperparam 14 | max_length: 32768 15 | weight_decay: 0.0001 16 | optim: adamw_torch 17 | lr_scheduler_type: linear 18 | warmup_ratio: 0.1 19 | learning_rate: 1e-04 20 | gradient_accumulation_steps: 1 21 | per_device_eval_batch_size: 1 22 | per_device_train_batch_size: 1 23 | 24 | # SFT trainer config 25 | max_steps: -1 26 | num_train_epochs: 3 27 | bf16: true 28 | do_eval: false 29 | use_liger: false 30 | use_liger_kernel: false 31 | eval_strategy: 'no' 32 | gradient_checkpointing: true 33 | gradient_checkpointing_kwargs: 34 | use_reentrant: false 35 | hub_model_id: OpenR1-Qwen-1.5B-SFT 36 | hub_strategy: every_save 37 | log_level: info 38 | logging_steps: 5 39 | logging_strategy: steps 40 | packing: true 41 | output_dir: data/OpenR1-Qwen-1.5B-SFT 42 | overwrite_output_dir: true 43 | push_to_hub: false 44 | report_to: 45 | - tensorboard 46 | save_strategy: "steps" 47 | save_steps: 500 48 | save_total_limit: 4 49 | seed: 42 -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-7B-Instruct/grpo/config_demo.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: Qwen/Qwen2.5-7B-Instruct 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: open-r1/OpenR1-Math-cn_k12-86k 9 | system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" 10 | 11 | # GRPO trainer config 12 | beta: 0.001 13 | bf16: true 14 | do_eval: false 15 | eval_strategy: "no" 16 | use_vllm: true 17 | do_eval: false 18 | gradient_accumulation_steps: 16 19 | gradient_checkpointing: true 20 | gradient_checkpointing_kwargs: 21 | use_reentrant: false 22 | hub_model_id: Qwen2.5-7B-Instruct-GRPO 23 | hub_strategy: every_save 24 | learning_rate: 1.0e-06 25 | log_completions: true 26 | log_level: info 27 | logging_first_step: true 28 | logging_steps: 1 29 | logging_strategy: steps 30 | lr_scheduler_type: constant_with_warmup 31 | max_grad_norm: 0.2 32 | max_prompt_length: 1024 33 | max_completion_length: 4096 34 | max_steps: -1 35 | num_generations: 16 36 | num_train_epochs: 1 37 | output_dir: data/Qwen2.5-7B-Instruct-GRPO 38 | overwrite_output_dir: true 39 | per_device_train_batch_size: 4 40 | push_to_hub: true 41 | report_to: 42 | - wandb 43 | reward_funcs: 44 | - accuracy 45 | - format 46 | reward_weights: 47 | - 1.0 48 | - 0.2 49 | save_strategy: "steps" 50 | save_steps: 0.1 51 | save_total_limit: 1 52 | seed: 42 53 | temperature: 0.7 54 | warmup_ratio: 0.1 -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-7B-Instruct/grpo/config_demo_v1.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: Qwen/Qwen2.5-7B-Instruct 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: open-r1/OpenR1-Math-cn_k12-86k 9 | system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" 10 | 11 | # GRPO trainer config 12 | beta: 0.001 13 | bf16: true 14 | do_eval: false 15 | eval_strategy: "no" 16 | use_vllm: true 17 | do_eval: false 18 | gradient_accumulation_steps: 16 19 | gradient_checkpointing: true 20 | gradient_checkpointing_kwargs: 21 | use_reentrant: false 22 | hub_model_id: Qwen2.5-7B-Instruct-GRPO 23 | hub_strategy: every_save 24 | learning_rate: 1.0e-06 25 | log_completions: true 26 | log_level: info 27 | logging_first_step: true 28 | logging_steps: 1 29 | logging_strategy: steps 30 | lr_scheduler_type: constant_with_warmup 31 | max_grad_norm: 0.2 32 | max_prompt_length: 1024 33 | max_completion_length: 4096 34 | max_steps: -1 35 | num_generations: 16 36 | num_train_epochs: 1 37 | output_dir: data/Qwen2.5-7B-Instruct-GRPO 38 | overwrite_output_dir: true 39 | per_device_train_batch_size: 4 40 | push_to_hub: true 41 | report_to: 42 | - wandb 43 | reward_funcs: 44 | - accuracy 45 | - format 46 | reward_weights: 47 | - 1.0 48 | - 0.2 49 | save_strategy: "steps" 50 | save_steps: 0.1 51 | save_total_limit: 1 52 | seed: 42 53 | temperature: 0.7 54 | warmup_ratio: 0.1 -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | #model_name_or_path: Qwen/Qwen2.5-Math-7B 3 | model_name_or_path: models/Qwen2.5-Math-7B 4 | 5 | model_revision: main 6 | torch_dtype: bfloat16 7 | attn_implementation: flash_attention_2 8 | 9 | # Data training arguments 10 | #dataset_name: DigitalLearningGmbH/MATH-lighteval 11 | dataset_name: "datas/MATH-lighteval" 12 | dataset_config: default 13 | system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags." 
14 | 15 | # GRPO trainer config 16 | bf16: true 17 | use_vllm: false 18 | do_eval: true 19 | eval_strategy: steps 20 | eval_steps: 100 21 | gradient_accumulation_steps: 4 22 | gradient_checkpointing: true 23 | gradient_checkpointing_kwargs: 24 | use_reentrant: false 25 | hub_model_id: Qwen-2.5-7B-Simple-RL 26 | hub_strategy: every_save 27 | learning_rate: 3.0e-06 28 | log_completions: false 29 | log_level: info 30 | logging_first_step: true 31 | logging_steps: 5 32 | logging_strategy: steps 33 | lr_scheduler_type: cosine 34 | max_prompt_length: 512 35 | max_completion_length: 1024 36 | max_steps: -1 37 | num_generations: 8 38 | num_train_epochs: 1 39 | output_dir: data/Qwen-2.5-7B-Simple-RL-TRPO 40 | overwrite_output_dir: true 41 | per_device_eval_batch_size: 16 42 | per_device_train_batch_size: 16 43 | push_to_hub: false 44 | report_to: 45 | - tensorboard 46 | reward_funcs: 47 | - accuracy 48 | - format 49 | reward_weights: 50 | - 1.0 51 | - 1.0 52 | save_strategy: "no" 53 | seed: 42 54 | warmup_ratio: 0.1 -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl_dgrpo.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | #model_name_or_path: Qwen/Qwen2.5-Math-7B 3 | model_name_or_path: models/Qwen2.5-Math-7B 4 | 5 | model_revision: main 6 | torch_dtype: bfloat16 7 | attn_implementation: flash_attention_2 8 | 9 | # Data training arguments 10 | #dataset_name: DigitalLearningGmbH/MATH-lighteval 11 | dataset_name: "datas/MATH-lighteval" 12 | dataset_config: default 13 | system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags." 
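This config_simple_rl_dgrpo.yaml recipe sets beta: 0.0 (no KL penalty against the reference model) and, further down, scale_rewards: false. A per-token loss sketch under the usual clipped GRPO-style objective, written out here as an assumed form rather than code copied from the trainer:

import torch

def per_token_loss(ratio, advantages, ref_kl, beta=0.0, eps=0.2):
    # ratio: new/old policy probability ratio per token; ref_kl: per-token KL to the reference model
    unclipped = ratio * advantages
    clipped = torch.clamp(ratio, 1 - eps, 1 + eps) * advantages
    loss = -torch.minimum(unclipped, clipped)
    if beta > 0:  # beta: 0.0 in this recipe drops the KL term entirely
        loss = loss + beta * ref_kl
    return loss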
14 | 15 | # GRPO trainer config 16 | beta: 0.0 17 | bf16: true 18 | use_vllm: false 19 | do_eval: true 20 | eval_strategy: steps 21 | eval_steps: 100 22 | gradient_accumulation_steps: 4 23 | gradient_checkpointing: true 24 | gradient_checkpointing_kwargs: 25 | use_reentrant: false 26 | hub_model_id: Qwen-2.5-7B-Simple-RL 27 | hub_strategy: every_save 28 | learning_rate: 3.0e-06 29 | log_completions: false 30 | log_level: info 31 | logging_first_step: true 32 | logging_steps: 5 33 | logging_strategy: steps 34 | lr_scheduler_type: cosine 35 | max_prompt_length: 512 36 | max_completion_length: 1024 37 | max_steps: -1 38 | num_generations: 8 39 | num_train_epochs: 1 40 | output_dir: data/Qwen-2.5-7B-Simple-RL-DrTRPO 41 | overwrite_output_dir: true 42 | per_device_eval_batch_size: 16 43 | per_device_train_batch_size: 16 44 | push_to_hub: false 45 | report_to: 46 | - tensorboard 47 | reward_funcs: 48 | - accuracy 49 | - format 50 | reward_weights: 51 | - 1.0 52 | - 1.0 53 | save_strategy: "no" 54 | seed: 42 55 | warmup_ratio: 0.1 56 | scale_rewards: false -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl_gpg.yaml: -------------------------------------------------------------------------------- 1 | 2 | # Model arguments 3 | model_name_or_path: models/Qwen2.5-Math-7B 4 | #model_name_or_path: Qwen/Qwen2.5-Math-7B 5 | model_revision: main 6 | torch_dtype: bfloat16 7 | attn_implementation: flash_attention_2 8 | 9 | # Data training arguments 10 | #dataset_name: DigitalLearningGmbH/MATH-lighteval 11 | dataset_name: "datas/MATH-lighteval" 12 | dataset_config: default 13 | #system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags." 14 | system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. Please reason step by step, and put your final answer within \\boxed{}." 
15 | 16 | #"<|im_start|>system\nPlease reason step by step, and put your final answer within \\boxed{}.<|im_end|>\n" 17 | # GRPO trainer config 18 | beta: 0.0 19 | bf16: true 20 | use_vllm: false 21 | vllm_device: auto 22 | vllm_gpu_memory_utilization: 0.7 23 | do_eval: true 24 | eval_strategy: epoch # steps 25 | #eval_steps: 100 26 | gradient_accumulation_steps: 4 27 | gradient_checkpointing: true 28 | gradient_checkpointing_kwargs: 29 | use_reentrant: false 30 | hub_model_id: Qwen-2.5-7B-Simple-RL 31 | hub_strategy: every_save 32 | learning_rate: 2.0e-06 33 | log_completions: false 34 | log_level: info 35 | logging_first_step: true 36 | logging_steps: 1 37 | logging_strategy: steps 38 | lr_scheduler_type: constant_with_warmup 39 | max_prompt_length: 512 40 | max_completion_length: 1024 41 | max_steps: -1 42 | num_generations: 8 43 | num_train_epochs: 1 44 | output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-E1 45 | overwrite_output_dir: true 46 | per_device_eval_batch_size: 16 47 | per_device_train_batch_size: 16 48 | push_to_hub: false 49 | report_to: 50 | - tensorboard 51 | reward_funcs: 52 | - accuracy 53 | reward_weights: 54 | - 1.0 55 | save_strategy: epoch 56 | save_total_limit: 1 57 | seed: 42 58 | warmup_ratio: 0.1 59 | -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl_gpg_3k.yaml: -------------------------------------------------------------------------------- 1 | 2 | # Model arguments 3 | model_name_or_path: models/Qwen2.5-Math-7B 4 | #model_name_or_path: Qwen/Qwen2.5-Math-7B 5 | model_revision: main 6 | torch_dtype: bfloat16 7 | attn_implementation: flash_attention_2 8 | 9 | # Data training arguments 10 | #dataset_name: DigitalLearningGmbH/MATH-lighteval 11 | dataset_name: "datas/MATH-lighteval" 12 | dataset_config: default 13 | system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags." 
14 | 15 | # GRPO trainer config 16 | beta: 0.0 17 | bf16: true 18 | use_vllm: false 19 | vllm_device: auto 20 | vllm_gpu_memory_utilization: 0.7 21 | do_eval: true 22 | eval_strategy: steps 23 | eval_steps: 100 24 | gradient_accumulation_steps: 4 25 | gradient_checkpointing: true 26 | gradient_checkpointing_kwargs: 27 | use_reentrant: false 28 | hub_model_id: Qwen-2.5-7B-Simple-RL 29 | hub_strategy: every_save 30 | learning_rate: 3.0e-06 31 | log_completions: false 32 | log_level: info 33 | logging_first_step: true 34 | logging_steps: 5 35 | logging_strategy: steps 36 | lr_scheduler_type: cosine 37 | max_prompt_length: 512 38 | max_completion_length: 3804 39 | max_steps: -1 40 | num_generations: 8 41 | num_train_epochs: 1 42 | output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-3804 43 | overwrite_output_dir: true 44 | per_device_eval_batch_size: 16 45 | per_device_train_batch_size: 2 46 | push_to_hub: false 47 | report_to: 48 | - tensorboard 49 | reward_funcs: 50 | - accuracy 51 | - format 52 | reward_weights: 53 | - 1.0 54 | - 1.0 55 | save_strategy: "no" 56 | seed: 42 57 | warmup_ratio: 0.1 58 | -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl_gpg_3k_2nodes.yaml: -------------------------------------------------------------------------------- 1 | 2 | # Model arguments 3 | model_name_or_path: models/Qwen2.5-Math-7B 4 | #model_name_or_path: Qwen/Qwen2.5-Math-7B 5 | model_revision: main 6 | torch_dtype: bfloat16 7 | attn_implementation: flash_attention_2 8 | 9 | # Data training arguments 10 | #dataset_name: DigitalLearningGmbH/MATH-lighteval 11 | dataset_name: "datas/MATH-lighteval" 12 | dataset_config: default 13 | system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags." 
14 | 15 | # GRPO trainer config 16 | beta: 0.0 17 | bf16: true 18 | use_vllm: false 19 | vllm_device: auto 20 | vllm_gpu_memory_utilization: 0.7 21 | do_eval: true 22 | eval_strategy: steps 23 | eval_steps: 100 24 | gradient_accumulation_steps: 2 25 | gradient_checkpointing: true 26 | gradient_checkpointing_kwargs: 27 | use_reentrant: false 28 | hub_model_id: Qwen-2.5-7B-Simple-RL 29 | hub_strategy: every_save 30 | learning_rate: 3.0e-06 31 | log_completions: false 32 | log_level: info 33 | logging_first_step: true 34 | logging_steps: 5 35 | logging_strategy: steps 36 | lr_scheduler_type: cosine 37 | max_prompt_length: 512 38 | max_completion_length: 3804 39 | max_steps: -1 40 | num_generations: 8 41 | num_train_epochs: 1 42 | output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-3804-2nodes 43 | overwrite_output_dir: true 44 | per_device_eval_batch_size: 16 45 | per_device_train_batch_size: 2 46 | push_to_hub: false 47 | report_to: 48 | - tensorboard 49 | reward_funcs: 50 | - accuracy 51 | - format 52 | reward_weights: 53 | - 1.0 54 | - 1.0 55 | save_strategy: "no" 56 | seed: 42 57 | warmup_ratio: 0.1 58 | -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl_gpg_n16.yaml: -------------------------------------------------------------------------------- 1 | 2 | # Model arguments 3 | model_name_or_path: models/Qwen2.5-Math-7B 4 | #model_name_or_path: Qwen/Qwen2.5-Math-7B 5 | model_revision: main 6 | torch_dtype: bfloat16 7 | attn_implementation: flash_attention_2 8 | 9 | # Data training arguments 10 | #dataset_name: DigitalLearningGmbH/MATH-lighteval 11 | dataset_name: "datas/MATH-lighteval" 12 | dataset_config: default 13 | system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags." 
14 | 15 | # GRPO trainer config 16 | beta: 0.0 17 | bf16: true 18 | use_vllm: false 19 | vllm_device: auto 20 | vllm_gpu_memory_utilization: 0.7 21 | do_eval: true 22 | eval_strategy: steps 23 | eval_steps: 100 24 | gradient_accumulation_steps: 4 25 | gradient_checkpointing: true 26 | gradient_checkpointing_kwargs: 27 | use_reentrant: false 28 | hub_model_id: Qwen-2.5-7B-Simple-RL 29 | hub_strategy: every_save 30 | learning_rate: 3.0e-06 31 | log_completions: false 32 | log_level: info 33 | logging_first_step: true 34 | logging_steps: 5 35 | logging_strategy: steps 36 | lr_scheduler_type: cosine 37 | max_prompt_length: 512 38 | max_completion_length: 1024 39 | max_steps: -1 40 | num_generations: 16 41 | num_train_epochs: 1 42 | output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-n16 43 | overwrite_output_dir: true 44 | per_device_eval_batch_size: 16 45 | per_device_train_batch_size: 16 46 | push_to_hub: false 47 | report_to: 48 | - tensorboard 49 | reward_funcs: 50 | - accuracy 51 | - format 52 | reward_weights: 53 | - 1.0 54 | - 1.0 55 | save_strategy: "no" 56 | seed: 42 57 | warmup_ratio: 0.1 58 | -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl_gpg_n16_wostd.yaml: -------------------------------------------------------------------------------- 1 | 2 | # Model arguments 3 | model_name_or_path: models/Qwen2.5-Math-7B 4 | #model_name_or_path: Qwen/Qwen2.5-Math-7B 5 | model_revision: main 6 | torch_dtype: bfloat16 7 | attn_implementation: flash_attention_2 8 | 9 | # Data training arguments 10 | #dataset_name: DigitalLearningGmbH/MATH-lighteval 11 | dataset_name: "datas/MATH-lighteval" 12 | dataset_config: default 13 | system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags." 
14 | 15 | # GRPO trainer config 16 | beta: 0.0 17 | bf16: true 18 | use_vllm: false 19 | vllm_device: auto 20 | vllm_gpu_memory_utilization: 0.7 21 | do_eval: true 22 | eval_strategy: steps 23 | eval_steps: 100 24 | gradient_accumulation_steps: 4 25 | gradient_checkpointing: true 26 | gradient_checkpointing_kwargs: 27 | use_reentrant: false 28 | hub_model_id: Qwen-2.5-7B-Simple-RL 29 | hub_strategy: every_save 30 | learning_rate: 3.0e-06 31 | log_completions: false 32 | log_level: info 33 | logging_first_step: true 34 | logging_steps: 5 35 | logging_strategy: steps 36 | lr_scheduler_type: cosine 37 | max_prompt_length: 512 38 | max_completion_length: 1024 39 | max_steps: -1 40 | num_generations: 16 41 | num_train_epochs: 1 42 | output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-n16-wostd 43 | overwrite_output_dir: true 44 | per_device_eval_batch_size: 16 45 | per_device_train_batch_size: 16 46 | push_to_hub: false 47 | report_to: 48 | - tensorboard 49 | reward_funcs: 50 | - accuracy 51 | - format 52 | reward_weights: 53 | - 1.0 54 | - 1.0 55 | save_strategy: "no" 56 | seed: 42 57 | warmup_ratio: 0.1 58 | scale_rewards: false -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl_gpg_n2.yaml: -------------------------------------------------------------------------------- 1 | 2 | # Model arguments 3 | model_name_or_path: models/Qwen2.5-Math-7B 4 | #model_name_or_path: Qwen/Qwen2.5-Math-7B 5 | model_revision: main 6 | torch_dtype: bfloat16 7 | attn_implementation: flash_attention_2 8 | 9 | # Data training arguments 10 | #dataset_name: DigitalLearningGmbH/MATH-lighteval 11 | dataset_name: "datas/MATH-lighteval" 12 | dataset_config: default 13 | system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags." 
14 | 15 | # GRPO trainer config 16 | beta: 0.0 17 | bf16: true 18 | use_vllm: false 19 | vllm_device: auto 20 | vllm_gpu_memory_utilization: 0.7 21 | do_eval: true 22 | eval_strategy: steps 23 | eval_steps: 100 24 | gradient_accumulation_steps: 4 25 | gradient_checkpointing: true 26 | gradient_checkpointing_kwargs: 27 | use_reentrant: false 28 | hub_model_id: Qwen-2.5-7B-Simple-RL 29 | hub_strategy: every_save 30 | learning_rate: 3.0e-06 31 | log_completions: false 32 | log_level: info 33 | logging_first_step: true 34 | logging_steps: 5 35 | logging_strategy: steps 36 | lr_scheduler_type: cosine 37 | max_prompt_length: 512 38 | max_completion_length: 1024 39 | max_steps: -1 40 | num_generations: 2 41 | num_train_epochs: 1 42 | output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-n2 43 | overwrite_output_dir: true 44 | per_device_eval_batch_size: 16 45 | per_device_train_batch_size: 16 46 | push_to_hub: false 47 | report_to: 48 | - tensorboard 49 | reward_funcs: 50 | - accuracy 51 | - format 52 | reward_weights: 53 | - 1.0 54 | - 1.0 55 | save_strategy: "no" 56 | seed: 42 57 | warmup_ratio: 0.1 58 | -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl_gpg_n2_wostd.yaml: -------------------------------------------------------------------------------- 1 | 2 | # Model arguments 3 | model_name_or_path: models/Qwen2.5-Math-7B 4 | #model_name_or_path: Qwen/Qwen2.5-Math-7B 5 | model_revision: main 6 | torch_dtype: bfloat16 7 | attn_implementation: flash_attention_2 8 | 9 | # Data training arguments 10 | #dataset_name: DigitalLearningGmbH/MATH-lighteval 11 | dataset_name: "datas/MATH-lighteval" 12 | dataset_config: default 13 | system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags." 
14 | 15 | # GRPO trainer config 16 | beta: 0.0 17 | bf16: true 18 | use_vllm: false 19 | vllm_device: auto 20 | vllm_gpu_memory_utilization: 0.7 21 | do_eval: true 22 | eval_strategy: steps 23 | eval_steps: 100 24 | gradient_accumulation_steps: 4 25 | gradient_checkpointing: true 26 | gradient_checkpointing_kwargs: 27 | use_reentrant: false 28 | hub_model_id: Qwen-2.5-7B-Simple-RL 29 | hub_strategy: every_save 30 | learning_rate: 3.0e-06 31 | log_completions: false 32 | log_level: info 33 | logging_first_step: true 34 | logging_steps: 5 35 | logging_strategy: steps 36 | lr_scheduler_type: cosine 37 | max_prompt_length: 512 38 | max_completion_length: 1024 39 | max_steps: -1 40 | num_generations: 2 41 | num_train_epochs: 1 42 | output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-n2-wostd 43 | overwrite_output_dir: true 44 | per_device_eval_batch_size: 16 45 | per_device_train_batch_size: 16 46 | push_to_hub: false 47 | report_to: 48 | - tensorboard 49 | reward_funcs: 50 | - accuracy 51 | - format 52 | reward_weights: 53 | - 1.0 54 | - 1.0 55 | save_strategy: "no" 56 | seed: 42 57 | warmup_ratio: 0.1 58 | scale_rewards: false -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl_gpg_n4.yaml: -------------------------------------------------------------------------------- 1 | 2 | # Model arguments 3 | model_name_or_path: models/Qwen2.5-Math-7B 4 | #model_name_or_path: Qwen/Qwen2.5-Math-7B 5 | model_revision: main 6 | torch_dtype: bfloat16 7 | attn_implementation: flash_attention_2 8 | 9 | # Data training arguments 10 | #dataset_name: DigitalLearningGmbH/MATH-lighteval 11 | dataset_name: "datas/MATH-lighteval" 12 | dataset_config: default 13 | system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags." 
14 | 15 | # GRPO trainer config 16 | beta: 0.0 17 | bf16: true 18 | use_vllm: false 19 | vllm_device: auto 20 | vllm_gpu_memory_utilization: 0.7 21 | do_eval: true 22 | eval_strategy: steps 23 | eval_steps: 100 24 | gradient_accumulation_steps: 4 25 | gradient_checkpointing: true 26 | gradient_checkpointing_kwargs: 27 | use_reentrant: false 28 | hub_model_id: Qwen-2.5-7B-Simple-RL 29 | hub_strategy: every_save 30 | learning_rate: 3.0e-06 31 | log_completions: false 32 | log_level: info 33 | logging_first_step: true 34 | logging_steps: 5 35 | logging_strategy: steps 36 | lr_scheduler_type: cosine 37 | max_prompt_length: 512 38 | max_completion_length: 1024 39 | max_steps: -1 40 | num_generations: 4 41 | num_train_epochs: 1 42 | output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-n4 43 | overwrite_output_dir: true 44 | per_device_eval_batch_size: 16 45 | per_device_train_batch_size: 16 46 | push_to_hub: false 47 | report_to: 48 | - tensorboard 49 | reward_funcs: 50 | - accuracy 51 | - format 52 | reward_weights: 53 | - 1.0 54 | - 1.0 55 | save_strategy: "no" 56 | seed: 42 57 | warmup_ratio: 0.1 58 | -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl_gpg_n4_wostd.yaml: -------------------------------------------------------------------------------- 1 | 2 | # Model arguments 3 | model_name_or_path: models/Qwen2.5-Math-7B 4 | #model_name_or_path: Qwen/Qwen2.5-Math-7B 5 | model_revision: main 6 | torch_dtype: bfloat16 7 | attn_implementation: flash_attention_2 8 | 9 | # Data training arguments 10 | #dataset_name: DigitalLearningGmbH/MATH-lighteval 11 | dataset_name: "datas/MATH-lighteval" 12 | dataset_config: default 13 | system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags." 
14 | 15 | # GRPO trainer config 16 | beta: 0.0 17 | bf16: true 18 | use_vllm: false 19 | vllm_device: auto 20 | vllm_gpu_memory_utilization: 0.7 21 | do_eval: true 22 | eval_strategy: steps 23 | eval_steps: 100 24 | gradient_accumulation_steps: 4 25 | gradient_checkpointing: true 26 | gradient_checkpointing_kwargs: 27 | use_reentrant: false 28 | hub_model_id: Qwen-2.5-7B-Simple-RL 29 | hub_strategy: every_save 30 | learning_rate: 3.0e-06 31 | log_completions: false 32 | log_level: info 33 | logging_first_step: true 34 | logging_steps: 5 35 | logging_strategy: steps 36 | lr_scheduler_type: cosine 37 | max_prompt_length: 512 38 | max_completion_length: 1024 39 | max_steps: -1 40 | num_generations: 4 41 | num_train_epochs: 1 42 | output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-n4-wostd 43 | overwrite_output_dir: true 44 | per_device_eval_batch_size: 16 45 | per_device_train_batch_size: 16 46 | push_to_hub: false 47 | report_to: 48 | - tensorboard 49 | reward_funcs: 50 | - accuracy 51 | - format 52 | reward_weights: 53 | - 1.0 54 | - 1.0 55 | save_strategy: "no" 56 | seed: 42 57 | warmup_ratio: 0.1 58 | scale_rewards: false -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl_gpg_n8.yaml: -------------------------------------------------------------------------------- 1 | 2 | # Model arguments 3 | model_name_or_path: models/Qwen2.5-Math-7B 4 | #model_name_or_path: Qwen/Qwen2.5-Math-7B 5 | model_revision: main 6 | torch_dtype: bfloat16 7 | attn_implementation: flash_attention_2 8 | 9 | # Data training arguments 10 | #dataset_name: DigitalLearningGmbH/MATH-lighteval 11 | dataset_name: "datas/MATH-lighteval" 12 | dataset_config: default 13 | system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags." 
14 | 15 | # GRPO trainer config 16 | beta: 0.0 17 | bf16: true 18 | use_vllm: false 19 | vllm_device: auto 20 | vllm_gpu_memory_utilization: 0.7 21 | do_eval: true 22 | eval_strategy: steps 23 | eval_steps: 100 24 | gradient_accumulation_steps: 4 25 | gradient_checkpointing: true 26 | gradient_checkpointing_kwargs: 27 | use_reentrant: false 28 | hub_model_id: Qwen-2.5-7B-Simple-RL 29 | hub_strategy: every_save 30 | learning_rate: 3.0e-06 31 | log_completions: false 32 | log_level: info 33 | logging_first_step: true 34 | logging_steps: 5 35 | logging_strategy: steps 36 | lr_scheduler_type: cosine 37 | max_prompt_length: 512 38 | max_completion_length: 1024 39 | max_steps: -1 40 | num_generations: 8 41 | num_train_epochs: 1 42 | output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-n8 43 | overwrite_output_dir: true 44 | per_device_eval_batch_size: 16 45 | per_device_train_batch_size: 16 46 | push_to_hub: false 47 | report_to: 48 | - tensorboard 49 | reward_funcs: 50 | - accuracy 51 | reward_weights: 52 | - 1.0 53 | save_strategy: "epoch" 54 | seed: 42 55 | warmup_ratio: 0.1 56 | 57 | -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl_gpg_scale_batch.yaml: -------------------------------------------------------------------------------- 1 | 2 | # Model arguments 3 | model_name_or_path: models/Qwen2.5-Math-7B 4 | #model_name_or_path: Qwen/Qwen2.5-Math-7B 5 | model_revision: main 6 | torch_dtype: bfloat16 7 | attn_implementation: flash_attention_2 8 | 9 | # Data training arguments 10 | #dataset_name: DigitalLearningGmbH/MATH-lighteval 11 | dataset_name: "datas/MATH-lighteval" 12 | dataset_config: default 13 | system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags." 
14 | 15 | # GRPO trainer config 16 | beta: 0.0 17 | bf16: true 18 | use_vllm: false 19 | vllm_device: auto 20 | vllm_gpu_memory_utilization: 0.7 21 | do_eval: true 22 | eval_strategy: steps 23 | eval_steps: 100 24 | gradient_accumulation_steps: 4 25 | gradient_checkpointing: true 26 | gradient_checkpointing_kwargs: 27 | use_reentrant: false 28 | hub_model_id: Qwen-2.5-7B-Simple-RL 29 | hub_strategy: every_save 30 | learning_rate: 3.0e-06 31 | log_completions: false 32 | log_level: info 33 | logging_first_step: true 34 | logging_steps: 5 35 | logging_strategy: steps 36 | lr_scheduler_type: cosine 37 | max_prompt_length: 512 38 | max_completion_length: 1024 39 | max_steps: -1 40 | num_generations: 8 41 | num_train_epochs: 1 42 | output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-scale-batch 43 | overwrite_output_dir: true 44 | per_device_eval_batch_size: 16 45 | per_device_train_batch_size: 16 46 | push_to_hub: false 47 | report_to: 48 | - tensorboard 49 | reward_funcs: 50 | - accuracy 51 | - format 52 | reward_weights: 53 | - 1.0 54 | - 1.0 55 | save_strategy: "no" 56 | seed: 42 57 | warmup_ratio: 0.1 58 | scale_batch: true 59 | -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl_math_l35.yaml: -------------------------------------------------------------------------------- 1 | 2 | # Model arguments 3 | model_name_or_path: models/Qwen2.5-Math-7B 4 | #model_name_or_path: Qwen/Qwen2.5-Math-7B 5 | model_revision: main 6 | torch_dtype: bfloat16 7 | attn_implementation: flash_attention_2 8 | 9 | # Data training arguments 10 | #dataset_name: DigitalLearningGmbH/MATH-lighteval 11 | dataset_name: "datas/SimpleRL-Zoo-Data/simplelr_qwen_level3to5" 12 | dataset_config: "train.parquet" 13 | system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags." 
14 | # GRPO trainer config 15 | beta: 0.0 16 | bf16: true 17 | use_vllm: false 18 | vllm_device: auto 19 | vllm_gpu_memory_utilization: 0.7 20 | do_eval: true 21 | eval_strategy: steps 22 | eval_steps: 100 23 | gradient_accumulation_steps: 4 24 | gradient_checkpointing: true 25 | gradient_checkpointing_kwargs: 26 | use_reentrant: false 27 | hub_model_id: Qwen-2.5-7B-Simple-RL 28 | hub_strategy: every_save 29 | learning_rate: 3.0e-06 30 | log_completions: false 31 | log_level: info 32 | logging_first_step: true 33 | logging_steps: 5 34 | logging_strategy: steps 35 | lr_scheduler_type: cosine 36 | max_prompt_length: 512 37 | max_completion_length: 3000 38 | max_steps: -1 39 | num_generations: 8 40 | num_train_epochs: 1 41 | output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-math35 42 | overwrite_output_dir: true 43 | per_device_eval_batch_size: 16 44 | per_device_train_batch_size: 16 45 | push_to_hub: false 46 | report_to: 47 | - tensorboard 48 | reward_funcs: 49 | - accuracy_lv35 50 | - format 51 | reward_weights: 52 | - 1.0 53 | - 1.0 54 | save_strategy: "no" 55 | seed: 42 56 | warmup_ratio: 0.1 57 | -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl_math_l35_v1.yaml: -------------------------------------------------------------------------------- 1 | 2 | # Model arguments 3 | model_name_or_path: models/Qwen2.5-Math-7B 4 | #model_name_or_path: Qwen/Qwen2.5-Math-7B 5 | model_revision: main 6 | torch_dtype: bfloat16 7 | attn_implementation: flash_attention_2 8 | 9 | # Data training arguments 10 | #dataset_name: DigitalLearningGmbH/MATH-lighteval 11 | dataset_name: "datas/SimpleRL-Zoo-Data/simplelr_qwen_level3to5" 12 | dataset_config: "train.parquet" 13 | system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags." 
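The `format` reward in the recipe above only checks the shape of the output, not its correctness. A minimal sketch, assuming the usual `<think>…</think><answer>…</answer>` convention of R1-style recipes; the exact tags and regex are assumptions, not the repo's code:

```python
import re

# Minimal sketch of a format reward: 1.0 if the completion matches the expected
# <think>...</think><answer>...</answer> layout, else 0.0. Tag convention and
# regex are assumptions for illustration, not the repo's exact implementation.
FORMAT_RE = re.compile(r"^<think>.*?</think>\s*<answer>.*?</answer>$", re.DOTALL)

def format_reward(completions):
    return [1.0 if FORMAT_RE.match(c.strip()) else 0.0 for c in completions]

print(format_reward(["<think>2+2=4</think><answer>4</answer>", "just 4"]))  # [1.0, 0.0]
```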
14 | # GRPO trainer config 15 | beta: 0.0 16 | bf16: true 17 | use_vllm: false 18 | vllm_device: auto 19 | vllm_gpu_memory_utilization: 0.7 20 | do_eval: true 21 | eval_strategy: steps 22 | eval_steps: 33 23 | gradient_accumulation_steps: 1 24 | gradient_checkpointing: true 25 | gradient_checkpointing_kwargs: 26 | use_reentrant: false 27 | hub_model_id: Qwen-2.5-7B-Simple-RL 28 | hub_strategy: every_save 29 | learning_rate: 1.0e-06 30 | log_completions: false 31 | log_level: info 32 | logging_first_step: true 33 | logging_steps: 1 34 | logging_strategy: steps 35 | lr_scheduler_type: constant_with_warmup 36 | max_prompt_length: 1024 37 | max_completion_length: 3000 38 | max_steps: -1 39 | num_generations: 8 40 | num_train_epochs: 3 41 | output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-math35-v1 42 | overwrite_output_dir: true 43 | per_device_eval_batch_size: 16 44 | per_device_train_batch_size: 8 45 | push_to_hub: false 46 | report_to: 47 | - tensorboard 48 | reward_funcs: 49 | - accuracy_lv35 50 | reward_weights: 51 | - 1.0 52 | save_strategy: "steps" 53 | save_steps: 33 54 | seed: 42 55 | warmup_ratio: 0.03 56 | temperature: 1.0 57 | top_p : 1.0 58 | 59 | -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl_math_l35_v2.yaml: -------------------------------------------------------------------------------- 1 | 2 | # Model arguments 3 | model_name_or_path: models/Qwen2.5-Math-7B 4 | #model_name_or_path: Qwen/Qwen2.5-Math-7B 5 | model_revision: main 6 | torch_dtype: bfloat16 7 | attn_implementation: flash_attention_2 8 | 9 | # Data training arguments 10 | #dataset_name: DigitalLearningGmbH/MATH-lighteval 11 | dataset_name: "datas/SimpleRL-Zoo-Data/simplelr_qwen_level3to5" 12 | dataset_config: "train.parquet" 13 | system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags." 
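With `num_generations: 8` and `per_device_train_batch_size: 8`, each device holds exactly one prompt group per forward pass, and the number of distinct prompts per optimizer step follows directly. A worked check, assuming the 8-process accelerate configs shipped later in this repo:

```python
# Worked check of the v1 recipe's batch shape (8 processes assumed, as in the
# accelerate_configs further down this listing).
num_processes = 8
per_device_train_batch_size = 8
gradient_accumulation_steps = 1
num_generations = 8

completions_per_step = num_processes * per_device_train_batch_size * gradient_accumulation_steps
assert completions_per_step % num_generations == 0  # the trainer requires divisibility
unique_prompts_per_step = completions_per_step // num_generations
print(unique_prompts_per_step)  # 8 prompt groups per optimizer step
```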
14 | # GRPO trainer config 15 | beta: 0.0 16 | bf16: true 17 | use_vllm: false 18 | vllm_device: auto 19 | vllm_gpu_memory_utilization: 0.7 20 | do_eval: true 21 | eval_strategy: steps 22 | eval_steps: 100 23 | gradient_accumulation_steps: 1 24 | gradient_checkpointing: true 25 | gradient_checkpointing_kwargs: 26 | use_reentrant: false 27 | hub_model_id: Qwen-2.5-7B-Simple-RL 28 | hub_strategy: every_save 29 | learning_rate: 1.0e-06 30 | log_completions: false 31 | log_level: info 32 | logging_first_step: true 33 | logging_steps: 5 34 | logging_strategy: steps 35 | lr_scheduler_type: constant_with_warmup 36 | max_prompt_length: 1024 37 | max_completion_length: 3000 38 | max_steps: -1 39 | num_generations: 8 40 | num_train_epochs: 1 41 | output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-math35-v2 42 | overwrite_output_dir: true 43 | per_device_eval_batch_size: 16 44 | per_device_train_batch_size: 4 45 | push_to_hub: false 46 | report_to: 47 | - tensorboard 48 | reward_funcs: 49 | - accuracy_lv35 50 | - format 51 | reward_weights: 52 | - 1.0 53 | - 1.0 54 | save_strategy: "no" 55 | seed: 42 56 | warmup_ratio: 0.03 57 | temperature: 1.0 58 | top_p : 1.0 59 | scale_rewards: false 60 | 61 | -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl_math_l35_v2_g16.yaml: -------------------------------------------------------------------------------- 1 | 2 | # Model arguments 3 | model_name_or_path: models/Qwen2.5-Math-7B 4 | #model_name_or_path: Qwen/Qwen2.5-Math-7B 5 | model_revision: main 6 | torch_dtype: bfloat16 7 | attn_implementation: flash_attention_2 8 | 9 | # Data training arguments 10 | #dataset_name: DigitalLearningGmbH/MATH-lighteval 11 | dataset_name: "datas/SimpleRL-Zoo-Data/simplelr_qwen_level3to5" 12 | dataset_config: "train.parquet" 13 | system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags." 
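`scale_rewards: false` in the recipe above means group advantages are mean-centred but not divided by the group's reward standard deviation. A sketch of the difference, illustrative rather than the trainer's code:

```python
import statistics

# Sketch of group-relative advantages with and without std scaling
# (scale_rewards: true vs. false); illustrative, not the trainer's implementation.
def group_advantages(rewards, scale_rewards=False, eps=1e-4):
    mean = sum(rewards) / len(rewards)
    advantages = [r - mean for r in rewards]
    if scale_rewards:
        std = statistics.pstdev(rewards)
        advantages = [a / (std + eps) for a in advantages]
    return advantages

rewards = [1.0, 0.0, 0.0, 1.0]
print(group_advantages(rewards, scale_rewards=True))   # scaled by the group std
print(group_advantages(rewards, scale_rewards=False))  # mean-centred only
```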
14 | # GRPO trainer config 15 | beta: 0.0 16 | bf16: true 17 | use_vllm: false 18 | vllm_device: auto 19 | vllm_gpu_memory_utilization: 0.7 20 | do_eval: true 21 | eval_strategy: steps 22 | eval_steps: 100 23 | gradient_accumulation_steps: 1 24 | gradient_checkpointing: true 25 | gradient_checkpointing_kwargs: 26 | use_reentrant: false 27 | hub_model_id: Qwen-2.5-7B-Simple-RL 28 | hub_strategy: every_save 29 | learning_rate: 1.0e-06 30 | log_completions: false 31 | log_level: info 32 | logging_first_step: true 33 | logging_steps: 5 34 | logging_strategy: steps 35 | lr_scheduler_type: constant_with_warmup 36 | max_prompt_length: 1024 37 | max_completion_length: 3000 38 | max_steps: -1 39 | num_generations: 16 40 | num_train_epochs: 1 41 | output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-math35-v2-g16 42 | overwrite_output_dir: true 43 | per_device_eval_batch_size: 16 44 | per_device_train_batch_size: 8 45 | push_to_hub: false 46 | report_to: 47 | - tensorboard 48 | reward_funcs: 49 | - accuracy_lv35 50 | - format 51 | reward_weights: 52 | - 1.0 53 | - 1.0 54 | save_strategy: "no" 55 | seed: 42 56 | warmup_ratio: 0.03 57 | temperature: 1.0 58 | top_p : 1.0 59 | scale_rewards: false 60 | 61 | -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl_v1.yaml: -------------------------------------------------------------------------------- 1 | 2 | # Model arguments 3 | model_name_or_path: models/Qwen2.5-Math-7B 4 | #model_name_or_path: Qwen/Qwen2.5-Math-7B 5 | model_revision: main 6 | torch_dtype: bfloat16 7 | attn_implementation: flash_attention_2 8 | 9 | # Data training arguments 10 | #dataset_name: DigitalLearningGmbH/MATH-lighteval 11 | dataset_name: "datas/MATH-lighteval" 12 | dataset_config: default 13 | system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags." 
14 | 15 | # GRPO trainer config 16 | beta: 0.0 17 | bf16: true 18 | use_vllm: false 19 | vllm_device: auto 20 | vllm_gpu_memory_utilization: 0.7 21 | do_eval: true 22 | eval_strategy: steps 23 | eval_steps: 100 24 | gradient_accumulation_steps: 4 25 | gradient_checkpointing: true 26 | gradient_checkpointing_kwargs: 27 | use_reentrant: false 28 | hub_model_id: Qwen-2.5-7B-Simple-RL 29 | hub_strategy: every_save 30 | learning_rate: 3.0e-06 31 | log_completions: false 32 | log_level: info 33 | logging_first_step: true 34 | logging_steps: 5 35 | logging_strategy: steps 36 | lr_scheduler_type: cosine 37 | max_prompt_length: 512 38 | max_completion_length: 1024 39 | max_steps: -1 40 | num_generations: 8 41 | num_train_epochs: 1 42 | output_dir: data/Qwen-2.5-7B-Simple-RL-GPG 43 | overwrite_output_dir: true 44 | per_device_eval_batch_size: 16 45 | per_device_train_batch_size: 16 46 | push_to_hub: false 47 | report_to: 48 | - tensorboard 49 | reward_funcs: 50 | - accuracy 51 | - format 52 | reward_weights: 53 | - 1.0 54 | - 1.0 55 | save_strategy: "no" 56 | seed: 42 57 | warmup_ratio: 0.1 58 | -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl_v1_kl.yaml: -------------------------------------------------------------------------------- 1 | 2 | # Model arguments 3 | model_name_or_path: models/Qwen2.5-Math-7B 4 | #model_name_or_path: Qwen/Qwen2.5-Math-7B 5 | model_revision: main 6 | torch_dtype: bfloat16 7 | attn_implementation: flash_attention_2 8 | 9 | # Data training arguments 10 | #dataset_name: DigitalLearningGmbH/MATH-lighteval 11 | dataset_name: "datas/MATH-lighteval" 12 | dataset_config: default 13 | system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags." 
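The `accuracy` reward compares the model's final answer against the reference solution. A simplified sketch that extracts a `\boxed{}` answer and compares it as a string; the real reward relies on proper math verification, so treat this only as an illustration of the interface:

```python
import re

# Simplified accuracy reward: extract the last \boxed{...} and compare as strings.
# The actual reward uses symbolic math verification; this only illustrates the
# interface (completions + ground-truth solutions -> 0/1 rewards).
def extract_boxed(text):
    matches = re.findall(r"\\boxed\{([^{}]*)\}", text)
    return matches[-1].strip() if matches else None

def accuracy_reward(completions, solutions):
    return [
        1.0 if extract_boxed(c) is not None and extract_boxed(c) == extract_boxed(s) else 0.0
        for c, s in zip(completions, solutions)
    ]

print(accuracy_reward(["... so the answer is \\boxed{42}"], ["\\boxed{42}"]))  # [1.0]
```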
14 | 15 | # GRPO trainer config 16 | bf16: true 17 | use_vllm: false 18 | vllm_device: auto 19 | vllm_gpu_memory_utilization: 0.7 20 | do_eval: true 21 | eval_strategy: steps 22 | eval_steps: 100 23 | gradient_accumulation_steps: 4 24 | gradient_checkpointing: true 25 | gradient_checkpointing_kwargs: 26 | use_reentrant: false 27 | hub_model_id: Qwen-2.5-7B-Simple-RL 28 | hub_strategy: every_save 29 | learning_rate: 3.0e-06 30 | log_completions: false 31 | log_level: info 32 | logging_first_step: true 33 | logging_steps: 5 34 | logging_strategy: steps 35 | lr_scheduler_type: cosine 36 | max_prompt_length: 512 37 | max_completion_length: 1024 38 | max_steps: -1 39 | num_generations: 8 40 | num_train_epochs: 1 41 | output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-beta04 42 | overwrite_output_dir: true 43 | per_device_eval_batch_size: 16 44 | per_device_train_batch_size: 16 45 | push_to_hub: false 46 | report_to: 47 | - tensorboard 48 | reward_funcs: 49 | - accuracy 50 | - format 51 | reward_weights: 52 | - 1.0 53 | - 1.0 54 | save_strategy: "no" 55 | seed: 42 56 | warmup_ratio: 0.1 57 | -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl_v1_nostd.yaml: -------------------------------------------------------------------------------- 1 | 2 | # Model arguments 3 | model_name_or_path: models/Qwen2.5-Math-7B 4 | #model_name_or_path: Qwen/Qwen2.5-Math-7B 5 | model_revision: main 6 | torch_dtype: bfloat16 7 | attn_implementation: flash_attention_2 8 | 9 | # Data training arguments 10 | #dataset_name: DigitalLearningGmbH/MATH-lighteval 11 | dataset_name: "datas/MATH-lighteval" 12 | dataset_config: default 13 | system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags." 
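Unlike the other recipes here, the `_kl` variant keeps a KL penalty towards the frozen reference policy; `beta` is the coefficient on that penalty, and setting it to 0.0 (as the other recipes do) effectively removes the reference model from the objective. Schematically, per token:

```python
import math

# Schematic per-token objective with a KL penalty towards a frozen reference
# policy. `beta` corresponds to the `beta` field of these recipes; the value
# below is arbitrary. Illustrative only, not the trainer's implementation.
def per_token_loss(logp, ref_logp, advantage, beta):
    pg_term = -advantage  # policy-gradient part; on-policy importance ratio is 1
    kl_term = math.exp(ref_logp - logp) - (ref_logp - logp) - 1.0  # k3 KL estimator
    return pg_term + beta * kl_term

print(per_token_loss(logp=-1.2, ref_logp=-1.0, advantage=0.5, beta=0.04))
```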
14 | 15 | # GRPO trainer config 16 | beta: 0.0 17 | bf16: true 18 | use_vllm: false 19 | vllm_device: auto 20 | vllm_gpu_memory_utilization: 0.7 21 | do_eval: true 22 | eval_strategy: steps 23 | eval_steps: 100 24 | gradient_accumulation_steps: 4 25 | gradient_checkpointing: true 26 | gradient_checkpointing_kwargs: 27 | use_reentrant: false 28 | hub_model_id: Qwen-2.5-7B-Simple-RL 29 | hub_strategy: every_save 30 | learning_rate: 3.0e-06 31 | log_completions: false 32 | log_level: info 33 | logging_first_step: true 34 | logging_steps: 5 35 | logging_strategy: steps 36 | lr_scheduler_type: cosine 37 | max_prompt_length: 512 38 | max_completion_length: 1024 39 | max_steps: -1 40 | num_generations: 8 41 | num_train_epochs: 1 42 | output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-wostd 43 | overwrite_output_dir: true 44 | per_device_eval_batch_size: 16 45 | per_device_train_batch_size: 16 46 | push_to_hub: false 47 | report_to: 48 | - tensorboard 49 | reward_funcs: 50 | - accuracy 51 | - format 52 | reward_weights: 53 | - 1.0 54 | - 1.0 55 | save_strategy: "no" 56 | seed: 42 57 | warmup_ratio: 0.1 58 | scale_rewards: false -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl_wokl.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | #model_name_or_path: Qwen/Qwen2.5-Math-7B 3 | model_name_or_path: models/Qwen2.5-Math-7B 4 | 5 | model_revision: main 6 | torch_dtype: bfloat16 7 | attn_implementation: flash_attention_2 8 | 9 | # Data training arguments 10 | #dataset_name: DigitalLearningGmbH/MATH-lighteval 11 | dataset_name: "datas/MATH-lighteval" 12 | dataset_config: default 13 | system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags." 
14 | 15 | # GRPO trainer config 16 | bf16: true 17 | use_vllm: false 18 | do_eval: true 19 | eval_strategy: steps 20 | eval_steps: 100 21 | gradient_accumulation_steps: 2 22 | gradient_checkpointing: true 23 | gradient_checkpointing_kwargs: 24 | use_reentrant: false 25 | hub_model_id: Qwen-2.5-7B-Simple-RL 26 | hub_strategy: every_save 27 | learning_rate: 3.0e-06 28 | log_completions: false 29 | log_level: info 30 | logging_first_step: true 31 | logging_steps: 5 32 | logging_strategy: steps 33 | lr_scheduler_type: cosine 34 | max_prompt_length: 512 35 | max_completion_length: 1024 36 | max_steps: -1 37 | num_generations: 8 38 | num_train_epochs: 1 39 | output_dir: data/Qwen-2.5-7B-Simple-RL-TRPO-wokl 40 | overwrite_output_dir: true 41 | per_device_eval_batch_size: 16 42 | per_device_train_batch_size: 16 43 | push_to_hub: false 44 | report_to: 45 | - tensorboard 46 | reward_funcs: 47 | - accuracy 48 | - format 49 | reward_weights: 50 | - 1.0 51 | - 1.0 52 | save_strategy: "no" 53 | seed: 42 54 | warmup_ratio: 0.1 -------------------------------------------------------------------------------- /open-r1/recipes/README.md: -------------------------------------------------------------------------------- 1 | # Post-training recipes 2 | 3 | ## OlympicCoder 4 | 5 | To train the OlympicCoder models, run: 6 | 7 | ``` 8 | # 7B 9 | sbatch --nodes=1 slurm/train.slurm OlympicCoder-7B sft v00.00 zero3 10 | 11 | # 32B 12 | sbatch --nodes=16 slurm/train.slurm OlympicCoder-32B sft v00.00 fsdp 13 | ``` 14 | 15 | Note that we found it necessary to switch to FSDP1 and paged AdamW 8-bit for the 32B model in order to fit the largest possible context size. -------------------------------------------------------------------------------- /open-r1/recipes/SmolLM2-1.7B-Instruct/sft/config.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | # You can download the model and manually change the rope to 300k/500k and max_position_embeddings to 32768 3 | model_name_or_path: HuggingFaceTB/SmolLM2-1.7B-Instruct 4 | model_revision: main 5 | torch_dtype: bfloat16 6 | attn_implementation: sdpa 7 | 8 | # Data training arguments 9 | dataset_name: open-r1/OpenR1-Math-220k 10 | dataset_num_proc: 48 11 | 12 | #SFT hyperparam 13 | max_length: 8192 # You can set this to 32768 if you change the rope, but you need to change the config.json file 14 | weight_decay: 0.0001 15 | optim: adamw_torch 16 | lr_scheduler_type: linear 17 | warmup_ratio: 0.1 18 | learning_rate: 5.0e-05 19 | gradient_accumulation_steps: 2 20 | per_device_eval_batch_size: 4 21 | per_device_train_batch_size: 4 # Change this depending on the context length of the model to keep a 500M GBS. 
22 | 23 | # SFT trainer config 24 | max_steps: -1 25 | num_train_epochs: 3 26 | bf16: true 27 | do_eval: false 28 | eval_strategy: 'no' 29 | gradient_checkpointing: true 30 | gradient_checkpointing_kwargs: 31 | use_reentrant: false 32 | hub_model_id: OpenR1-Qwen-7B-SFT 33 | hub_strategy: every_save 34 | log_level: info 35 | logging_steps: 5 36 | logging_strategy: steps 37 | packing: true 38 | output_dir: data/OpenR1-Qwen-7B-SFT 39 | overwrite_output_dir: true 40 | push_to_hub: true 41 | report_to: 42 | - wandb 43 | save_strategy: "steps" 44 | save_steps: 500 45 | save_total_limit: 1 46 | seed: 42 47 | -------------------------------------------------------------------------------- /open-r1/recipes/SmolLM2-1.7B/sft/config.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | # You can download the model and manually change the rope to 300k/500k and max_position_embeddings to 32768 3 | model_name_or_path: HuggingFaceTB/SmolLM2-1.7B 4 | model_revision: main 5 | torch_dtype: bfloat16 6 | attn_implementation: sdpa 7 | 8 | # Data training arguments 9 | dataset_name: open-r1/OpenR1-Math-220k 10 | dataset_num_proc: 48 11 | 12 | #SFT hyperparam 13 | max_length: 8192 # You can set this to 32768 if you change the rope, but you need to change the config.json file 14 | weight_decay: 0.0001 15 | optim: adamw_torch 16 | lr_scheduler_type: linear 17 | warmup_ratio: 0.1 18 | learning_rate: 5.0e-05 19 | gradient_accumulation_steps: 2 20 | per_device_eval_batch_size: 4 21 | per_device_train_batch_size: 4 # Change this depending on the context length of the model to keep a 500M GBS. 22 | 23 | # SFT trainer config 24 | max_steps: -1 25 | num_train_epochs: 3 26 | bf16: true 27 | do_eval: false 28 | eval_strategy: 'no' 29 | gradient_checkpointing: true 30 | gradient_checkpointing_kwargs: 31 | use_reentrant: false 32 | hub_model_id: OpenR1-Qwen-7B-SFT 33 | hub_strategy: every_save 34 | log_level: info 35 | logging_steps: 5 36 | logging_strategy: steps 37 | packing: true 38 | output_dir: data/OpenR1-Qwen-7B-SFT 39 | overwrite_output_dir: true 40 | push_to_hub: true 41 | report_to: 42 | - wandb 43 | save_strategy: "steps" 44 | save_steps: 500 45 | save_total_limit: 1 46 | seed: 42 47 | -------------------------------------------------------------------------------- /open-r1/recipes/accelerate_configs/ddp.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | distributed_type: MULTI_GPU 4 | downcast_bf16: 'no' 5 | gpu_ids: all 6 | machine_rank: 0 7 | main_training_function: main 8 | mixed_precision: bf16 9 | num_machines: 1 10 | num_processes: 8 11 | rdzv_backend: static 12 | same_network: true 13 | tpu_env: [] 14 | tpu_use_cluster: false 15 | tpu_use_sudo: false 16 | use_cpu: false 17 | -------------------------------------------------------------------------------- /open-r1/recipes/accelerate_configs/fsdp.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | distributed_type: FSDP 4 | downcast_bf16: 'no' 5 | enable_cpu_affinity: false 6 | fsdp_config: 7 | fsdp_activation_checkpointing: false # Need fix from: https://github.com/huggingface/transformers/pull/36610 8 | fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP 9 | fsdp_backward_prefetch: BACKWARD_PRE 10 | fsdp_cpu_ram_efficient_loading: true 11 | fsdp_forward_prefetch: true 12 | fsdp_offload_params: false 13 | 
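The comments in the SmolLM2 recipes above about "changing the rope to 300k/500k" and raising `max_position_embeddings` to 32768 refer to editing the downloaded checkpoint's `config.json` before training. One way to do that, assuming the rope change means `rope_theta` and using an illustrative local path:

```python
import json

# Patch a locally downloaded SmolLM2 config for long-context SFT, following the
# recipe comment: raise rope_theta (e.g. 300k or 500k) and max_position_embeddings
# to 32768. Path and values are illustrative assumptions.
config_path = "models/SmolLM2-1.7B-Instruct/config.json"
with open(config_path) as f:
    cfg = json.load(f)

cfg["rope_theta"] = 500_000
cfg["max_position_embeddings"] = 32_768

with open(config_path, "w") as f:
    json.dump(cfg, f, indent=2)
```

After patching, `max_length` in the SFT recipe can be raised to 32768 as the comment suggests.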
fsdp_sharding_strategy: FULL_SHARD 14 | fsdp_state_dict_type: FULL_STATE_DICT 15 | fsdp_sync_module_states: true 16 | fsdp_use_orig_params: true 17 | machine_rank: 0 18 | main_training_function: main 19 | mixed_precision: bf16 20 | num_machines: 1 21 | num_processes: 8 22 | rdzv_backend: static 23 | same_network: true 24 | tpu_env: [] 25 | tpu_use_cluster: false 26 | tpu_use_sudo: false 27 | use_cpu: false -------------------------------------------------------------------------------- /open-r1/recipes/accelerate_configs/zero1.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | deepspeed_multinode_launcher: standard 5 | offload_optimizer_device: none 6 | offload_param_device: none 7 | zero3_init_flag: false 8 | zero_stage: 1 9 | distributed_type: DEEPSPEED 10 | downcast_bf16: 'no' 11 | machine_rank: 0 12 | main_training_function: main 13 | mixed_precision: bf16 14 | num_machines: 1 15 | num_processes: 8 16 | rdzv_backend: static 17 | same_network: true 18 | tpu_env: [] 19 | tpu_use_cluster: false 20 | tpu_use_sudo: false 21 | use_cpu: false -------------------------------------------------------------------------------- /open-r1/recipes/accelerate_configs/zero2.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | deepspeed_multinode_launcher: standard 5 | offload_optimizer_device: none 6 | offload_param_device: none 7 | zero3_init_flag: false 8 | zero_stage: 2 9 | distributed_type: DEEPSPEED 10 | downcast_bf16: 'no' 11 | machine_rank: 0 12 | main_training_function: main 13 | mixed_precision: bf16 14 | num_machines: 1 15 | num_processes: 8 16 | rdzv_backend: static 17 | same_network: true 18 | tpu_env: [] 19 | tpu_use_cluster: false 20 | tpu_use_sudo: false 21 | use_cpu: false -------------------------------------------------------------------------------- /open-r1/recipes/accelerate_configs/zero3.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | deepspeed_multinode_launcher: standard 5 | offload_optimizer_device: none 6 | offload_param_device: none 7 | zero3_init_flag: true 8 | zero3_save_16bit_model: true 9 | zero_stage: 3 10 | distributed_type: DEEPSPEED 11 | downcast_bf16: 'no' 12 | machine_rank: 0 13 | main_training_function: main 14 | mixed_precision: bf16 15 | num_machines: 1 16 | num_processes: 8 17 | rdzv_backend: static 18 | same_network: true 19 | tpu_env: [] 20 | tpu_use_cluster: false 21 | tpu_use_sudo: false 22 | use_cpu: false 23 | -------------------------------------------------------------------------------- /open-r1/scripts/get_tensor_parallel_size.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from transformers import AutoConfig 3 | from math import gcd 4 | 5 | def get_tensor_parallel_size(model_name: str, revision: str = None, default_tp: int = 8) -> int: 6 | try: 7 | config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=True) 8 | num_heads = getattr(config, 'num_attention_heads', None) 9 | 10 | if num_heads is not None and num_heads % default_tp != 0: 11 | tp = gcd(num_heads, default_tp) 12 | return max(tp, 1) 13 | else: 14 | return default_tp 15 | except Exception as e: 16 | print(f"Warning: Failed to fetch config for 
{model_name}@{revision}: {e}") 17 | return default_tp 18 | 19 | if __name__ == "__main__": 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument("--model_name", type=str, required=True, help="Hugging Face model name or path") 22 | parser.add_argument("--revision", type=str, default=None, help="Model revision if applicable") 23 | parser.add_argument("--default_tp", type=int, default=8, help="Default TP size (usually GPUs per node)") 24 | 25 | args = parser.parse_args() 26 | 27 | tp = get_tensor_parallel_size(args.model_name, args.revision, args.default_tp) 28 | print(tp) 29 | -------------------------------------------------------------------------------- /open-r1/scripts/upload_details.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2025 The HuggingFace Inc. team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ 16 | Push the details from a LightEval run to the Hub. 17 | 18 | Usage: 19 | 20 | python src/open_r1/utils/upload_details.py \ 21 | --data_files {path_to_parquet_file} \ 22 | --hub_repo_id {hub_repo_id} \ 23 | --config_name {config_name} 24 | """ 25 | 26 | from dataclasses import dataclass, field 27 | from typing import List 28 | 29 | from datasets import load_dataset 30 | from transformers import HfArgumentParser 31 | 32 | 33 | @dataclass 34 | class ScriptArguments: 35 | data_files: List[str] = field(default_factory=list) 36 | hub_repo_id: str = None 37 | config_name: str = None 38 | 39 | 40 | def main(): 41 | parser = HfArgumentParser(ScriptArguments) 42 | args = parser.parse_args_into_dataclasses()[0] 43 | 44 | if all(file.endswith(".json") for file in args.data_files): 45 | ds = load_dataset("json", data_files=args.data_files) 46 | elif all(file.endswith(".jsonl") for file in args.data_files): 47 | ds = load_dataset("json", data_files=args.data_files) 48 | else: 49 | ds = load_dataset("parquet", data_files=args.data_files) 50 | url = ds.push_to_hub(args.hub_repo_id, config_name=args.config_name, private=True) 51 | print(f"Dataset available at: {url}") 52 | 53 | 54 | if __name__ == "__main__": 55 | main() 56 | -------------------------------------------------------------------------------- /open-r1/setup.cfg: -------------------------------------------------------------------------------- 1 | [isort] 2 | default_section = FIRSTPARTY 3 | ensure_newline_before_comments = True 4 | force_grid_wrap = 0 5 | include_trailing_comma = True 6 | known_first_party = open_r1 7 | known_third_party = 8 | transformers 9 | datasets 10 | fugashi 11 | git 12 | h5py 13 | matplotlib 14 | nltk 15 | numpy 16 | packaging 17 | pandas 18 | psutil 19 | pytest 20 | rouge_score 21 | sacrebleu 22 | seqeval 23 | sklearn 24 | streamlit 25 | torch 26 | tqdm 27 | 28 | line_length = 119 29 | lines_after_imports = 2 30 | multi_line_output = 3 31 | use_parentheses = True 32 | 33 | [flake8] 34 | ignore = E203, E501, E741, W503, W605 35 | max-line-length = 119 36 | 
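For example, a model whose attention-head count is not divisible by the node's 8 GPUs gets a smaller tensor-parallel degree through the gcd fallback in the script above (the head count below is just an illustrative number):

```python
from math import gcd

# Worked example of the fallback in get_tensor_parallel_size.py:
# 28 attention heads on an 8-GPU node -> TP degree 4.
num_heads, default_tp = 28, 8
tp = gcd(num_heads, default_tp) if num_heads % default_tp != 0 else default_tp
print(tp)  # 4
```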
per-file-ignores = 37 | # imported but unused 38 | __init__.py: F401 39 | 40 | [tool:pytest] 41 | doctest_optionflags=NUMBER NORMALIZE_WHITESPACE ELLIPSIS -------------------------------------------------------------------------------- /open-r1/slurm/README.md: -------------------------------------------------------------------------------- 1 | ## Serving DeepSeek-R1 on 2x8 H100 SLURM nodes with SGLang 2 | 3 | 1. Set up the environment (adjust for your cuda version): 4 | ```bash 5 | conda create -n sglang124 python=3.11 6 | conda activate sglang124 7 | 8 | pip install torch==2.5.1 --index-url https://download.pytorch.org/whl/cu124 9 | 10 | pip install sgl-kernel --force-reinstall --no-deps 11 | pip install "sglang[all]>=0.4.2.post4" --find-links https://flashinfer.ai/whl/cu124/torch2.5/flashinfer/ 12 | ``` 13 | 14 | 2. Run the server and wait for the model to load: 15 | ```bash 16 | sbatch slurm/serve_r1.slurm -m "/fsx/deepseek-r1-checkpoint" -e "sglang124" 17 | ``` 18 | 19 | 3. Run the data generation script: 20 | ```bash 21 | python scripts/generate_reasoning.py \ 22 | --dataset-name "AI-MO/NuminaMath-1.5" \ 23 | --output-file "numinamath_r1_generations.jsonl" \ 24 | --prompt-column "problem" \ 25 | --uuid-column "problem" \ 26 | --api-addr ":39877" \ 27 | --num-generations 2 \ 28 | --max-tokens 16384 \ 29 | --max-concurrent 200 30 | ``` -------------------------------------------------------------------------------- /open-r1/slurm/piston/launch_piston_workers.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # this simple script will launch a bunch of piston workers on the HF science cluster 4 | 5 | N_INSTANCES=${1:-5} # Default to 5 instances 6 | 7 | for i in $(seq 1 $N_INSTANCES); do 8 | # Find random (hopefully) available port 9 | PORT=$(comm -23 <(seq 2000 10000 | sort) <(ss -tan | awk '{print $4}' | cut -d':' -f2 | sort -u) | shuf | head -n1) 10 | 11 | # the job name format is important for the code to then be able to get a list of workers. 
`piston-worker-` 12 | sbatch \ 13 | --job-name="piston-worker-$PORT" \ 14 | --export=ALL,PORT=$PORT \ 15 | slurm/piston/launch_single_piston.sh 16 | done -------------------------------------------------------------------------------- /open-r1/slurm/piston/launch_single_piston.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=piston_worker 3 | #SBATCH --output=/fsx/open-r1/logs/piston/worker-logs/%x-%j.out 4 | #SBATCH --error=/fsx/open-r1/logs/piston/worker-logs/%x-%j.out # Redirect error logs to .out 5 | #SBATCH --cpus-per-task=2 6 | #SBATCH --mem-per-cpu=1950M 7 | #SBATCH --partition=hopper-cpu 8 | #SBATCH --time=48:00:00 9 | 10 | # sometimes if a bunch of workers start at the same time pyxis dies 11 | sleep $(( RANDOM % 20 )) 12 | 13 | # mounting the packages folder lets us not have to manually install the package on each instance 14 | # we use 63b5654156a89c5a2ad281aface21416615d62ec056d88efe8fcd307ce73575a as the latest image requires isolate, which does not work on the HF science cluster (cgroups incompatibility) 15 | # feel free try with the latest image 16 | # the code you see below increases the very constrained piston default limits, and sets the repo url to the one hosting our IOI package 17 | srun --container-mounts=/fsx/guilherme/ioi2024/piston_files/packages:/piston/packages --container-image "ghcr.io#engineer-man/piston:sha256:63b5654156a89c5a2ad281aface21416615d62ec056d88efe8fcd307ce73575a" \ 18 | bash -c " 19 | export PISTON_COMPILE_TIMEOUT=60000 20 | export PISTON_RUN_TIMEOUT=60000 21 | export PISTON_OUTPUT_MAX_SIZE=1000000000 22 | export PISTON_MAX_FILE_SIZE=1000000000 23 | export PISTON_DISABLE_NETWORKING=true 24 | export PISTON_REPO_URL=https://github.com/guipenedo/piston/releases/download/pkgs/index 25 | 26 | sed -i '/app.use(body_parser.urlencoded/c\ app.use(body_parser.urlencoded({ extended: true, limit: \"512mb\" }));' src/index.js 27 | sed -i '/app.use(body_parser.json/c\ app.use(body_parser.json({ limit: \"512mb\" }));' src/index.js 28 | 29 | # Start server in background 30 | node src 31 | " 32 | -------------------------------------------------------------------------------- /open-r1/slurm/serve_router.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=r1-router 3 | #SBATCH --partition=hopper-cpu 4 | #SBATCH --qos=high 5 | #SBATCH --nodes=1 6 | #SBATCH --cpus-per-task=8 7 | #SBATCH --mem-per-cpu=1875m 8 | #SBATCH --output=./logs/%x_%j_%n.out 9 | #SBATCH --error=./logs/%x_%j_%n.err 10 | #SBATCH --time=30-00:00:00 11 | #SBATCH --requeue 12 | 13 | set -exuo pipefail 14 | 15 | # TODO: Adjust these variables to your cluster configuration 16 | CONDA_ENV="sglang124" 17 | ROUTER_PORT=39876 18 | 19 | trap 'scontrol requeue ${SLURM_JOB_ID}; exit 15' SIGUSR1 20 | 21 | while getopts "e:h" opt; do 22 | case $opt in 23 | e) CONDA_ENV="$OPTARG" ;; 24 | h|?) echo "Usage: sbatch $0 [-e CONDA_ENV]"; exit 1 ;; 25 | esac 26 | done 27 | 28 | # TODO: Environment setup, adjust to your cluster configuration 29 | source ~/.bashrc 30 | source "$CONDA_PREFIX/etc/profile.d/conda.sh" 31 | conda activate "$CONDA_ENV" || { echo "Failed to activate conda env $CONDA_ENV"; exit 1; } 32 | 33 | python -m sglang_router.launch_router \ 34 | --port "$ROUTER_PORT" \ 35 | --host 0.0.0.0 \ 36 | --worker-startup-timeout-secs 300 37 | 38 | # Keep the job running with health checks 39 | while true; do 40 | if ! 
curl -s -o /dev/null "http://localhost:$ROUTER_PORT/health"; then 41 | echo "Error: Router health check failed" 42 | exit 1 43 | fi 44 | sleep 300 45 | done -------------------------------------------------------------------------------- /open-r1/src/open_r1/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /open-r1/src/open_r1/test_dataset.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer 2 | import torch 3 | from open_r1.utils.data_utils import custom_loading_dataset 4 | 5 | # Load the pretrained tokenizer 6 | tokenizer = AutoTokenizer.from_pretrained("models/Qwen2.5-Math-7B") 7 | dataset = custom_loading_dataset("datas/SimpleRL-Zoo-Data/simplelr_qwen_level3to5", tokenizer=tokenizer) 8 | 9 | 10 | def make_conversation_math35(example): 11 | prompt = [] 12 | # prompt.append({"role": "user", "content": example["instruction"][0]['content']}) 13 | prompt = example["instruction"][0]['content'] 14 | # prompt.append({"role": "user", "content": example["problem"]}) 15 | return {"prompt": prompt} 16 | 17 | dataset = dataset.map(make_conversation_math35) 18 | 19 | # Initialize the running maximum length 20 | max_length = 0 21 | 22 | # Iterate over the dataset and compute the length of each sample 23 | for text in dataset['train']: 24 | # Encode the text with the tokenizer 25 | text = text['prompt'] 26 | print(text) 27 | inputs = tokenizer(text, return_tensors="pt", padding=False, truncation=False) 28 | # Get the input length 29 | length = inputs["input_ids"].shape[1] 30 | # Update the maximum length 31 | if length > max_length: 32 | max_length = length 33 | 34 | print(f"Maximum length after tokenization: {max_length}") -------------------------------------------------------------------------------- /open-r1/src/open_r1/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .import_utils import is_e2b_available 2 | from .model_utils import get_tokenizer 3 | 4 | 5 | __all__ = ["get_tokenizer", "is_e2b_available"] 6 | -------------------------------------------------------------------------------- /open-r1/src/open_r1/utils/import_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from transformers.utils.import_utils import _is_package_available 16 | 17 | 18 | # Use same as transformers.utils.import_utils 19 | _e2b_available = _is_package_available("e2b") 20 | 21 | 22 | def is_e2b_available() -> bool: 23 | return _e2b_available 24 | -------------------------------------------------------------------------------- /open-r1/src/open_r1/utils/ioi/__init__.py: -------------------------------------------------------------------------------- 1 | from .piston_client import get_piston_client_from_env, get_slurm_piston_endpoints 2 | from .scoring import SubtaskResult, score_subtask 3 | from .utils import add_includes 4 | 5 | 6 | __all__ = [ 7 | "get_piston_client_from_env", 8 | "get_slurm_piston_endpoints", 9 | "score_subtask", 10 | "add_includes", 11 | "SubtaskResult", 12 | ] 13 | -------------------------------------------------------------------------------- /open-r1/src/open_r1/utils/ioi/utils.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from functools import lru_cache 3 | from itertools import islice 4 | 5 | from datasets import load_dataset 6 | 7 | 8 | def add_includes(code: str, problem_id: str) -> str: 9 | """ 10 | Fix common compilation errors for IOI problems. 11 | """ 12 | if not code: 13 | return code 14 | # has most of the useful functions 15 | code_header = "#include \n" 16 | # include the problem header 17 | problem_header_include = f'#include "{problem_id}.h"' 18 | if problem_header_include not in code: 19 | code_header += problem_header_include + "\n" 20 | # use namespace std since models forget std:: often 21 | if "using namespace std;" not in code and "std::" not in code: 22 | code_header += "\nusing namespace std;\n\n" 23 | return code_header + code 24 | 25 | 26 | @lru_cache 27 | def load_ioi_tests_for_year(year: int) -> dict[str, dict[str, tuple[str, str]]]: 28 | """ 29 | Load IOI tests for a given year. 30 | """ 31 | tests_dataset = load_dataset("open-r1/ioi-test-cases", name=f"{year}", split="train") 32 | test_cases = defaultdict(dict) 33 | for test_case in tests_dataset: 34 | test_cases[test_case["problem_id"]][test_case["test_name"]] = test_case["test_input"], test_case["test_output"] 35 | return test_cases 36 | 37 | 38 | def load_ioi_tests(year: int, problem_id: str) -> dict[str, tuple[str, str]]: 39 | """ 40 | Load IOI tests for a given year and problem id. 41 | """ 42 | return load_ioi_tests_for_year(year)[problem_id] 43 | 44 | 45 | def batched(iterable, n): 46 | "Batch data into lists of length n. The last batch may be shorter." 
47 | # batched('ABCDEFG', 3) --> ABC DEF G 48 | if n < 1: 49 | return iterable 50 | it = iter(iterable) 51 | while batch := list(islice(it, n)): 52 | yield batch 53 | -------------------------------------------------------------------------------- /open-r1/src/open_r1/utils/model_utils.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer, PreTrainedTokenizer 2 | 3 | from trl import ModelConfig 4 | 5 | from ..configs import GRPOConfig, SFTConfig 6 | 7 | 8 | DEFAULT_CHAT_TEMPLATE = "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}" 9 | 10 | 11 | def get_tokenizer( 12 | model_args: ModelConfig, training_args: SFTConfig | GRPOConfig, auto_set_chat_template: bool = True 13 | ) -> PreTrainedTokenizer: 14 | """Get the tokenizer for the model.""" 15 | tokenizer = AutoTokenizer.from_pretrained( 16 | model_args.model_name_or_path, 17 | revision=model_args.model_revision, 18 | trust_remote_code=model_args.trust_remote_code, 19 | ) 20 | 21 | if training_args.chat_template is not None: 22 | tokenizer.chat_template = training_args.chat_template 23 | elif auto_set_chat_template and tokenizer.get_chat_template() is None: 24 | tokenizer.chat_template = DEFAULT_CHAT_TEMPLATE 25 | 26 | return tokenizer 27 | -------------------------------------------------------------------------------- /open-r1/src/open_r1/utils/wandb_logging.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | def init_wandb_training(training_args): 5 | """ 6 | Helper function for setting up Weights & Biases logging tools. 7 | """ 8 | if training_args.wandb_entity is not None: 9 | os.environ["WANDB_ENTITY"] = training_args.wandb_entity 10 | if training_args.wandb_project is not None: 11 | os.environ["WANDB_PROJECT"] = training_args.wandb_project 12 | -------------------------------------------------------------------------------- /open-r1/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AMAP-ML/GPG/e096da8ce527a1636272469cee383ba71ec4a511/open-r1/tests/__init__.py -------------------------------------------------------------------------------- /open-r1/tests/transformer_ds_qwen_15B_R1.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_params: 3 | model_args: "pretrained=models/DeepSeek-R1-Distill-Qwen-1.5B" # pretrained=model_name,trust_remote_code=boolean,revision=revision_to_use,model_parallel=True ... 4 | dtype: "bfloat16" 5 | compile: false 6 | merged_weights: # Ignore this section if you are not using PEFT models 7 | delta_weights: false # set to True of your model should be merged with a base model, also need to provide the base model name 8 | adapter_weights: false # set to True of your model has been trained with peft, also need to provide the base model name 9 | base_model: null # path to the base_model 10 | generation: 11 | # multichoice_continuations_start_space: null # If true/false, will force multiple choice continuations to start/not start with a space. 
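`get_tokenizer` above only installs the fallback `DEFAULT_CHAT_TEMPLATE` when neither the model nor the recipe supplies one; rendering then goes through the standard `apply_chat_template` call. A small usage sketch (the local model path mirrors the recipes elsewhere in this tree, the messages are illustrative):

```python
from transformers import AutoTokenizer

# Sketch: render a prompt with whatever chat template ends up installed
# (the model's own, the recipe's `chat_template`, or the fallback above).
tokenizer = AutoTokenizer.from_pretrained("models/Qwen2.5-Math-7B")
messages = [
    {"role": "system", "content": "You are a helpful AI Assistant..."},
    {"role": "user", "content": "What is 2 + 2?"},
]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)
```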
If none, will do nothing 12 | max_new_tokens: 4096 #32768 we use a small to control the infer speed. 13 | temperature: 0.6 14 | top_p: 0.95 15 | -------------------------------------------------------------------------------- /open-r1/tests/transformer_ds_qwen_15B_R1_retrain.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_params: 3 | model_args: "your_path/Qwen2.5-1.5B-Open-R1-GPG" # in fact this GRPO from scratch. 4 | dtype: "bfloat16" 5 | compile: false 6 | merged_weights: # Ignore this section if you are not using PEFT models 7 | delta_weights: false # set to True of your model should be merged with a base model, also need to provide the base model name 8 | adapter_weights: false # set to True of your model has been trained with peft, also need to provide the base model name 9 | base_model: null # path to the base_model 10 | generation: 11 | # multichoice_continuations_start_space: null # If true/false, will force multiple choice continuations to start/not start with a space. If none, will do nothing 12 | max_new_tokens: 4096 #32768 we use a small to control the infer speed. 13 | temperature: 0.6 14 | top_p: 0.95 15 | -------------------------------------------------------------------------------- /open-r1/train.sh: -------------------------------------------------------------------------------- 1 | export PYTHONPATH=src 2 | 3 | accelerate launch --config_file recipes/accelerate_configs/zero2.yaml \ 4 | --num_machines $WORLD_SIZE --machine_rank $RANK --num_processes=$GPUS --main_process_ip $MASTER_ADDR --main_process_port $MASTER_PORT \ 5 | src/open_r1/gpg.py --config recipes/Qwen2.5-Math-7B/grpo/config_simple_rl_math_l35_v1.yaml --output_dir Your_Path \ 6 | --save_strategy "epoch" --save_total_limit 5 --num_train_epochs 5 --gradient_accumulation_steps 4 --max_completion_length 2048 --max_prompt_length 768 \ 7 | --scale_rewards False --adjust_gd --min_inverse_alpha 0.5 --eval_strategy epoch \ -------------------------------------------------------------------------------- /open-rs/recipes/accelerate_configs/ddp.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | distributed_type: MULTI_GPU 4 | downcast_bf16: 'no' 5 | gpu_ids: all 6 | machine_rank: 0 7 | main_training_function: main 8 | mixed_precision: bf16 9 | num_machines: 1 10 | num_processes: 8 11 | rdzv_backend: static 12 | same_network: true 13 | tpu_env: [] 14 | tpu_use_cluster: false 15 | tpu_use_sudo: false 16 | use_cpu: false 17 | -------------------------------------------------------------------------------- /open-rs/recipes/accelerate_configs/fsdp.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | distributed_type: FSDP 4 | downcast_bf16: 'no' 5 | enable_cpu_affinity: false 6 | fsdp_config: 7 | fsdp_activation_checkpointing: false # Need fix from: https://github.com/huggingface/transformers/pull/36610 8 | fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP 9 | fsdp_backward_prefetch: BACKWARD_PRE 10 | fsdp_cpu_ram_efficient_loading: true 11 | fsdp_forward_prefetch: true 12 | fsdp_offload_params: false 13 | fsdp_sharding_strategy: FULL_SHARD 14 | fsdp_state_dict_type: FULL_STATE_DICT 15 | fsdp_sync_module_states: true 16 | fsdp_use_orig_params: true 17 | machine_rank: 0 18 | main_training_function: main 19 | mixed_precision: bf16 20 | num_machines: 1 21 | num_processes: 8 22 | 
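`train.sh` above layers CLI overrides (`--output_dir`, `--scale_rewards False`, `--adjust_gd`, `--min_inverse_alpha 0.5`, ...) on top of the YAML recipe passed via `--config`. A sketch of how that merge is typically done with TRL's parser; the dataclass choice is an assumption, and the repo's own `gpg.py` may wire this up differently:

```python
# Sketch of YAML-recipe + CLI-override parsing as invoked by train.sh
# (`... gpg.py --config recipe.yaml --output_dir ... --scale_rewards False ...`).
# The dataclasses used here are illustrative, not necessarily the repo's.
from trl import GRPOConfig, ModelConfig, TrlParser

parser = TrlParser((GRPOConfig, ModelConfig))
training_args, model_args = parser.parse_args_and_config()
print(training_args.output_dir, training_args.scale_rewards)
```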
rdzv_backend: static 23 | same_network: true 24 | tpu_env: [] 25 | tpu_use_cluster: false 26 | tpu_use_sudo: false 27 | use_cpu: false -------------------------------------------------------------------------------- /open-rs/recipes/accelerate_configs/zero2.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | deepspeed_multinode_launcher: standard 5 | offload_optimizer_device: none 6 | offload_param_device: none 7 | zero3_init_flag: false 8 | zero_stage: 2 9 | distributed_type: DEEPSPEED 10 | downcast_bf16: 'no' 11 | machine_rank: 0 12 | main_training_function: main 13 | mixed_precision: bf16 14 | num_machines: 1 15 | num_processes: 8 16 | rdzv_backend: static 17 | same_network: true 18 | tpu_env: [] 19 | tpu_use_cluster: false 20 | tpu_use_sudo: false 21 | use_cpu: false -------------------------------------------------------------------------------- /open-rs/recipes/accelerate_configs/zero3.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | deepspeed_multinode_launcher: standard 5 | offload_optimizer_device: none 6 | offload_param_device: none 7 | zero3_init_flag: true 8 | zero3_save_16bit_model: true 9 | zero_stage: 3 10 | distributed_type: DEEPSPEED 11 | downcast_bf16: 'no' 12 | machine_rank: 0 13 | main_training_function: main 14 | mixed_precision: bf16 15 | num_machines: 1 16 | num_processes: 8 17 | rdzv_backend: static 18 | same_network: true 19 | tpu_env: [] 20 | tpu_use_cluster: false 21 | tpu_use_sudo: false 22 | use_cpu: false 23 | -------------------------------------------------------------------------------- /open-rs/recipes/data_cleaner.yaml: -------------------------------------------------------------------------------- 1 | model_kwargs: 2 | model: Qwen/Qwen2.5-Math-7B-Instruct 3 | trust_remote_code: true 4 | max_model_len: 4096 5 | gpu_memory_utilization: 0.9 6 | enforce_eager: true 7 | tensor_parallel_size: 4 8 | 9 | sampling_params: 10 | temperature: 0.7 11 | top_p: 0.9 12 | max_tokens: 4096 13 | -------------------------------------------------------------------------------- /open-rs/recipes/gpg.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: models/DeepSeek-R1-Distill-Qwen-1.5B 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: datas/open-rs/open-s1 9 | system_prompt: "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer, and put your final answer within \\boxed{{}} . The reasoning process and answer are enclosed within and tags, respectively, i.e., reasoning process here answer here . Note that respond by English, NOT use other languages." 
10 | 11 | # GRPO trainer config 12 | bf16: true 13 | use_vllm: false 14 | vllm_device: auto 15 | vllm_enforce_eager: true 16 | vllm_gpu_memory_utilization: 0.7 17 | vllm_max_model_len: 4608 18 | do_eval: false 19 | gradient_accumulation_steps: 4 20 | gradient_checkpointing: true 21 | gradient_checkpointing_kwargs: 22 | use_reentrant: false 23 | hub_model_id: OpenRS-GRPO 24 | hub_strategy: every_save 25 | learning_rate: 1.0e-06 26 | log_completions: true 27 | log_level: info 28 | logging_first_step: true 29 | logging_steps: 1 30 | logging_strategy: steps 31 | lr_scheduler_type: cosine_with_min_lr 32 | lr_scheduler_kwargs: 33 | min_lr_rate: 0.1 34 | max_prompt_length: 512 35 | max_completion_length: 3584 36 | max_steps: 100 37 | num_generations: 6 38 | num_train_epochs: 1 39 | output_dir: data/OpenRS-GPG-RS1 40 | overwrite_output_dir: true 41 | per_device_eval_batch_size: 6 42 | per_device_train_batch_size: 6 43 | push_to_hub: false 44 | report_to: 45 | - tensorboard 46 | reward_funcs: 47 | - format 48 | - cosine 49 | reward_weights: 50 | - 1.0 51 | - 2.0 52 | save_strategy: "steps" 53 | save_steps: 50 54 | seed: 42 55 | temperature: 0.7 56 | warmup_ratio: 0.1 -------------------------------------------------------------------------------- /open-rs/recipes/gpg_7B.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: models/DeepSeek-R1-Distill-Qwen-7B 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: datas/open-rs/open-s1 9 | system_prompt: "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer, and put your final answer within \\boxed{{}} . The reasoning process and answer are enclosed within and tags, respectively, i.e., reasoning process here answer here . Note that respond by English, NOT use other languages." 
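The `cosine` reward used by these open-rs recipes scales the correctness signal by completion length: short correct answers score highest and short wrong answers are penalised hardest, interpolated with a cosine schedule. A minimal sketch; the bounds and maximum length are assumptions for illustration, not the repo's exact defaults:

```python
import math

# Minimal sketch of a length-aware "cosine" reward. Bounds and max_len are
# assumed values, not the repo's defaults.
def cosine_reward(is_correct, gen_len, max_len=3584,
                  min_correct=0.5, max_correct=1.0,
                  min_wrong=-0.5, max_wrong=-0.1):
    progress = min(gen_len / max_len, 1.0)
    cos = math.cos(progress * math.pi)  # 1 at len=0, -1 at len=max_len
    if is_correct:
        lo, hi = min_correct, max_correct  # short correct -> max_correct
    else:
        lo, hi = max_wrong, min_wrong      # short wrong -> min_wrong (harshest)
    return lo + 0.5 * (hi - lo) * (1.0 + cos)

print(cosine_reward(True, 200), cosine_reward(True, 3500))   # correct: short > long
print(cosine_reward(False, 200), cosine_reward(False, 3500)) # wrong: short penalised more
```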
10 | 11 | # GRPO trainer config 12 | bf16: true 13 | use_vllm: false 14 | vllm_device: auto 15 | vllm_enforce_eager: true 16 | vllm_gpu_memory_utilization: 0.7 17 | vllm_max_model_len: 4608 18 | do_eval: false 19 | gradient_accumulation_steps: 4 20 | gradient_checkpointing: true 21 | gradient_checkpointing_kwargs: 22 | use_reentrant: false 23 | hub_model_id: OpenRS-GRPO 24 | hub_strategy: every_save 25 | learning_rate: 1.0e-06 26 | log_completions: true 27 | log_level: info 28 | logging_first_step: true 29 | logging_steps: 1 30 | logging_strategy: steps 31 | lr_scheduler_type: cosine_with_min_lr 32 | lr_scheduler_kwargs: 33 | min_lr_rate: 0.1 34 | max_prompt_length: 512 35 | max_completion_length: 3584 36 | max_steps: 500 37 | num_generations: 6 38 | num_train_epochs: 1 39 | output_dir: data/OpenRS-GPG-7B 40 | overwrite_output_dir: true 41 | per_device_eval_batch_size: 6 42 | per_device_train_batch_size: 6 43 | push_to_hub: false 44 | report_to: 45 | - tensorboard 46 | reward_funcs: 47 | - format 48 | - cosine 49 | reward_weights: 50 | - 1.0 51 | - 2.0 52 | save_strategy: "steps" 53 | save_steps: 50 54 | seed: 42 55 | temperature: 0.7 56 | warmup_ratio: 0.1 -------------------------------------------------------------------------------- /open-rs/recipes/gpg_std.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: models/DeepSeek-R1-Distill-Qwen-1.5B 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: datas/open-rs/open-s1 9 | system_prompt: "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer, and put your final answer within \\boxed{{}} . The reasoning process and answer are enclosed within and tags, respectively, i.e., reasoning process here answer here . Note that respond by English, NOT use other languages." 
10 | 11 | # GRPO trainer config 12 | bf16: true 13 | use_vllm: false 14 | vllm_device: auto 15 | vllm_enforce_eager: true 16 | vllm_gpu_memory_utilization: 0.7 17 | vllm_max_model_len: 4608 18 | do_eval: false 19 | gradient_accumulation_steps: 4 20 | gradient_checkpointing: true 21 | gradient_checkpointing_kwargs: 22 | use_reentrant: false 23 | hub_model_id: OpenRS-GRPO 24 | hub_strategy: every_save 25 | learning_rate: 1.0e-06 26 | log_completions: true 27 | log_level: info 28 | logging_first_step: true 29 | logging_steps: 1 30 | logging_strategy: steps 31 | lr_scheduler_type: cosine_with_min_lr 32 | lr_scheduler_kwargs: 33 | min_lr_rate: 0.1 34 | max_prompt_length: 512 35 | max_completion_length: 3584 36 | max_steps: 500 37 | num_generations: 6 38 | num_train_epochs: 1 39 | output_dir: data/OpenRS-GPG-std-new 40 | overwrite_output_dir: true 41 | per_device_eval_batch_size: 6 42 | per_device_train_batch_size: 6 43 | push_to_hub: false 44 | report_to: 45 | - tensorboard 46 | reward_funcs: 47 | - format 48 | - cosine 49 | reward_weights: 50 | - 1.0 51 | - 2.0 52 | save_strategy: "steps" 53 | save_steps: 50 54 | seed: 42 55 | temperature: 0.7 56 | warmup_ratio: 0.1 57 | scale_rewards: false -------------------------------------------------------------------------------- /open-rs/recipes/grpo.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | # model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B 3 | model_name_or_path: models/DeepSeek-R1-Distill-Qwen-1.5B 4 | model_revision: main 5 | torch_dtype: bfloat16 6 | attn_implementation: flash_attention_2 7 | 8 | # Data training arguments 9 | dataset_name: datas/open-rs/open-s1 10 | system_prompt: "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer, and put your final answer within \\boxed{{}} . The reasoning process and answer are enclosed within and tags, respectively, i.e., reasoning process here answer here . Note that respond by English, NOT use other languages." 
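These open-rs recipes keep `max_prompt_length + max_completion_length` within `vllm_max_model_len`, presumably leaving headroom for the system prompt and chat-template tokens. A quick consistency check of that budget:

```python
# Quick consistency check of the sequence-length budget shared by these recipes.
max_prompt_length, max_completion_length, vllm_max_model_len = 512, 3584, 4608
assert max_prompt_length + max_completion_length <= vllm_max_model_len
print(vllm_max_model_len - (max_prompt_length + max_completion_length))  # 512 tokens of headroom
```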
11 | 12 | # GRPO trainer config 13 | bf16: true 14 | use_vllm: false 15 | vllm_device: auto 16 | vllm_enforce_eager: true 17 | vllm_gpu_memory_utilization: 0.7 18 | vllm_max_model_len: 4608 19 | do_eval: false 20 | gradient_accumulation_steps: 4 21 | gradient_checkpointing: true 22 | gradient_checkpointing_kwargs: 23 | use_reentrant: false 24 | hub_model_id: OpenRS-GRPO 25 | hub_strategy: every_save 26 | learning_rate: 1.0e-06 27 | log_completions: true 28 | log_level: info 29 | logging_first_step: true 30 | logging_steps: 1 31 | logging_strategy: steps 32 | lr_scheduler_type: cosine_with_min_lr 33 | lr_scheduler_kwargs: 34 | min_lr_rate: 0.1 35 | max_prompt_length: 512 36 | max_completion_length: 3584 37 | max_steps: 500 38 | num_generations: 6 39 | num_train_epochs: 1 40 | output_dir: data/OpenRS-GRPO 41 | overwrite_output_dir: true 42 | per_device_eval_batch_size: 6 43 | per_device_train_batch_size: 6 44 | push_to_hub: false 45 | report_to: 46 | - tensorboard 47 | reward_funcs: 48 | - format 49 | - cosine 50 | reward_weights: 51 | - 1.0 52 | - 2.0 53 | save_strategy: "steps" 54 | save_steps: 50 55 | seed: 42 56 | temperature: 0.7 57 | warmup_ratio: 0.1 -------------------------------------------------------------------------------- /open-rs/recipes/grpo_7B.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: models/DeepSeek-R1-Distill-Qwen-7B 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: datas/open-rs/open-s1 9 | system_prompt: "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer, and put your final answer within \\boxed{{}} . The reasoning process and answer are enclosed within and tags, respectively, i.e., reasoning process here answer here . Note that respond by English, NOT use other languages." 
# GRPO trainer config
bf16: true
use_vllm: false
vllm_device: auto
vllm_enforce_eager: true
vllm_gpu_memory_utilization: 0.7
vllm_max_model_len: 4608
do_eval: false
gradient_accumulation_steps: 4
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
hub_model_id: OpenRS-GRPO
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: cosine_with_min_lr
lr_scheduler_kwargs:
  min_lr_rate: 0.1
max_prompt_length: 512
max_completion_length: 3584
max_steps: 500
num_generations: 6
num_train_epochs: 1
output_dir: data/OpenRS-GRPO-7B
overwrite_output_dir: true
per_device_eval_batch_size: 6
per_device_train_batch_size: 6
push_to_hub: false
report_to:
- tensorboard
reward_funcs:
- format
- cosine
reward_weights:
- 1.0
- 2.0
save_strategy: "steps"
save_steps: 50
seed: 42
temperature: 0.7
warmup_ratio: 0.1
--------------------------------------------------------------------------------
/open-rs/recipes/grpo_ng.yaml:
--------------------------------------------------------------------------------
# Model arguments
model_name_or_path: models/DeepSeek-R1-Distill-Qwen-1.5B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2

# Data training arguments
dataset_name: datas/open-rs/open-s1
system_prompt: "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer, and put your final answer within \\boxed{{}}. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think> <answer> answer here </answer>. Note that respond by English, NOT use other languages."
# GRPO trainer config
bf16: true
use_vllm: false
vllm_device: auto
vllm_enforce_eager: true
vllm_gpu_memory_utilization: 0.7
vllm_max_model_len: 4608
do_eval: false
gradient_accumulation_steps: 4
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
hub_model_id: OpenRS-GRPO
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: cosine_with_min_lr
lr_scheduler_kwargs:
  min_lr_rate: 0.1
max_prompt_length: 512
max_completion_length: 3584
max_steps: 100
num_generations: 2
num_train_epochs: 1
output_dir: data/OpenRS-GRPO-rs1-ng2
overwrite_output_dir: true
per_device_eval_batch_size: 18
per_device_train_batch_size: 18
push_to_hub: false
report_to:
- tensorboard
reward_funcs:
- format
- cosine
reward_weights:
- 1.0
- 2.0
save_strategy: "steps"
save_steps: 50
seed: 42
temperature: 0.7
warmup_ratio: 0.1
--------------------------------------------------------------------------------
/open-rs/recipes/grpo_wo_vllm.yaml:
--------------------------------------------------------------------------------
# Model arguments
model_name_or_path: models/DeepSeek-R1-Distill-Qwen-1.5B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2

# Data training arguments
dataset_name: datas/open-rs/open-s1
system_prompt: "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer, and put your final answer within \\boxed{{}}. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think> <answer> answer here </answer>. Note that respond by English, NOT use other languages."
# GRPO trainer config
bf16: true
use_vllm: false
vllm_device: auto
vllm_enforce_eager: true
vllm_gpu_memory_utilization: 0.7
vllm_max_model_len: 4608
do_eval: false
gradient_accumulation_steps: 4
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
hub_model_id: OpenRS-GRPO
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: cosine_with_min_lr
lr_scheduler_kwargs:
  min_lr_rate: 0.1
max_prompt_length: 512
max_completion_length: 3584
max_steps: 100
num_generations: 6
num_train_epochs: 1
output_dir: data/OpenRS-GRPO-rs3
overwrite_output_dir: true
per_device_eval_batch_size: 6
per_device_train_batch_size: 6
push_to_hub: false
report_to:
- tensorboard
reward_funcs:
- format
- cosine
reward_weights:
- 1.0
- 2.0
save_strategy: "steps"
save_steps: 50
seed: 42
temperature: 0.7
warmup_ratio: 0.1
--------------------------------------------------------------------------------
/open-rs/setup.cfg:
--------------------------------------------------------------------------------
[isort]
default_section = FIRSTPARTY
ensure_newline_before_comments = True
force_grid_wrap = 0
include_trailing_comma = True
known_first_party = open_r1
known_third_party =
    transformers
    datasets
    fugashi
    git
    h5py
    matplotlib
    nltk
    numpy
    packaging
    pandas
    psutil
    pytest
    rouge_score
    sacrebleu
    seqeval
    sklearn
    streamlit
    torch
    tqdm

line_length = 119
lines_after_imports = 2
multi_line_output = 3
use_parentheses = True

[flake8]
ignore = E203, E501, E741, W503, W605
max-line-length = 119
per-file-ignores =
    # imported but unused
    __init__.py: F401

[tool:pytest]
doctest_optionflags=NUMBER NORMALIZE_WHITESPACE ELLIPSIS
--------------------------------------------------------------------------------
/open-rs/src/open_r1/__init__.py:
--------------------------------------------------------------------------------
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
--------------------------------------------------------------------------------
/open-rs/src/open_r1/trainer/__init__.py:
--------------------------------------------------------------------------------
from .gpg_trainer import GPGTrainer
from .gpg_std_trainer import GPGSTDTrainer

__all__ = ["GPGTrainer", "GPGSTDTrainer"]
--------------------------------------------------------------------------------
/open-rs/src/open_r1/utils/__init__.py:
--------------------------------------------------------------------------------
from .import_utils import is_e2b_available
from .model_utils import get_tokenizer


__all__ = ["get_tokenizer", "is_e2b_available"]
--------------------------------------------------------------------------------
/open-rs/src/open_r1/utils/import_utils.py:
--------------------------------------------------------------------------------
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from transformers.utils.import_utils import _is_package_available


# Use same as transformers.utils.import_utils
_e2b_available = _is_package_available("e2b")


def is_e2b_available() -> bool:
    return _e2b_available
--------------------------------------------------------------------------------
/open-rs/src/open_r1/utils/model_utils.py:
--------------------------------------------------------------------------------
from transformers import AutoTokenizer, PreTrainedTokenizer

from trl import ModelConfig

from ..configs import GRPOConfig, SFTConfig


DEFAULT_CHAT_TEMPLATE = "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"


def get_tokenizer(
    model_args: ModelConfig, training_args: SFTConfig | GRPOConfig, auto_set_chat_template: bool = True
) -> PreTrainedTokenizer:
    """Get the tokenizer for the model."""
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.model_name_or_path,
        revision=model_args.model_revision,
        trust_remote_code=model_args.trust_remote_code,
    )

    if training_args.chat_template is not None:
        tokenizer.chat_template = training_args.chat_template
    elif auto_set_chat_template and tokenizer.get_chat_template() is None:
        tokenizer.chat_template = DEFAULT_CHAT_TEMPLATE

    return tokenizer
--------------------------------------------------------------------------------
/open-rs/src/open_r1/utils/wandb_logging.py:
--------------------------------------------------------------------------------
import os


def init_wandb_training(training_args):
    """
    Helper function for setting up Weights & Biases logging tools.
    """
    if training_args.wandb_entity is not None:
        os.environ["WANDB_ENTITY"] = training_args.wandb_entity
    if training_args.wandb_project is not None:
        os.environ["WANDB_PROJECT"] = training_args.wandb_project
--------------------------------------------------------------------------------
/open-rs/train.sh:
--------------------------------------------------------------------------------
ACCELERATE_LOG_LEVEL=info accelerate launch \
    --config_file recipes/accelerate_configs/zero2.yaml \
    --num_processes=16 \
    src/open_r1/gpg.py \
    --config recipes/gpg.yaml >> open-rs1-gpg.log 2>&1 &
--------------------------------------------------------------------------------
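Note on the recipes above: every config pairs the same two reward functions (`format` and `cosine`) with `reward_weights` of 1.0 and 2.0, and `train.sh` hands one of these YAML files to `src/open_r1/gpg.py` via `--config`. As a rough illustration of how those two fields interact, here is a minimal, self-contained sketch. It is not taken from this repository: `load_reward_config` and `combine_rewards` are hypothetical helper names, and it assumes PyYAML is installed.

```python
# Hypothetical sketch: weighted combination of per-function rewards,
# mirroring the reward_funcs / reward_weights fields in the recipes above.
# Helper names are illustrative only; requires PyYAML (yaml.safe_load).
import yaml


def load_reward_config(path: str) -> tuple[list[str], list[float]]:
    """Read reward_funcs and reward_weights from a recipe YAML file."""
    with open(path) as f:
        cfg = yaml.safe_load(f)
    return cfg["reward_funcs"], cfg["reward_weights"]


def combine_rewards(per_func: dict[str, float], funcs: list[str], weights: list[float]) -> float:
    """Weighted sum of the individual reward signals, taken in recipe order."""
    return sum(w * per_func[name] for name, w in zip(funcs, weights))


if __name__ == "__main__":
    funcs, weights = load_reward_config("recipes/grpo.yaml")
    # Example per-completion scores from the format and cosine rewards.
    print(combine_rewards({"format": 1.0, "cosine": 0.35}, funcs, weights))  # 1.0*1.0 + 2.0*0.35 = 1.7
```

With the weights used in these recipes, the cosine reward contributes twice as strongly to the scalar training reward as the format reward.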