├── R1-V ├── .gitignore ├── README.md ├── images │ ├── counting_star.png │ ├── curves.png │ ├── grpo_trainer_log.jpg │ ├── ood.png │ ├── super_ood.png │ ├── train-test.png │ ├── training.png │ └── vllm_grpo_trainer_modified_log.jpg ├── requirements.txt ├── setup.sh ├── src │ ├── distill_r1 │ │ ├── README.md │ │ ├── create_hf_dataset.py │ │ ├── filter_r1.py │ │ ├── generate_scene_qa_pairs.ipynb │ │ ├── grpo_r1_distilled.jpg │ │ ├── prompt.py │ │ └── query_r1.py │ ├── eval │ │ ├── logs │ │ │ ├── counting_results_superclevr_200_qwen2vl_2b_instruct_grpo100_legacy.json │ │ │ ├── counting_results_superclevr_200_qwen2vl_2b_instruct_legacy.json │ │ │ ├── geoqa_test_qwen2vl_7b_grpo_2epochs_legacy.json │ │ │ └── geoqa_test_qwen2vl_7b_instruct_legacy.json │ │ ├── prompts │ │ │ ├── geoqa_test_prompts.jsonl │ │ │ └── superclevr_test200_counting_problems.jsonl │ │ ├── test_qwen2vl_counting_superclevr.py │ │ ├── test_qwen2vl_geoqa.py │ │ └── test_qwen2vl_geoqa_multigpu.py │ ├── r1-v │ │ ├── .gitignore │ │ ├── LICENSE │ │ ├── Makefile │ │ ├── configs │ │ │ ├── ddp.yaml │ │ │ ├── qwen2vl_sft_config.yaml │ │ │ ├── zero2.yaml │ │ │ └── zero3.yaml │ │ ├── local_scripts │ │ │ ├── create_vision_cot_data.py │ │ │ ├── lmms_eval_qwen2vl.sh │ │ │ ├── prepare_hf_data.py │ │ │ ├── train_aria_moe.sh │ │ │ ├── train_qwen2_vl.sh │ │ │ ├── zero1_no_optimizer.json │ │ │ ├── zero2.json │ │ │ ├── zero3.json │ │ │ ├── zero3.yaml │ │ │ └── zero3_offload.json │ │ ├── run_grpo.sh │ │ ├── setup.cfg │ │ ├── setup.py │ │ ├── src │ │ │ └── open_r1 │ │ │ │ ├── __init__.py │ │ │ │ ├── evaluate.py │ │ │ │ ├── generate.py │ │ │ │ ├── grpo.py │ │ │ │ ├── sft.py │ │ │ │ └── trainer │ │ │ │ ├── __init__.py │ │ │ │ ├── grpo_trainer.py │ │ │ │ ├── vllm_grpo_trainer.py │ │ │ │ └── vllm_grpo_trainer_modified.py │ │ └── temp_image.png │ ├── requirements.txt │ └── scripts │ │ └── run_grpo_GEOQA_qwen2.5_3b.sh └── test.sh ├── README.md ├── Visual-RFT ├── .gitignore ├── LICENSE ├── README.md ├── assets │ ├── case_cls.png │ ├── case_lisa.png │ ├── framework.png │ ├── pokeymon.jpg │ ├── radar.png │ └── teaser.png ├── classification │ ├── Qwen2_VL_classification_infere.py │ └── val_data │ │ ├── fgvc_aircraft.pth │ │ ├── fgvc_aircraft.txt │ │ ├── oxford_flowers.pth │ │ ├── oxford_flowers.txt │ │ ├── pets.pth │ │ ├── pets.txt │ │ ├── stanford_cars.pth │ │ └── stanford_cars.txt ├── coco_evaluation │ ├── Qwen2_VL_coco_infere.py │ ├── coco_evaluation.py │ ├── evaluation.ipynb │ ├── exist_map_coco_Qwen2_vl_2B_baseline.json │ └── exist_map_coco_Qwen2_vl_7B_baseline.json ├── dataset │ ├── README.md │ └── build_dataset.ipynb ├── demo │ ├── README.md │ └── lisa_demo.ipynb ├── lisa_evaluation │ ├── Qwen2_VL_lisa_infere.py │ ├── Qwen2_VL_lisa_infere.sh │ ├── README.md │ ├── box2mask.py │ ├── evaluation.ipynb │ ├── gen_box_ann.py │ ├── gen_sft.py │ ├── mask_iou.py │ └── merge_eval.py ├── lvis_evaluation │ ├── Qwen2_VL_lvis_infere.py │ ├── exist_map_lvis_Qwen2_vl_2B_baseline.json │ ├── exist_map_lvis_Qwen2_vl_7B_baseline.json │ └── lvis_evaluation.ipynb ├── q&a.md ├── requirements.txt ├── setup.sh ├── src │ ├── scripts │ │ ├── 2B_aircraft_4_shot.sh │ │ ├── 2B_car196_4_shot.sh │ │ ├── 2B_flower_4_shot.sh │ │ ├── 2B_lisa_grounding.sh │ │ └── 2B_pets37_4_shot.sh │ └── virft │ │ ├── .gitignore │ │ ├── LICENSE │ │ ├── Makefile │ │ ├── README.md │ │ ├── configs │ │ ├── ddp.yaml │ │ ├── zero2.yaml │ │ └── zero3.yaml │ │ ├── local_scripts │ │ ├── create_vision_cot_data.py │ │ ├── lmms_eval_qwen2vl.sh │ │ ├── prepare_hf_data.py │ │ ├── train_aria_moe.sh │ │ ├── 
train_qwen2_vl.sh │ │ ├── zero2.json │ │ ├── zero3.json │ │ ├── zero3.yaml │ │ └── zero3_offload.json │ │ ├── setup.cfg │ │ ├── setup.py │ │ ├── slurm │ │ ├── evaluate.slurm │ │ ├── generate.slurm │ │ └── sft.slurm │ │ └── src │ │ └── open_r1 │ │ ├── __init__.py │ │ ├── evaluate.py │ │ ├── generate.py │ │ ├── grpo.py │ │ ├── grpo_classification.py │ │ ├── grpo_lisa.py │ │ ├── sft.py │ │ └── trainer │ │ ├── __init__.py │ │ ├── grpo_trainer.py │ │ └── vllm_grpo_trainer.py └── test.sh ├── VisualThinker-R1-Zero ├── .gitignore ├── README.md ├── requirements.txt ├── setup.sh ├── src │ ├── data │ │ └── SAT │ │ │ ├── prepare_dataset.sh │ │ │ └── process_dataset.py │ ├── eval │ │ ├── evaluate_Qwen2_VL_CVBench-base.py │ │ └── evaluate_Qwen2_VL_CVBench.py │ └── open-r1-multimodal │ │ ├── LICENSE │ │ ├── Makefile │ │ ├── README.md │ │ ├── configs │ │ ├── ddp.yaml │ │ ├── zero2.yaml │ │ └── zero3.yaml │ │ ├── prepare_2B_base.sh │ │ ├── run_grpo.sh │ │ ├── run_grpo_SAT.sh │ │ ├── run_sft.sh │ │ ├── run_sft_SAT.sh │ │ ├── setup.cfg │ │ ├── setup.py │ │ ├── src │ │ └── open_r1 │ │ │ ├── __init__.py │ │ │ ├── evaluate.py │ │ │ ├── generate.py │ │ │ ├── grpo.py │ │ │ ├── sft.py │ │ │ └── trainer │ │ │ ├── InternVL2.py │ │ │ ├── __init__.py │ │ │ └── grpo_trainer.py │ │ └── test.py └── test.sh ├── docs └── images │ ├── GPG.png │ └── figure0.svg ├── open-r1 ├── .github │ ├── dependabot.yml │ └── workflows │ │ └── tests.yml ├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── assets │ └── plan-of-attack.png ├── recipes │ ├── DeepSeek-R1-Distill-Qwen-1.5B │ │ └── grpo │ │ │ ├── config_demo.yaml │ │ │ ├── config_demo_v1.yaml │ │ │ └── config_demo_v2.yaml │ ├── Mistral-Small-24B-Instruct-2501 │ │ └── sft │ │ │ └── config_openr1_math.yaml │ ├── OlympicCoder-32B │ │ └── sft │ │ │ └── config_v00.00.yaml │ ├── OlympicCoder-7B │ │ └── sft │ │ │ └── config_v00.00.yaml │ ├── OpenR1-Qwen-7B │ │ ├── gpg │ │ │ ├── config_v0.yaml │ │ │ ├── config_v0_ds.yaml │ │ │ ├── config_v1.yaml │ │ │ ├── config_v1_ds.yaml │ │ │ └── config_v2_ds.yaml │ │ └── sft │ │ │ ├── config.yaml │ │ │ ├── config_v0.yaml │ │ │ ├── config_v1.yaml │ │ │ └── config_v2.yaml │ ├── Qwen2.5-1.5B-Instruct │ │ ├── gpg │ │ │ ├── config_demo_v1.yaml │ │ │ ├── config_demo_v2.yaml │ │ │ ├── config_demo_v3.yaml │ │ │ ├── config_demo_v4.yaml │ │ │ ├── config_demo_v5.yaml │ │ │ ├── config_demo_woSTD.yaml │ │ │ ├── config_v0.yaml │ │ │ ├── config_v0_nostd.yaml │ │ │ └── config_v0_open22k.yaml │ │ ├── grpo │ │ │ ├── config_demo.yaml │ │ │ ├── config_demo_code.yaml │ │ │ ├── config_demo_code_ioi.yaml │ │ │ ├── config_demo_v1.yaml │ │ │ └── config_demo_v3.yaml │ │ └── sft │ │ │ ├── config_demo.yaml │ │ │ ├── config_demo_v1.yaml │ │ │ └── config_v1.yaml │ ├── Qwen2.5-7B-Instruct │ │ └── grpo │ │ │ ├── config_demo.yaml │ │ │ └── config_demo_v1.yaml │ ├── Qwen2.5-Math-7B │ │ └── grpo │ │ │ ├── config_simple_rl.yaml │ │ │ ├── config_simple_rl_dgrpo.yaml │ │ │ ├── config_simple_rl_gpg.yaml │ │ │ ├── config_simple_rl_gpg_3k.yaml │ │ │ ├── config_simple_rl_gpg_3k_2nodes.yaml │ │ │ ├── config_simple_rl_gpg_n16.yaml │ │ │ ├── config_simple_rl_gpg_n16_wostd.yaml │ │ │ ├── config_simple_rl_gpg_n2.yaml │ │ │ ├── config_simple_rl_gpg_n2_wostd.yaml │ │ │ ├── config_simple_rl_gpg_n4.yaml │ │ │ ├── config_simple_rl_gpg_n4_wostd.yaml │ │ │ ├── config_simple_rl_gpg_n8.yaml │ │ │ ├── config_simple_rl_gpg_scale_batch.yaml │ │ │ ├── config_simple_rl_math_l35.yaml │ │ │ ├── config_simple_rl_math_l35_v1.yaml │ │ │ ├── config_simple_rl_math_l35_v2.yaml │ │ │ ├── 
config_simple_rl_math_l35_v2_g16.yaml │ │ │ ├── config_simple_rl_v1.yaml │ │ │ ├── config_simple_rl_v1_kl.yaml │ │ │ ├── config_simple_rl_v1_nostd.yaml │ │ │ └── config_simple_rl_wokl.yaml │ ├── README.md │ ├── SmolLM2-1.7B-Instruct │ │ └── sft │ │ │ └── config.yaml │ ├── SmolLM2-1.7B │ │ └── sft │ │ │ └── config.yaml │ └── accelerate_configs │ │ ├── ddp.yaml │ │ ├── fsdp.yaml │ │ ├── zero1.yaml │ │ ├── zero2.yaml │ │ └── zero3.yaml ├── scripts │ ├── decontaminate.py │ ├── generate_reasoning.py │ ├── get_tensor_parallel_size.py │ ├── run_benchmarks.py │ └── upload_details.py ├── setup.cfg ├── setup.py ├── slurm │ ├── README.md │ ├── evaluate.slurm │ ├── experimental │ │ └── serve_r1_vllm.slurm │ ├── generate.slurm │ ├── piston │ │ ├── README.md │ │ ├── launch_piston_workers.sh │ │ └── launch_single_piston.sh │ ├── serve_r1.slurm │ ├── serve_router.slurm │ └── train.slurm ├── src │ └── open_r1 │ │ ├── __init__.py │ │ ├── configs.py │ │ ├── evaluate.py │ │ ├── evaluate_short.py │ │ ├── generate.py │ │ ├── gpg.py │ │ ├── gpg_trainer.py │ │ ├── grpo.py │ │ ├── rewards.py │ │ ├── sft.py │ │ ├── test_dataset.py │ │ └── utils │ │ ├── __init__.py │ │ ├── callbacks.py │ │ ├── data_utils.py │ │ ├── evaluation.py │ │ ├── hub.py │ │ ├── import_utils.py │ │ ├── ioi │ │ ├── __init__.py │ │ ├── piston_client.py │ │ ├── scoring.py │ │ └── utils.py │ │ ├── model_utils.py │ │ └── wandb_logging.py ├── tests │ ├── __init__.py │ ├── slow │ │ └── test_code_reward.py │ ├── test_rewards.py │ ├── transformer_ds_qwen_15B_R1.yaml │ └── transformer_ds_qwen_15B_R1_retrain.yaml └── train.sh └── open-rs ├── README.md ├── eval.sh ├── recipes ├── accelerate_configs │ ├── ddp.yaml │ ├── fsdp.yaml │ ├── zero2.yaml │ └── zero3.yaml ├── data_cleaner.yaml ├── gpg.yaml ├── gpg_7B.yaml ├── gpg_std.yaml ├── grpo.yaml ├── grpo_7B.yaml ├── grpo_ng.yaml └── grpo_wo_vllm.yaml ├── setup.cfg ├── setup.py ├── src └── open_r1 │ ├── __init__.py │ ├── configs.py │ ├── evaluate.py │ ├── generate.py │ ├── gpg.py │ ├── gpg_std.py │ ├── grpo.py │ ├── rewards.py │ ├── sft.py │ ├── trainer │ ├── __init__.py │ ├── gpg_std_trainer.py │ └── gpg_trainer.py │ └── utils │ ├── __init__.py │ ├── callbacks.py │ ├── evaluation.py │ ├── hub.py │ ├── import_utils.py │ ├── model_utils.py │ └── wandb_logging.py └── train.sh /R1-V/.gitignore: -------------------------------------------------------------------------------- 1 | output/ 2 | hostfiles/ 3 | internal_scripts/ 4 | data/ 5 | output_onlypg/ 6 | output_grpo/ 7 | Geo170K/ 8 | src/eval/Geo170K/ 9 | src/eval/images/ 10 | src/eval/images.zip 11 | -------------------------------------------------------------------------------- /R1-V/images/counting_star.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AMAP-ML/GPG/e096da8ce527a1636272469cee383ba71ec4a511/R1-V/images/counting_star.png -------------------------------------------------------------------------------- /R1-V/images/curves.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AMAP-ML/GPG/e096da8ce527a1636272469cee383ba71ec4a511/R1-V/images/curves.png -------------------------------------------------------------------------------- /R1-V/images/grpo_trainer_log.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AMAP-ML/GPG/e096da8ce527a1636272469cee383ba71ec4a511/R1-V/images/grpo_trainer_log.jpg 
-------------------------------------------------------------------------------- /R1-V/images/ood.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AMAP-ML/GPG/e096da8ce527a1636272469cee383ba71ec4a511/R1-V/images/ood.png -------------------------------------------------------------------------------- /R1-V/images/super_ood.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AMAP-ML/GPG/e096da8ce527a1636272469cee383ba71ec4a511/R1-V/images/super_ood.png -------------------------------------------------------------------------------- /R1-V/images/train-test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AMAP-ML/GPG/e096da8ce527a1636272469cee383ba71ec4a511/R1-V/images/train-test.png -------------------------------------------------------------------------------- /R1-V/images/training.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AMAP-ML/GPG/e096da8ce527a1636272469cee383ba71ec4a511/R1-V/images/training.png -------------------------------------------------------------------------------- /R1-V/images/vllm_grpo_trainer_modified_log.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AMAP-ML/GPG/e096da8ce527a1636272469cee383ba71ec4a511/R1-V/images/vllm_grpo_trainer_modified_log.jpg -------------------------------------------------------------------------------- /R1-V/requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate>=1.2.1 2 | bitsandbytes>=0.43.0 3 | black>=24.4.2 4 | datasets>=3.2.0 5 | deepspeed==0.15.4 6 | distilabel[vllm,ray,openai]>=1.5.2 7 | einops>=0.8.0 8 | flake8>=6.0.0 9 | hf_transfer>=0.1.4 10 | huggingface-hub[cli]>=0.19.2,<1.0 11 | isort>=5.12.0 12 | liger_kernel==0.5.2 13 | # lighteval @ git+https://githubfast.com/huggingface/lighteval.git@4f381b352c0e467b5870a97d41cb66b487a2c503#egg=lighteval[math] 14 | math-verify 15 | packaging>=23.0 16 | parameterized>=0.9.0 17 | pytest 18 | safetensors>=0.3.3 19 | sentencepiece>=0.1.99 20 | torch>=2.5.1 21 | transformers @ git+https://githubfast.com/huggingface/transformers.git@336dc69d63d56f232a183a3e7f52790429b871ef 22 | trl==0.14.0 23 | vllm==0.6.6.post1 24 | wandb>=0.19.1 25 | pillow -------------------------------------------------------------------------------- /R1-V/setup.sh: -------------------------------------------------------------------------------- 1 | # Install the packages in r1-v . 
2 | cd src/r1-v 3 | pip install -e ".[dev]" 4 | 5 | # Additional modules 6 | pip install wandb==0.18.3 7 | pip install tensorboardx 8 | pip install qwen_vl_utils torchvision 9 | pip install flash-attn --no-build-isolation 10 | 11 | # vLLM support 12 | pip install vllm==0.7.2 13 | 14 | # fix transformers version 15 | pip install git+https://github.com/huggingface/transformers.git@336dc69d63d56f232a183a3e7f52790429b871ef -------------------------------------------------------------------------------- /R1-V/src/distill_r1/grpo_r1_distilled.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AMAP-ML/GPG/e096da8ce527a1636272469cee383ba71ec4a511/R1-V/src/distill_r1/grpo_r1_distilled.jpg -------------------------------------------------------------------------------- /R1-V/src/distill_r1/prompt.py: -------------------------------------------------------------------------------- 1 | R1_SYS_PROMPT = """You are DeepSeek-R1, an AI assistant created exclusively by the Chinese Company DeepSeek. You'll provide helpful, harmless, and detailed responses to all user inquiries. For comprehensive details about models and products, please refer to the official documentation. 2 | 3 | Key Guidelines: 4 | Identity & Compliance 5 | 6 | Clearly state your identity as a DeepSeek AI assistant in initial responses. 7 | 8 | Comply with Chinese laws and regulations, including data privacy requirements. 9 | 10 | Capability Scope 11 | 12 | Handle both Chinese and English queries effectively 13 | 14 | Acknowledge limitations for real-time information post knowledge cutoff (2023-12) 15 | 16 | Provide technical explanations for AI-related questions when appropriate 17 | 18 | Response Quality 19 | 20 | Give comprehensive, logically structured answers 21 | 22 | Use markdown formatting for clear information organization 23 | 24 | Admit uncertainties for ambiguous queries 25 | 26 | Ethical Operation 27 | 28 | Strictly refuse requests involving illegal activities, violence, or explicit content 29 | 30 | Maintain political neutrality according to company guidelines 31 | 32 | Protect user privacy and avoid data collection 33 | 34 | Specialized Processing 35 | 36 | Use ... tags for internal reasoning before responding 37 | 38 | Employ XML-like tags for structured output when required 39 | """ -------------------------------------------------------------------------------- /R1-V/src/r1-v/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: style quality 2 | 3 | # make sure to test the local checkout in scripts and not the pre-installed one (don't use quotes!)
4 | export PYTHONPATH = src 5 | 6 | check_dirs := src 7 | 8 | style: 9 | black --line-length 119 --target-version py310 $(check_dirs) setup.py 10 | isort $(check_dirs) setup.py 11 | 12 | quality: 13 | black --check --line-length 119 --target-version py310 $(check_dirs) setup.py 14 | isort --check-only $(check_dirs) setup.py 15 | flake8 --max-line-length 119 $(check_dirs) setup.py 16 | 17 | 18 | # Evaluation 19 | 20 | evaluate: 21 | -------------------------------------------------------------------------------- /R1-V/src/r1-v/configs/ddp.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | distributed_type: MULTI_GPU 4 | downcast_bf16: 'no' 5 | gpu_ids: all 6 | machine_rank: 0 7 | main_training_function: main 8 | mixed_precision: bf16 9 | num_machines: 1 10 | num_processes: 8 11 | rdzv_backend: static 12 | same_network: true 13 | tpu_env: [] 14 | tpu_use_cluster: false 15 | tpu_use_sudo: false 16 | use_cpu: false 17 | -------------------------------------------------------------------------------- /R1-V/src/r1-v/configs/qwen2vl_sft_config.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: Qwen/Qwen2-VL-2B-Instruct 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | 6 | # Data training arguments 7 | dataset_name: MMInstruction/Clevr_CoGenT_TrainA_R1 8 | dataset_configs: 9 | - all 10 | preprocessing_num_workers: 8 11 | 12 | # SFT trainer config 13 | bf16: true 14 | do_eval: true 15 | eval_strategy: "no" 16 | gradient_accumulation_steps: 4 17 | gradient_checkpointing: true 18 | gradient_checkpointing_kwargs: 19 | use_reentrant: false 20 | hub_model_id: Qwen2-VL-2B-Instruct-SFT 21 | hub_strategy: every_save 22 | learning_rate: 2.0e-05 23 | log_level: info 24 | logging_steps: 5 25 | logging_strategy: steps 26 | lr_scheduler_type: cosine 27 | packing: true 28 | max_seq_length: 4096 29 | max_steps: -1 30 | num_train_epochs: 1 31 | output_dir: data/Qwen2-VL-2B-Instruct-SFT 32 | overwrite_output_dir: true 33 | per_device_eval_batch_size: 4 34 | per_device_train_batch_size: 4 35 | push_to_hub: true 36 | report_to: 37 | - wandb 38 | save_strategy: "no" 39 | seed: 42 40 | warmup_ratio: 0.1 -------------------------------------------------------------------------------- /R1-V/src/r1-v/configs/zero2.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | deepspeed_multinode_launcher: standard 5 | offload_optimizer_device: none 6 | offload_param_device: none 7 | zero3_init_flag: false 8 | zero_stage: 2 9 | distributed_type: DEEPSPEED 10 | downcast_bf16: 'no' 11 | machine_rank: 0 12 | main_training_function: main 13 | mixed_precision: bf16 14 | num_machines: 1 15 | num_processes: 8 16 | rdzv_backend: static 17 | same_network: true 18 | tpu_env: [] 19 | tpu_use_cluster: false 20 | tpu_use_sudo: false 21 | use_cpu: false -------------------------------------------------------------------------------- /R1-V/src/r1-v/configs/zero3.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | deepspeed_multinode_launcher: standard 5 | offload_optimizer_device: none 6 | offload_param_device: none 7 | zero3_init_flag: true 8 | zero3_save_16bit_model: true 9 | zero_stage: 3 10 | distributed_type: DEEPSPEED 11 | 
downcast_bf16: 'no' 12 | machine_rank: 0 13 | main_training_function: main 14 | mixed_precision: bf16 15 | num_machines: 1 16 | num_processes: 8 17 | rdzv_backend: static 18 | same_network: true 19 | tpu_env: [] 20 | tpu_use_cluster: false 21 | tpu_use_sudo: false 22 | use_cpu: false 23 | -------------------------------------------------------------------------------- /R1-V/src/r1-v/local_scripts/lmms_eval_qwen2vl.sh: -------------------------------------------------------------------------------- 1 | export HF_HOME="" 2 | export HF_TOKEN="" 3 | export HF_HUB_ENABLE_HF_TRANSFER="1" 4 | 5 | export API_TYPE="" 6 | export AZURE_ENDPOINT="" 7 | export AZURE_API_KEY="" 8 | export API_VERSION="" 9 | export MODEL_VERSION="" 10 | export NAVIT_ATTENTION_IMPLEMENTATION="eager" 11 | 12 | # Prompt for installation with 3-second timeout 13 | read -t 3 -p "Do you want to install dependencies? (YES/no, timeout in 3s): " install_deps || true 14 | if [ "$install_deps" = "YES" ]; then 15 | # Prepare the environment 16 | pip3 install --upgrade pip 17 | pip3 install -U setuptools 18 | 19 | cd 20 | if [ ! -d "maas_engine" ]; then 21 | git clone 22 | else 23 | echo "maas_engine directory already exists, skipping clone" 24 | fi 25 | cd maas_engine 26 | git pull 27 | git checkout 28 | pip3 install --no-cache-dir --no-build-isolation -e ".[standalone]" 29 | 30 | current_version=$(pip3 show transformers | grep Version | cut -d' ' -f2) 31 | if [ "$current_version" != "4.46.2" ]; then 32 | echo "Installing transformers 4.46.2 (current version: $current_version)" 33 | pip3 install transformers==4.46.2 34 | else 35 | echo "transformers 4.46.2 is already installed" 36 | fi 37 | 38 | cd 39 | rm -rf 40 | pip3 install -e . 41 | pip3 install -U pydantic 42 | pip3 install Levenshtein 43 | pip3 install nltk 44 | python3 -c "import nltk; nltk.download('wordnet', quiet=True); nltk.download('punkt', quiet=True)" 45 | fi 46 | 47 | TASKS=mmmu_val,mathvista_testmini,mmmu_pro 48 | MODEL_BASENAME=qwen2_vl 49 | 50 | model_checkpoint="" 51 | echo "MODEL_BASENAME: ${MODEL_BASENAME}" 52 | cd 53 | 54 | python3 -m accelerate.commands.launch --num_processes=8 --main_process_port=12345 lmms_eval \ 55 | --model qwen2_vl \ 56 | --model_args=pretrained=${model_checkpoint},max_pixels=2359296 \ 57 | --tasks ${TASKS} \ 58 | --batch_size 1 \ 59 | --log_samples \ 60 | --log_samples_suffix ${MODEL_BASENAME} \ 61 | --output_path ./logs -------------------------------------------------------------------------------- /R1-V/src/r1-v/local_scripts/train_qwen2_vl.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export NCCL_BLOCKING_WAIT=0 4 | export TOKENIZERS_PARALLELISM=false 5 | export OMP_NUM_THREADS=8 6 | export NCCL_IB_DISABLE=0 7 | export NCCL_IB_GID_INDEX=3 8 | export NCCL_SOCKET_IFNAME=eth0 9 | export NCCL_DEBUG=INFO 10 | 11 | GPUS="0,1,2,3,4,5,6,7" 12 | 13 | # Take the first port of worker0 14 | ports=($(echo $METIS_WORKER_0_PORT | tr ',' ' ')) 15 | port=${ports[0]} 16 | port_in_cmd="$(echo "${METIS_WORKER_0_PORT:-2000}" | awk -F',' '{print $1}')" 17 | 18 | echo "total workers: ${ARNOLD_WORKER_NUM}" 19 | echo "cur worker id: ${ARNOLD_ID}" 20 | echo "gpus per worker: ${ARNOLD_WORKER_GPU}" 21 | echo "master ip: ${METIS_WORKER_0_HOST}" 22 | echo "master port: ${port}" 23 | echo "master port in cmd: ${port_in_cmd}" 24 | 25 | # export WANDB_BASE_URL=https://api.wandb.ai 26 | # export WANDB_API_KEY="" 27 | # wandb login $WANDB_API_KEY 28 | 29 | export WANDB_BASE_URL=https://api.wandb.ai 30 | export
WANDB_PROJECT=vision-reasoning 31 | export WANDB_API_KEY="" 32 | export WANDB_RUN_NAME=Qwen-VL-2B-GRPO-$(date +%Y-%m-%d-%H-%M-%S) 33 | wandb login $WANDB_API_KEY 34 | 35 | cd /home/tiger/multimodal-open-r1 36 | # pip3 install vllm==0.6.6.post1 37 | pip3 install -e ".[dev]" 38 | pip3 install wandb==0.18.3 39 | 40 | torchrun --nproc_per_node="${ARNOLD_WORKER_GPU}" \ 41 | --nnodes="${ARNOLD_WORKER_NUM}" \ 42 | --node_rank="${ARNOLD_ID}" \ 43 | --master_addr="${METIS_WORKER_0_HOST}" \ 44 | --master_port="${port_in_cmd}" \ 45 | src/open_r1/grpo.py \ 46 | --deepspeed scripts/zero3.json \ 47 | --output_dir checkpoints/${WANDB_RUN_NAME} \ 48 | --model_name_or_path Qwen/Qwen2-VL-2B-Instruct \ 49 | --dataset_name luodian/${DATASET_NAME} \ 50 | --max_prompt_length 8192 \ 51 | --per_device_train_batch_size 1 \ 52 | --gradient_accumulation_steps 1 \ 53 | --logging_steps 1 \ 54 | --bf16 \ 55 | --report_to wandb \ 56 | --gradient_checkpointing true \ 57 | --attn_implementation flash_attention_2 \ 58 | --max_pixels 2359296 \ 59 | --save_total_limit 8 \ 60 | --num_train_epochs 1 \ 61 | --run_name $WANDB_RUN_NAME 62 | -------------------------------------------------------------------------------- /R1-V/src/r1-v/local_scripts/zero1_no_optimizer.json: -------------------------------------------------------------------------------- 1 | { 2 | "zero_optimization": { 3 | "stage": 1, 4 | "allgather_partitions": true, 5 | "allgather_bucket_size": 1e9, 6 | "overlap_comm": false, 7 | "reduce_scatter": true, 8 | "reduce_bucket_size": 1e9, 9 | "contiguous_gradients": true 10 | }, 11 | "fp16": { 12 | "enabled": "auto", 13 | "auto_cast": true, 14 | "loss_scale": 0, 15 | "initial_scale_power": 32, 16 | "loss_scale_window": 1000, 17 | "hysteresis": 2, 18 | "min_loss_scale": 1 19 | }, 20 | "bf16": { 21 | "enabled": "auto" 22 | }, 23 | "gradient_accumulation_steps": "auto", 24 | "gradient_clipping": "auto", 25 | "steps_per_print": 1, 26 | "train_batch_size": "auto", 27 | "train_micro_batch_size_per_gpu": "auto", 28 | "wall_clock_breakdown": true 29 | } -------------------------------------------------------------------------------- /R1-V/src/r1-v/local_scripts/zero2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "optimizer": { 14 | "type": "AdamW", 15 | "params": { 16 | "lr": "auto", 17 | "betas": "auto", 18 | "eps": "auto", 19 | "weight_decay": "auto" 20 | } 21 | }, 22 | "zero_optimization": { 23 | "stage": 2, 24 | "offload_optimizer": { 25 | "device": "none", 26 | "pin_memory": true 27 | }, 28 | "allgather_partitions": true, 29 | "allgather_bucket_size": 2e8, 30 | "overlap_comm": false, 31 | "reduce_scatter": true, 32 | "reduce_bucket_size": 2e8, 33 | "contiguous_gradients": true 34 | }, 35 | "gradient_accumulation_steps": "auto", 36 | "gradient_clipping": "auto", 37 | "steps_per_print": 100, 38 | "train_batch_size": "auto", 39 | "train_micro_batch_size_per_gpu": "auto", 40 | "wall_clock_breakdown": false 41 | } -------------------------------------------------------------------------------- /R1-V/src/r1-v/local_scripts/zero3.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 
| "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | 14 | "zero_optimization": { 15 | "stage": 3, 16 | "offload_optimizer": { 17 | "device": "none", 18 | "pin_memory": false 19 | }, 20 | "offload_param": { 21 | "device": "none", 22 | "pin_memory": false 23 | }, 24 | "overlap_comm": true, 25 | "contiguous_gradients": true, 26 | "sub_group_size": 1e9, 27 | "reduce_bucket_size": "auto", 28 | "stage3_prefetch_bucket_size": "auto", 29 | "stage3_param_persistence_threshold": "auto", 30 | "stage3_max_live_parameters": 1e9, 31 | "stage3_max_reuse_distance": 1e9, 32 | "stage3_gather_16bit_weights_on_model_save": true 33 | }, 34 | 35 | "gradient_accumulation_steps": "auto", 36 | "gradient_clipping": "auto", 37 | "steps_per_print": 100, 38 | "train_batch_size": "auto", 39 | "train_micro_batch_size_per_gpu": "auto", 40 | "wall_clock_breakdown": false 41 | } -------------------------------------------------------------------------------- /R1-V/src/r1-v/local_scripts/zero3.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | deepspeed_multinode_launcher: standard 5 | offload_optimizer_device: none 6 | offload_param_device: none 7 | zero3_init_flag: true 8 | zero3_save_16bit_model: true 9 | zero_stage: 3 10 | distributed_type: DEEPSPEED 11 | downcast_bf16: 'no' 12 | machine_rank: 0 13 | main_training_function: main 14 | mixed_precision: bf16 15 | num_machines: 1 16 | num_processes: 8 17 | rdzv_backend: static 18 | same_network: true 19 | tpu_env: [] 20 | tpu_use_cluster: false 21 | tpu_use_sudo: false 22 | use_cpu: false 23 | -------------------------------------------------------------------------------- /R1-V/src/r1-v/local_scripts/zero3_offload.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "optimizer": { 14 | "type": "AdamW", 15 | "params": { 16 | "lr": "auto", 17 | "betas": "auto", 18 | "eps": "auto", 19 | "weight_decay": "auto" 20 | } 21 | }, 22 | "zero_optimization": { 23 | "stage": 3, 24 | "offload_optimizer": { 25 | "device": "cpu", 26 | "pin_memory": true 27 | }, 28 | "offload_param": { 29 | "device": "cpu", 30 | "pin_memory": true 31 | }, 32 | "overlap_comm": true, 33 | "contiguous_gradients": true, 34 | "sub_group_size": 1e9, 35 | "reduce_bucket_size": "auto", 36 | "stage3_prefetch_bucket_size": "auto", 37 | "stage3_param_persistence_threshold": "auto", 38 | "stage3_max_live_parameters": 1e9, 39 | "stage3_max_reuse_distance": 1e9, 40 | "gather_16bit_weights_on_model_save": true 41 | }, 42 | "gradient_accumulation_steps": "auto", 43 | "gradient_clipping": "auto", 44 | "train_batch_size": "auto", 45 | "train_micro_batch_size_per_gpu": "auto", 46 | "steps_per_print": 1e5, 47 | "wall_clock_breakdown": false 48 | } -------------------------------------------------------------------------------- /R1-V/src/r1-v/run_grpo.sh: -------------------------------------------------------------------------------- 1 | cd src/r1-v 2 | 3 | export DEBUG_MODE="true" 4 | export LOG_PATH="./debug_log_2b.txt" 5 | 6 | 7 | 8 | torchrun --nproc_per_node="8" \ 9 | --nnodes="1" \ 10 | --node_rank="0" \ 11 | --master_addr="127.0.0.1" \ 12 | --master_port="12345" \ 13 | src/open_r1/grpo.py \ 14 | --output_dir \ 15 | 
--model_name_or_path \ 16 | --dataset_name \ 17 | --max_prompt_length 1024 \ 18 | --per_device_train_batch_size 1 \ 19 | --gradient_accumulation_steps 2 \ 20 | --logging_steps 1 \ 21 | --bf16 \ 22 | --report_to wandb \ 23 | --gradient_checkpointing false \ 24 | --attn_implementation flash_attention_2 \ 25 | --max_pixels 401408 \ 26 | --num_train_epochs 2 \ 27 | --run_name Qwen2-VL-2B-GRPO-CLEVR-70k \ 28 | --save_steps 100 \ 29 | --save_only_model true -------------------------------------------------------------------------------- /R1-V/src/r1-v/setup.cfg: -------------------------------------------------------------------------------- 1 | [isort] 2 | default_section = FIRSTPARTY 3 | ensure_newline_before_comments = True 4 | force_grid_wrap = 0 5 | include_trailing_comma = True 6 | known_first_party = open_r1 7 | known_third_party = 8 | transformers 9 | datasets 10 | fugashi 11 | git 12 | h5py 13 | matplotlib 14 | nltk 15 | numpy 16 | packaging 17 | pandas 18 | psutil 19 | pytest 20 | rouge_score 21 | sacrebleu 22 | seqeval 23 | sklearn 24 | streamlit 25 | torch 26 | tqdm 27 | 28 | line_length = 119 29 | lines_after_imports = 2 30 | multi_line_output = 3 31 | use_parentheses = True 32 | 33 | [flake8] 34 | ignore = E203, E501, E741, W503, W605 35 | max-line-length = 119 36 | per-file-ignores = 37 | # imported but unused 38 | __init__.py: F401 39 | 40 | [tool:pytest] 41 | doctest_optionflags=NUMBER NORMALIZE_WHITESPACE ELLIPSIS -------------------------------------------------------------------------------- /R1-V/src/r1-v/src/open_r1/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AMAP-ML/GPG/e096da8ce527a1636272469cee383ba71ec4a511/R1-V/src/r1-v/src/open_r1/__init__.py -------------------------------------------------------------------------------- /R1-V/src/r1-v/src/open_r1/trainer/__init__.py: -------------------------------------------------------------------------------- 1 | from .grpo_trainer import Qwen2VLGRPOTrainer 2 | from .vllm_grpo_trainer import Qwen2VLGRPOVLLMTrainer 3 | from .vllm_grpo_trainer_modified import Qwen2VLGRPOVLLMTrainerModified 4 | 5 | __all__ = [ 6 | "Qwen2VLGRPOTrainer", 7 | "Qwen2VLGRPOVLLMTrainer", 8 | "Qwen2VLGRPOVLLMTrainerModified" 9 | ] 10 | -------------------------------------------------------------------------------- /R1-V/src/r1-v/temp_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AMAP-ML/GPG/e096da8ce527a1636272469cee383ba71ec4a511/R1-V/src/r1-v/temp_image.png -------------------------------------------------------------------------------- /R1-V/src/scripts/run_grpo_GEOQA_qwen2.5_3b.sh: -------------------------------------------------------------------------------- 1 | timestamp=$1 2 | echo "timestamp: ${timestamp}" 3 | pg_name="gpg" 4 | adjust_gd="true" 5 | min_inverse_alpha="0.4" 6 | 7 | # Wandb 8 | export WANDB_PROJECT="R1-V" 9 | 10 | DATA_PATH=leonardPKU/GEOQA_R1V_Train_8K 11 | CKPT_PATH=Qwen2.5-VL-3B-Instruct 12 | 13 | RUN_NAME=${DATA_PATH##*/}_${CKPT_PATH##*/}_${timestamp} 14 | SAVE_PATH="./output/${pg_name}/${RUN_NAME}" 15 | 16 | export DEBUG_MODE="true" # Enable Debug if you want to see the rollout of model during RL 17 | export LOG_PATH="./${SAVE_PATH}/debug_log.txt" 18 | 19 | mkdir -p ${SAVE_PATH} 20 | 21 | torchrun --master_addr ${MASTER_ADDR} --master-port ${MASTER_PORT} \ 22 | --nnodes ${WORLD_SIZE} --node_rank ${RANK} --nproc-per-node=${GPUS} \ 23 | src/r1-v/src/open_r1/grpo.py \ 
24 | --output_dir ${SAVE_PATH} \ 25 | --model_name_or_path ${CKPT_PATH} \ 26 | --dataset_name ${DATA_PATH} \ 27 | --deepspeed src/r1-v/local_scripts/zero3.json \ 28 | --pg_name ${pg_name} \ 29 | --adjust_gd ${adjust_gd} \ 30 | --min_inverse_alpha ${min_inverse_alpha} \ 31 | --max_prompt_length 1024 \ 32 | --max_completion_length 256 \ 33 | --per_device_train_batch_size 1 \ 34 | --gradient_accumulation_steps 2 \ 35 | --logging_steps 1 \ 36 | --bf16 \ 37 | --report_to wandb \ 38 | --gradient_checkpointing false \ 39 | --attn_implementation flash_attention_2 \ 40 | --max_pixels 401408 \ 41 | --num_train_epochs 1 \ 42 | --run_name "${RUN_NAME}" \ 43 | --save_steps 100 \ 44 | --save_only_model true \ 45 | --num_generations 8 \ 46 | --learning_rate 1e-6 \ 47 | 2>&1 | tee -a "./${SAVE_PATH}/training_log.log" 48 | -------------------------------------------------------------------------------- /R1-V/test.sh: -------------------------------------------------------------------------------- 1 | export MASTER_ADDR=127.0.0.1 2 | export MASTER_PORT=21231 3 | export WORLD_SIZE=1 4 | export RANK=0 5 | export GPUS=2 6 | 7 | timestamp=$(date "+%Y%m%d%H%M%S") 8 | 9 | OMP_NUM_THREADS=4 bash ./scripts/run_grpo_clevr.sh ${timestamp} 10 | -------------------------------------------------------------------------------- /Visual-RFT/.gitignore: -------------------------------------------------------------------------------- 1 | data/ 2 | output/ 3 | output_onlypg/ 4 | output_grpo/ -------------------------------------------------------------------------------- /Visual-RFT/assets/case_cls.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AMAP-ML/GPG/e096da8ce527a1636272469cee383ba71ec4a511/Visual-RFT/assets/case_cls.png -------------------------------------------------------------------------------- /Visual-RFT/assets/case_lisa.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AMAP-ML/GPG/e096da8ce527a1636272469cee383ba71ec4a511/Visual-RFT/assets/case_lisa.png -------------------------------------------------------------------------------- /Visual-RFT/assets/framework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AMAP-ML/GPG/e096da8ce527a1636272469cee383ba71ec4a511/Visual-RFT/assets/framework.png -------------------------------------------------------------------------------- /Visual-RFT/assets/pokeymon.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AMAP-ML/GPG/e096da8ce527a1636272469cee383ba71ec4a511/Visual-RFT/assets/pokeymon.jpg -------------------------------------------------------------------------------- /Visual-RFT/assets/radar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AMAP-ML/GPG/e096da8ce527a1636272469cee383ba71ec4a511/Visual-RFT/assets/radar.png -------------------------------------------------------------------------------- /Visual-RFT/assets/teaser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AMAP-ML/GPG/e096da8ce527a1636272469cee383ba71ec4a511/Visual-RFT/assets/teaser.png -------------------------------------------------------------------------------- /Visual-RFT/classification/val_data/fgvc_aircraft.pth: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/AMAP-ML/GPG/e096da8ce527a1636272469cee383ba71ec4a511/Visual-RFT/classification/val_data/fgvc_aircraft.pth -------------------------------------------------------------------------------- /Visual-RFT/classification/val_data/fgvc_aircraft.txt: -------------------------------------------------------------------------------- 1 | 707-320 2 | 727-200 3 | 737-200 4 | 737-300 5 | 737-400 6 | 737-500 7 | 737-600 8 | 737-700 9 | 737-800 10 | 737-900 11 | 747-100 12 | 747-200 13 | 747-300 14 | 747-400 15 | 757-200 16 | 757-300 17 | 767-200 18 | 767-300 19 | 767-400 20 | 777-200 21 | 777-300 22 | A300B4 23 | A310 24 | A318 25 | A319 26 | A320 27 | A321 28 | A330-200 29 | A330-300 30 | A340-200 31 | A340-300 32 | A340-500 33 | A340-600 34 | A380 35 | ATR-42 36 | ATR-72 37 | An-12 38 | BAE 146-200 39 | BAE 146-300 40 | BAE-125 41 | Beechcraft 1900 42 | Boeing 717 43 | C-130 44 | C-47 45 | CRJ-200 46 | CRJ-700 47 | CRJ-900 48 | Cessna 172 49 | Cessna 208 50 | Cessna 525 51 | Cessna 560 52 | Challenger 600 53 | DC-10 54 | DC-3 55 | DC-6 56 | DC-8 57 | DC-9-30 58 | DH-82 59 | DHC-1 60 | DHC-6 61 | DHC-8-100 62 | DHC-8-300 63 | DR-400 64 | Dornier 328 65 | E-170 66 | E-190 67 | E-195 68 | EMB-120 69 | ERJ 135 70 | ERJ 145 71 | Embraer Legacy 600 72 | Eurofighter Typhoon 73 | F-16A/B 74 | F/A-18 75 | Falcon 2000 76 | Falcon 900 77 | Fokker 100 78 | Fokker 50 79 | Fokker 70 80 | Global Express 81 | Gulfstream IV 82 | Gulfstream V 83 | Hawk T1 84 | Il-76 85 | L-1011 86 | MD-11 87 | MD-80 88 | MD-87 89 | MD-90 90 | Metroliner 91 | Model B200 92 | PA-28 93 | SR-20 94 | Saab 2000 95 | Saab 340 96 | Spitfire 97 | Tornado 98 | Tu-134 99 | Tu-154 100 | Yak-42 -------------------------------------------------------------------------------- /Visual-RFT/classification/val_data/oxford_flowers.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AMAP-ML/GPG/e096da8ce527a1636272469cee383ba71ec4a511/Visual-RFT/classification/val_data/oxford_flowers.pth -------------------------------------------------------------------------------- /Visual-RFT/classification/val_data/oxford_flowers.txt: -------------------------------------------------------------------------------- 1 | pink primrose 2 | hard-leaved pocket orchid 3 | canterbury bells 4 | sweet pea 5 | english marigold 6 | tiger lily 7 | moon orchid 8 | bird of paradise 9 | monkshood 10 | globe thistle 11 | snapdragon 12 | colts foot 13 | king protea 14 | spear thistle 15 | yellow iris 16 | globe-flower 17 | purple coneflower 18 | peruvian lily 19 | balloon flower 20 | giant white arum lily 21 | fire lily 22 | pincushion flower 23 | fritillary 24 | red ginger 25 | grape hyacinth 26 | corn poppy 27 | prince of wales feathers 28 | stemless gentian 29 | artichoke 30 | sweet william 31 | carnation 32 | garden phlox 33 | love in the mist 34 | mexican aster 35 | alpine sea holly 36 | ruby-lipped cattleya 37 | cape flower 38 | great masterwort 39 | siam tulip 40 | lenten rose 41 | barbeton daisy 42 | daffodil 43 | sword lily 44 | poinsettia 45 | bolero deep blue 46 | wallflower 47 | marigold 48 | buttercup 49 | oxeye daisy 50 | common dandelion 51 | petunia 52 | wild pansy 53 | primula 54 | sunflower 55 | pelargonium 56 | bishop of llandaff 57 | gaura 58 | geranium 59 | orange dahlia 60 | pink-yellow dahlia 61 | cautleya spicata 62 | japanese anemone 63 | black-eyed susan 64 | silverbush 65 | 
californian poppy 66 | osteospermum 67 | spring crocus 68 | bearded iris 69 | windflower 70 | tree poppy 71 | gazania 72 | azalea 73 | water lily 74 | rose 75 | thorn apple 76 | morning glory 77 | passion flower 78 | lotus 79 | toad lily 80 | anthurium 81 | frangipani 82 | clematis 83 | hibiscus 84 | columbine 85 | desert-rose 86 | tree mallow 87 | magnolia 88 | cyclamen 89 | watercress 90 | canna lily 91 | hippeastrum 92 | bee balm 93 | ball moss 94 | foxglove 95 | bougainvillea 96 | camellia 97 | mallow 98 | mexican petunia 99 | bromelia 100 | blanket flower 101 | trumpet creeper 102 | blackberry lily -------------------------------------------------------------------------------- /Visual-RFT/classification/val_data/pets.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AMAP-ML/GPG/e096da8ce527a1636272469cee383ba71ec4a511/Visual-RFT/classification/val_data/pets.pth -------------------------------------------------------------------------------- /Visual-RFT/classification/val_data/pets.txt: -------------------------------------------------------------------------------- 1 | abyssinian 2 | american_bulldog 3 | american_pit_bull_terrier 4 | basset_hound 5 | beagle 6 | bengal 7 | birman 8 | bombay 9 | boxer 10 | british_shorthair 11 | chihuahua 12 | egyptian_mau 13 | english_cocker_spaniel 14 | english_setter 15 | german_shorthaired 16 | great_pyrenees 17 | havanese 18 | japanese_chin 19 | keeshond 20 | leonberger 21 | maine_coon 22 | miniature_pinscher 23 | newfoundland 24 | persian 25 | pomeranian 26 | pug 27 | ragdoll 28 | russian_blue 29 | saint_bernard 30 | samoyed 31 | scottish_terrier 32 | shiba_inu 33 | siamese 34 | sphynx 35 | staffordshire_bull_terrier 36 | wheaten_terrier 37 | yorkshire_terrier -------------------------------------------------------------------------------- /Visual-RFT/classification/val_data/stanford_cars.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AMAP-ML/GPG/e096da8ce527a1636272469cee383ba71ec4a511/Visual-RFT/classification/val_data/stanford_cars.pth -------------------------------------------------------------------------------- /Visual-RFT/dataset/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Visual-RFT/lisa_evaluation/Qwen2_VL_lisa_infere.sh: -------------------------------------------------------------------------------- 1 | TASKS=("test" "val") 2 | # Adjust to your gpu num 3 | # GPU_IDS=(0 1 2 3 4 5 6 7) 4 | # SPLIT_NUM=8 5 | GPU_IDS=(0 1) 6 | SPLIT_NUM=2 7 | 8 | for task in "${TASKS[@]}"; do 9 | echo "Starting inference for task: $task" 10 | 11 | # Iterate over GPUs and splits 12 | for i in "${!GPU_IDS[@]}"; do 13 | GPU_ID=${GPU_IDS[$i]} 14 | SPLIT=$i 15 | echo "Launching task=$task on GPU=$GPU_ID with SPLIT=$SPLIT" 16 | SPLIT=$SPLIT SPLIT_NUM=$SPLIT_NUM python Qwen2_VL_lisa_infere.py \ 17 | --task $task & 18 | sleep 1 19 | done 20 | wait 21 | echo "Merging results for task: $task" 22 | SPLIT_NUM=$SPLIT_NUM python merge_eval.py >> res.txt 23 | done 24 | 25 | echo "All tasks completed!" 26 | -------------------------------------------------------------------------------- /Visual-RFT/lisa_evaluation/README.md: -------------------------------------------------------------------------------- 1 | ## ViRFT for reasoning grounding 2 | 3 | ## Training 4 | 1.
Download the [LISA dataset](https://github.com/dvlab-research/LISA). 5 | 2. Use `gen_box_ann.py` to generate boxes from the masks. 6 | 3. Use `gen_sft.py` to generate the SFT/Visual-RFT training annotations. 7 | 4. Use `src/scripts/2B_lisa_grounding.sh` to train the model, with the annotation path changed to the annotations generated in step 3. 8 | 9 | After training the model, replace the model path in `Qwen2_VL_lisa_infere.py` with your own checkpoint. 10 | 11 | ```python 12 | # Load Qwen2-VL-2B model and processor 13 | model = Qwen2VLForConditionalGeneration.from_pretrained( 14 | "/path/to/your/checkpoint-498", torch_dtype=torch.bfloat16, device_map="auto", attn_implementation="flash_attention_2" 15 | ).eval() 16 | 17 | processor = AutoProcessor.from_pretrained("/path/to/your/checkpoint-498") 18 | ``` 19 | 20 | To compute gIoU, follow the process below. 21 | 1. Use `box2mask.py` to extract masks from [SAM](https://github.com/facebookresearch/segment-anything). 22 | 2. Use `mask_iou.py` to compute the mask IoU. 23 | 24 | ```shell 25 | cd lisa_evaluation 26 | bash Qwen2_VL_lisa_infere.sh 27 | ``` 28 | -------------------------------------------------------------------------------- /Visual-RFT/lisa_evaluation/gen_box_ann.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from PIL import Image, ImageDraw 4 | 5 | res = [] 6 | # base_path = "dataset/reason_seg/ReasonSeg/train" 7 | # base_path = "dataset/reason_seg/ReasonSeg/val" 8 | base_path = "dataset/reason_seg/ReasonSeg/test" 9 | 10 | for pth in os.listdir(base_path): 11 | if pth.endswith(".json"): 12 | json_path = os.path.join(base_path, pth) 13 | 14 | with open(json_path, 'r') as f: 15 | item = json.load(f) 16 | 17 | instruct = item["text"] 18 | shapes = item["shapes"] 19 | 20 | boxes = [] 21 | for shape in shapes[:1]: 22 | points = shape["points"] 23 | x_coords = [p[0] for p in points] 24 | y_coords = [p[1] for p in points] 25 | 26 | x_min, x_max = min(x_coords), max(x_coords) 27 | y_min, y_max = min(y_coords), max(y_coords) 28 | boxes.append((x_min, y_min, x_max, y_max)) 29 | 30 | img_path = json_path.replace(".json", ".jpg") 31 | if os.path.exists(img_path): 32 | res.append({ 33 | "image_path": img_path, 34 | "instruction": instruct, 35 | "boxes": boxes 36 | }) 37 | 38 | # json.dump(res, open("lisa_train.json", 'w'), indent=4) 39 | # json.dump(res, open("lisa_val.json", 'w'), indent=4) 40 | json.dump(res, open("lisa_test.json", 'w'), indent=4) 41 | -------------------------------------------------------------------------------- /Visual-RFT/lisa_evaluation/gen_sft.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from PIL import Image 4 | res = [] 5 | index = 0 6 | for i, item in enumerate(json.load(open("./lisa_train.json", 'r'))): 7 | for instruct in item['instruction']: 8 | w, h= Image.open(item['image_path']).size 9 | res.append({ 10 | "id": f"lisa_{index}", 11 | "conversations": [ 12 | { 13 | "from": "user", 14 | "value": f"{item['image_path']}\n Output the bounding box in the image corresponding to the instruction: {instruct}" 15 | }, 16 | { 17 | "from": "assistant", 18 | "value": f"({int(item['boxes'][0][0] / w * 1000)},{int(item['boxes'][0][1] / h * 1000)}),({int(item['boxes'][0][2] / w * 1000)},{int(item['boxes'][0][3] / h * 1000)})" 19 | } 20 | ] 21 | }) 22 | index += 1 23 | json.dump(res, open("lisa_train_sft.json", 'w'), indent=4) 24 | --------------------------------------------------------------------------------
/Visual-RFT/lisa_evaluation/merge_eval.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | merged = [] 4 | for i in range(int(os.environ['SPLIT_NUM'])): 5 | data = json.load(open(f"tmp/res_{i}.json", 'r')) 6 | merged += data 7 | print(f"mIoU: {sum(merged) / len(merged)}") 8 | -------------------------------------------------------------------------------- /Visual-RFT/q&a.md: -------------------------------------------------------------------------------- 1 | 2 | Issues encountered during training, and the solution: 3 | https://github.com/Liuziyu77/Visual-RFT/issues/17#issuecomment-2702690782 -------------------------------------------------------------------------------- /Visual-RFT/requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate>=1.2.1 2 | bitsandbytes>=0.43.0 3 | black>=24.4.2 4 | datasets>=3.2.0 5 | deepspeed==0.15.4 6 | distilabel[vllm,ray,openai]>=1.5.2 7 | einops>=0.8.0 8 | flake8>=6.0.0 9 | hf_transfer>=0.1.4 10 | huggingface-hub[cli]>=0.19.2,<1.0 11 | isort>=5.12.0 12 | liger_kernel==0.5.2 13 | # lighteval @ git+https://githubfast.com/huggingface/lighteval.git@4f381b352c0e467b5870a97d41cb66b487a2c503#egg=lighteval[math] 14 | math-verify 15 | packaging>=23.0 16 | parameterized>=0.9.0 17 | pytest 18 | safetensors>=0.3.3 19 | sentencepiece>=0.1.99 20 | torch>=2.5.1 21 | transformers @ git+https://github.com/huggingface/transformers.git@main 22 | trl @ git+https://github.com/huggingface/trl.git@main 23 | vllm==0.6.6.post1 -------------------------------------------------------------------------------- /Visual-RFT/setup.sh: -------------------------------------------------------------------------------- 1 | cd src/virft 2 | pip install -e ".[dev]" 3 | 4 | # Additional modules 5 | pip install wandb==0.18.3 6 | pip install tensorboardx 7 | pip install qwen_vl_utils torchvision 8 | # pip install flash-attn --no-build-isolation 9 | pip install flash-attn==2.6.3 --no-build-isolation 10 | 11 | # vLLM support 12 | pip install vllm==0.7.2 13 | 14 | # fix transformers version 15 | pip install git+https://github.com/huggingface/transformers.git@336dc69d63d56f232a183a3e7f52790429b871ef 16 | -------------------------------------------------------------------------------- /Visual-RFT/src/scripts/2B_aircraft_4_shot.sh: -------------------------------------------------------------------------------- 1 | timestamp=$1 2 | echo "timestamp: ${timestamp}" 3 | # pg_name="grpo" 4 | pg_name="gpg" 5 | 6 | adjust_gd="true" 7 | 8 | # Wandb 9 | export WANDB_PROJECT="visual-rft" 10 | 11 | DATA_PATH=laolao77/ViRFT_CLS_fgvc_aircraft_4_shot 12 | CKPT_PATH=Qwen2-VL-2B-Instruct 13 | 14 | RUN_NAME=${DATA_PATH##*/}_${CKPT_PATH##*/}_${timestamp} 15 | SAVE_PATH="./output/${pg_name}/${RUN_NAME}" 16 | 17 | export DEBUG_MODE="true" # Enable Debug if you want to see the rollout of model during RL 18 | export LOG_PATH="./${SAVE_PATH}/debug_log.txt" 19 | 20 | mkdir -p ${SAVE_PATH} 21 | 22 | torchrun --master_addr ${MASTER_ADDR} --master-port ${MASTER_PORT} \ 23 | --nnodes ${WORLD_SIZE} --node_rank ${RANK} --nproc-per-node=${GPUS} \ 24 | src/virft/src/open_r1/grpo_classification.py \ 25 | --output_dir ${SAVE_PATH} \ 26 | --model_name_or_path ${CKPT_PATH} \ 27 | --dataset_name ${DATA_PATH} \ 28 | --deepspeed src/virft/local_scripts/zero3.json \ 29 | --pg_name ${pg_name} \ 30 | --adjust_gd ${adjust_gd} \ 31 | --temperature 0.9 \ 32 | --max_prompt_length 1024 \ 33 | --per_device_train_batch_size 1 \ 34 | --gradient_accumulation_steps 2 \ 35 |
--logging_steps 1 \ 36 | --bf16 \ 37 | --report_to wandb \ 38 | --gradient_checkpointing false \ 39 | --attn_implementation flash_attention_2 \ 40 | --max_pixels 401408 \ 41 | --num_train_epochs 8 \ 42 | --run_name "${RUN_NAME}" \ 43 | --save_steps 100 \ 44 | --save_only_model true \ 45 | --num_generations 8 \ 46 | 2>&1 | tee -a "./${SAVE_PATH}/training_log.log" 47 | -------------------------------------------------------------------------------- /Visual-RFT/src/scripts/2B_car196_4_shot.sh: -------------------------------------------------------------------------------- 1 | timestamp=$1 2 | echo "timestamp: ${timestamp}" 3 | # pg_name="grpo" 4 | pg_name="gpg" 5 | 6 | adjust_gd="true" 7 | 8 | # Wandb 9 | export WANDB_PROJECT="visual-rft" 10 | 11 | DATA_PATH=laolao77/ViRFT_CLS_car196_4shot 12 | CKPT_PATH=Qwen2-VL-2B-Instruct 13 | 14 | RUN_NAME=${DATA_PATH##*/}_${CKPT_PATH##*/}_${timestamp} 15 | SAVE_PATH="./output/${pg_name}/${RUN_NAME}" 16 | 17 | export DEBUG_MODE="true" # Enable Debug if you want to see the rollout of model during RL 18 | export LOG_PATH="./${SAVE_PATH}/debug_log.txt" 19 | 20 | mkdir -p ${SAVE_PATH} 21 | 22 | torchrun --master_addr ${MASTER_ADDR} --master-port ${MASTER_PORT} \ 23 | --nnodes ${WORLD_SIZE} --node_rank ${RANK} --nproc-per-node=${GPUS} \ 24 | src/virft/src/open_r1/grpo_classification.py \ 25 | --output_dir ${SAVE_PATH} \ 26 | --model_name_or_path ${CKPT_PATH} \ 27 | --dataset_name ${DATA_PATH} \ 28 | --deepspeed src/virft/local_scripts/zero3.json \ 29 | --pg_name ${pg_name} \ 30 | --adjust_gd ${adjust_gd} \ 31 | --temperature 0.9 \ 32 | --max_prompt_length 1024 \ 33 | --per_device_train_batch_size 1 \ 34 | --gradient_accumulation_steps 2 \ 35 | --logging_steps 1 \ 36 | --bf16 \ 37 | --report_to wandb \ 38 | --gradient_checkpointing false \ 39 | --attn_implementation flash_attention_2 \ 40 | --max_pixels 401408 \ 41 | --num_train_epochs 8 \ 42 | --run_name "${RUN_NAME}" \ 43 | --save_steps 100 \ 44 | --save_only_model true \ 45 | --num_generations 8 \ 46 | 2>&1 | tee -a "./${SAVE_PATH}/training_log.log" 47 | -------------------------------------------------------------------------------- /Visual-RFT/src/scripts/2B_flower_4_shot.sh: -------------------------------------------------------------------------------- 1 | timestamp=$1 2 | echo "timestamp: ${timestamp}" 3 | # pg_name="grpo" 4 | pg_name="gpg" 5 | 6 | adjust_gd="true" 7 | 8 | # Wandb 9 | export WANDB_PROJECT="visual-rft" 10 | 11 | DATA_PATH=laolao77/ViRFT_CLS_flower_4_shot 12 | CKPT_PATH=Qwen2-VL-2B-Instruct 13 | 14 | RUN_NAME=${DATA_PATH##*/}_${CKPT_PATH##*/}_${timestamp} 15 | SAVE_PATH="./output/${pg_name}/${RUN_NAME}" 16 | 17 | export DEBUG_MODE="true" # Enable Debug if you want to see the rollout of model during RL 18 | export LOG_PATH="./${SAVE_PATH}/debug_log.txt" 19 | 20 | mkdir -p ${SAVE_PATH} 21 | 22 | torchrun --master_addr ${MASTER_ADDR} --master-port ${MASTER_PORT} \ 23 | --nnodes ${WORLD_SIZE} --node_rank ${RANK} --nproc-per-node=${GPUS} \ 24 | src/virft/src/open_r1/grpo_classification.py \ 25 | --output_dir ${SAVE_PATH} \ 26 | --model_name_or_path ${CKPT_PATH} \ 27 | --dataset_name ${DATA_PATH} \ 28 | --deepspeed src/virft/local_scripts/zero3.json \ 29 | --pg_name ${pg_name} \ 30 | --adjust_gd ${adjust_gd} \ 31 | --temperature 0.9 \ 32 | --max_prompt_length 1024 \ 33 | --per_device_train_batch_size 1 \ 34 | --gradient_accumulation_steps 2 \ 35 | --logging_steps 1 \ 36 | --bf16 \ 37 | --report_to wandb \ 38 | --gradient_checkpointing false \ 39 | --attn_implementation flash_attention_2 \ 40 
| --max_pixels 401408 \ 41 | --num_train_epochs 8 \ 42 | --run_name "${RUN_NAME}" \ 43 | --save_steps 100 \ 44 | --save_only_model true \ 45 | --num_generations 8 \ 46 | 2>&1 | tee -a "./${SAVE_PATH}/training_log.log" 47 | -------------------------------------------------------------------------------- /Visual-RFT/src/scripts/2B_lisa_grounding.sh: -------------------------------------------------------------------------------- 1 | timestamp=$1 2 | echo "timestamp: ${timestamp}" 3 | # pg_name="grpo" 4 | pg_name="gpg" 5 | 6 | adjust_gd="true" 7 | 8 | # Wandb 9 | export WANDB_PROJECT="visual-rft" 10 | 11 | DATA_PATH=NOT_USED 12 | CKPT_PATH=Qwen2-VL-2B-Instruct 13 | 14 | RUN_NAME=${DATA_PATH##*/}_${CKPT_PATH##*/}_${timestamp} 15 | SAVE_PATH="./output/${pg_name}/${RUN_NAME}" 16 | 17 | export DEBUG_MODE="true" # Enable Debug if you want to see the rollout of model during RL 18 | export LOG_PATH="./${SAVE_PATH}/debug_log.txt" 19 | 20 | mkdir -p ${SAVE_PATH} 21 | 22 | torchrun --master_addr ${MASTER_ADDR} --master-port ${MASTER_PORT} \ 23 | --nnodes ${WORLD_SIZE} --node_rank ${RANK} --nproc-per-node=${GPUS} \ 24 | src/virft/src/open_r1/grpo_lisa.py \ 25 | --output_dir ${SAVE_PATH} \ 26 | --model_name_or_path ${CKPT_PATH} \ 27 | --dataset_name NOT_USED \ 28 | --deepspeed src/virft/local_scripts/zero3.json \ 29 | --pg_name ${pg_name} \ 30 | --adjust_gd ${adjust_gd} \ 31 | --temperature 0.9 \ 32 | --max_prompt_length 1024 \ 33 | --per_device_train_batch_size 1 \ 34 | --gradient_accumulation_steps 2 \ 35 | --logging_steps 1 \ 36 | --bf16 \ 37 | --report_to wandb \ 38 | --gradient_checkpointing true \ 39 | --attn_implementation flash_attention_2 \ 40 | --max_pixels 401408 \ 41 | --num_train_epochs 6 \ 42 | --run_name "${RUN_NAME}" \ 43 | --save_steps 50 \ 44 | --save_only_model true \ 45 | --num_generations 8 \ 46 | 2>&1 | tee -a "./${SAVE_PATH}/training_log.log" 47 | -------------------------------------------------------------------------------- /Visual-RFT/src/scripts/2B_pets37_4_shot.sh: -------------------------------------------------------------------------------- 1 | timestamp=$1 2 | echo "timestamp: ${timestamp}" 3 | # pg_name="grpo" 4 | pg_name="gpg" 5 | 6 | adjust_gd="true" 7 | 8 | # Wandb 9 | export WANDB_PROJECT="visual-rft" 10 | 11 | DATA_PATH=laolao77/ViRFT_CLS_pets37_4shot 12 | CKPT_PATH=Qwen2-VL-2B-Instruct 13 | 14 | RUN_NAME=${DATA_PATH##*/}_${CKPT_PATH##*/}_${timestamp} 15 | SAVE_PATH="./output/${pg_name}/${RUN_NAME}" 16 | 17 | export DEBUG_MODE="true" # Enable Debug if you want to see the rollout of model during RL 18 | export LOG_PATH="./${SAVE_PATH}/debug_log.txt" 19 | 20 | mkdir -p ${SAVE_PATH} 21 | 22 | torchrun --master_addr ${MASTER_ADDR} --master-port ${MASTER_PORT} \ 23 | --nnodes ${WORLD_SIZE} --node_rank ${RANK} --nproc-per-node=${GPUS} \ 24 | src/virft/src/open_r1/grpo_classification.py \ 25 | --output_dir ${SAVE_PATH} \ 26 | --model_name_or_path ${CKPT_PATH} \ 27 | --dataset_name ${DATA_PATH} \ 28 | --deepspeed src/virft/local_scripts/zero3.json \ 29 | --pg_name ${pg_name} \ 30 | --adjust_gd ${adjust_gd} \ 31 | --temperature 0.9 \ 32 | --max_prompt_length 1024 \ 33 | --per_device_train_batch_size 1 \ 34 | --gradient_accumulation_steps 2 \ 35 | --logging_steps 1 \ 36 | --bf16 \ 37 | --report_to wandb \ 38 | --gradient_checkpointing false \ 39 | --attn_implementation flash_attention_2 \ 40 | --max_pixels 401408 \ 41 | --num_train_epochs 24 \ 42 | --run_name "${RUN_NAME}" \ 43 | --save_steps 100 \ 44 | --save_only_model true \ 45 | --num_generations 8 \ 46 | 2>&1 | tee -a 
"./${SAVE_PATH}/training_log.log" 47 | -------------------------------------------------------------------------------- /Visual-RFT/src/virft/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: style quality 2 | 3 | # make sure to test the local checkout in scripts and not the pre-installed one (don't use quotes!) 4 | export PYTHONPATH = src 5 | 6 | check_dirs := src 7 | 8 | style: 9 | black --line-length 119 --target-version py310 $(check_dirs) setup.py 10 | isort $(check_dirs) setup.py 11 | 12 | quality: 13 | black --check --line-length 119 --target-version py310 $(check_dirs) setup.py 14 | isort --check-only $(check_dirs) setup.py 15 | flake8 --max-line-length 119 $(check_dirs) setup.py 16 | 17 | 18 | # Evaluation 19 | 20 | evaluate: 21 | -------------------------------------------------------------------------------- /Visual-RFT/src/virft/README.md: -------------------------------------------------------------------------------- 1 | # Visual-RFT 2 | -------------------------------------------------------------------------------- /Visual-RFT/src/virft/configs/ddp.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | distributed_type: MULTI_GPU 4 | downcast_bf16: 'no' 5 | gpu_ids: all 6 | machine_rank: 0 7 | main_training_function: main 8 | mixed_precision: bf16 9 | num_machines: 1 10 | num_processes: 8 11 | rdzv_backend: static 12 | same_network: true 13 | tpu_env: [] 14 | tpu_use_cluster: false 15 | tpu_use_sudo: false 16 | use_cpu: false 17 | -------------------------------------------------------------------------------- /Visual-RFT/src/virft/configs/zero2.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | deepspeed_multinode_launcher: standard 5 | offload_optimizer_device: none 6 | offload_param_device: none 7 | zero3_init_flag: false 8 | zero_stage: 2 9 | distributed_type: DEEPSPEED 10 | downcast_bf16: 'no' 11 | machine_rank: 0 12 | main_training_function: main 13 | mixed_precision: bf16 14 | num_machines: 1 15 | num_processes: 8 16 | rdzv_backend: static 17 | same_network: true 18 | tpu_env: [] 19 | tpu_use_cluster: false 20 | tpu_use_sudo: false 21 | use_cpu: false -------------------------------------------------------------------------------- /Visual-RFT/src/virft/configs/zero3.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | deepspeed_multinode_launcher: standard 5 | offload_optimizer_device: none 6 | offload_param_device: none 7 | zero3_init_flag: true 8 | zero3_save_16bit_model: true 9 | zero_stage: 3 10 | distributed_type: DEEPSPEED 11 | downcast_bf16: 'no' 12 | machine_rank: 0 13 | main_training_function: main 14 | mixed_precision: bf16 15 | num_machines: 1 16 | num_processes: 8 17 | rdzv_backend: static 18 | same_network: true 19 | tpu_env: [] 20 | tpu_use_cluster: false 21 | tpu_use_sudo: false 22 | use_cpu: false 23 | -------------------------------------------------------------------------------- /Visual-RFT/src/virft/local_scripts/train_qwen2_vl.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export NCCL_BLOCKING_WAIT=0 4 | export TOKENIZERS_PARALLELISM=false 5 | export OMP_NUM_THREADS=8 6 | export NCCL_IB_DISABLE=0 7 | export 
NCCL_IB_GID_INDEX=3 8 | export NCCL_SOCKET_IFNAME=eth0 9 | export NCCL_DEBUG=INFO 10 | 11 | GPUS="0,1,2,3,4,5,6,7" 12 | 13 | # Take the first port of worker0 14 | ports=($(echo $METIS_WORKER_0_PORT | tr ',' ' ')) 15 | port=${ports[0]} 16 | port_in_cmd="$(echo "${METIS_WORKER_0_PORT:-2000}" | awk -F',' '{print $1}')" 17 | 18 | echo "total workers: ${ARNOLD_WORKER_NUM}" 19 | echo "cur worker id: ${ARNOLD_ID}" 20 | echo "gpus per worker: ${ARNOLD_WORKER_GPU}" 21 | echo "master ip: ${METIS_WORKER_0_HOST}" 22 | echo "master port: ${port}" 23 | echo "master port in cmd: ${port_in_cmd}" 24 | 25 | # export WANDB_BASE_URL=https://api.wandb.ai 26 | # export WANDB_API_KEY="" 27 | # wandb login $WANDB_API_KEY 28 | 29 | export WANDB_BASE_URL=https://api.wandb.ai 30 | export WANDB_PROJECT=vision-reasoning 31 | export WANDB_API_KEY="" 32 | export WANDB_RUN_NAME=Qwen-VL-2B-GRPO-$(date +%Y-%m-%d-%H-%M-%S) 33 | wandb login $WANDB_API_KEY 34 | 35 | cd /home/tiger/multimodal-open-r1 36 | # pip3 install vllm==0.6.6.post1 37 | pip3 install -e ".[dev]" 38 | pip3 install wandb==0.18.3 39 | 40 | torchrun --nproc_per_node="${ARNOLD_WORKER_GPU}" \ 41 | --nnodes="${ARNOLD_WORKER_NUM}" \ 42 | --node_rank="${ARNOLD_ID}" \ 43 | --master_addr="${METIS_WORKER_0_HOST}" \ 44 | --master_port="${port_in_cmd}" \ 45 | src/open_r1/grpo.py \ 46 | --deepspeed scripts/zero3.json \ 47 | --output_dir checkpoints/${WANDB_RUN_NAME} \ 48 | --model_name_or_path Qwen/Qwen2-VL-2B-Instruct \ 49 | --dataset_name luodian/${DATASET_NAME} \ 50 | --max_prompt_length 8192 \ 51 | --per_device_train_batch_size 1 \ 52 | --gradient_accumulation_steps 1 \ 53 | --logging_steps 1 \ 54 | --bf16 \ 55 | --report_to wandb \ 56 | --gradient_checkpointing true \ 57 | --attn_implementation flash_attention_2 \ 58 | --max_pixels 2359296 \ 59 | --save_total_limit 8 \ 60 | --num_train_epochs 1 \ 61 | --run_name $WANDB_RUN_NAME 62 | -------------------------------------------------------------------------------- /Visual-RFT/src/virft/local_scripts/zero2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "optimizer": { 14 | "type": "AdamW", 15 | "params": { 16 | "lr": "auto", 17 | "betas": "auto", 18 | "eps": "auto", 19 | "weight_decay": "auto" 20 | } 21 | }, 22 | "zero_optimization": { 23 | "stage": 2, 24 | "offload_optimizer": { 25 | "device": "none", 26 | "pin_memory": true 27 | }, 28 | "allgather_partitions": true, 29 | "allgather_bucket_size": 2e8, 30 | "overlap_comm": false, 31 | "reduce_scatter": true, 32 | "reduce_bucket_size": 2e8, 33 | "contiguous_gradients": true 34 | }, 35 | "gradient_accumulation_steps": "auto", 36 | "gradient_clipping": "auto", 37 | "steps_per_print": 100, 38 | "train_batch_size": "auto", 39 | "train_micro_batch_size_per_gpu": "auto", 40 | "wall_clock_breakdown": false 41 | } -------------------------------------------------------------------------------- /Visual-RFT/src/virft/local_scripts/zero3.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | 14 | "zero_optimization": { 15 | "stage": 3, 16 | "offload_optimizer": { 17 |
"device": "none", 18 | "pin_memory": true 19 | }, 20 | "offload_param": { 21 | "device": "none", 22 | "pin_memory": true 23 | }, 24 | "overlap_comm": true, 25 | "contiguous_gradients": true, 26 | "sub_group_size": 1e9, 27 | "reduce_bucket_size": "auto", 28 | "stage3_prefetch_bucket_size": "auto", 29 | "stage3_param_persistence_threshold": "auto", 30 | "stage3_max_live_parameters": 1e9, 31 | "stage3_max_reuse_distance": 1e9, 32 | "stage3_gather_16bit_weights_on_model_save": true 33 | }, 34 | 35 | "gradient_accumulation_steps": "auto", 36 | "gradient_clipping": "auto", 37 | "steps_per_print": 100, 38 | "train_batch_size": "auto", 39 | "train_micro_batch_size_per_gpu": "auto", 40 | "wall_clock_breakdown": false 41 | } -------------------------------------------------------------------------------- /Visual-RFT/src/virft/local_scripts/zero3.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | deepspeed_multinode_launcher: standard 5 | offload_optimizer_device: none 6 | offload_param_device: none 7 | zero3_init_flag: true 8 | zero3_save_16bit_model: true 9 | zero_stage: 3 10 | distributed_type: DEEPSPEED 11 | downcast_bf16: 'no' 12 | machine_rank: 0 13 | main_training_function: main 14 | mixed_precision: bf16 15 | num_machines: 1 16 | num_processes: 8 17 | rdzv_backend: static 18 | same_network: true 19 | tpu_env: [] 20 | tpu_use_cluster: false 21 | tpu_use_sudo: false 22 | use_cpu: false 23 | -------------------------------------------------------------------------------- /Visual-RFT/src/virft/local_scripts/zero3_offload.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "optimizer": { 14 | "type": "AdamW", 15 | "params": { 16 | "lr": "auto", 17 | "betas": "auto", 18 | "eps": "auto", 19 | "weight_decay": "auto" 20 | } 21 | }, 22 | "zero_optimization": { 23 | "stage": 3, 24 | "offload_optimizer": { 25 | "device": "cpu", 26 | "pin_memory": true 27 | }, 28 | "offload_param": { 29 | "device": "cpu", 30 | "pin_memory": true 31 | }, 32 | "overlap_comm": true, 33 | "contiguous_gradients": true, 34 | "sub_group_size": 1e9, 35 | "reduce_bucket_size": "auto", 36 | "stage3_prefetch_bucket_size": "auto", 37 | "stage3_param_persistence_threshold": "auto", 38 | "stage3_max_live_parameters": 1e9, 39 | "stage3_max_reuse_distance": 1e9, 40 | "gather_16bit_weights_on_model_save": true 41 | }, 42 | "gradient_accumulation_steps": "auto", 43 | "gradient_clipping": "auto", 44 | "train_batch_size": "auto", 45 | "train_micro_batch_size_per_gpu": "auto", 46 | "steps_per_print": 1e5, 47 | "wall_clock_breakdown": false 48 | } -------------------------------------------------------------------------------- /Visual-RFT/src/virft/setup.cfg: -------------------------------------------------------------------------------- 1 | [isort] 2 | default_section = FIRSTPARTY 3 | ensure_newline_before_comments = True 4 | force_grid_wrap = 0 5 | include_trailing_comma = True 6 | known_first_party = open_r1 7 | known_third_party = 8 | transformers 9 | datasets 10 | fugashi 11 | git 12 | h5py 13 | matplotlib 14 | nltk 15 | numpy 16 | packaging 17 | pandas 18 | psutil 19 | pytest 20 | rouge_score 21 | sacrebleu 22 | seqeval 23 | sklearn 24 | streamlit 25 | torch 26 
| tqdm 27 | 28 | line_length = 119 29 | lines_after_imports = 2 30 | multi_line_output = 3 31 | use_parentheses = True 32 | 33 | [flake8] 34 | ignore = E203, E501, E741, W503, W605 35 | max-line-length = 119 36 | per-file-ignores = 37 | # imported but unused 38 | __init__.py: F401 39 | 40 | [tool:pytest] 41 | doctest_optionflags=NUMBER NORMALIZE_WHITESPACE ELLIPSIS -------------------------------------------------------------------------------- /Visual-RFT/src/virft/slurm/evaluate.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=open-r1-evaluate 3 | #SBATCH --nodes=1 4 | #SBATCH --ntasks-per-node=1 5 | #SBATCH --exclusive 6 | #SBATCH --gres=gpu:8 7 | #SBATCH --partition=hopper-prod 8 | #SBATCH --time=01:59:00 9 | #SBATCH --output=./logs/evaluate/%x-%j.out 10 | #SBATCH --err=./logs/evaluate/%x-%j.err 11 | 12 | # Usage: sbatch slurm/evaluate.slurm deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B aime24 13 | 14 | set -x -e 15 | 16 | source ~/.bashrc 17 | conda activate openr1 18 | module load cuda/12.1 19 | echo "START TIME: $(date)" 20 | echo "PYTHON ENV: $(which python)" 21 | 22 | 23 | NUM_GPUS=8 24 | MODEL=$1 25 | TASK=$2 26 | MODEL_ARGS="pretrained=$MODEL,dtype=float16,data_parallel_size=$NUM_GPUS,max_model_length=32768,gpu_memory_utilisation=0.8" 27 | OUTPUT_DIR=data/evals/$MODEL 28 | 29 | 30 | # force crashing on nccl issues like hanging broadcast 31 | export NCCL_ASYNC_ERROR_HANDLING=1 32 | # export NCCL_DEBUG=INFO 33 | # export NCCL_DEBUG_SUBSYS=COLL 34 | # export NCCL_SOCKET_NTHREADS=1 35 | # export NCCL_NSOCKS_PERTHREAD=1 36 | # export CUDA_LAUNCH_BLOCKING=1 37 | 38 | # Specific configuration optimized for the Hugging Face Compute Cluster 39 | # Be ye warned this may not work on other clusters! 40 | module load cuda/12.1 41 | 42 | lighteval vllm $MODEL_ARGS "custom|$TASK|0|0" \ 43 | --custom-tasks src/open_r1/evaluate.py \ 44 | --use-chat-template \ 45 | --system-prompt="Please reason step by step, and put your final answer within \boxed{}." 
\ 46 | --output-dir $OUTPUT_DIR 47 | 48 | 49 | echo "END TIME: $(date)" 50 | -------------------------------------------------------------------------------- /Visual-RFT/src/virft/src/open_r1/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AMAP-ML/GPG/e096da8ce527a1636272469cee383ba71ec4a511/Visual-RFT/src/virft/src/open_r1/__init__.py -------------------------------------------------------------------------------- /Visual-RFT/src/virft/src/open_r1/trainer/__init__.py: -------------------------------------------------------------------------------- 1 | from .grpo_trainer import Qwen2VLGRPOTrainer 2 | from .vllm_grpo_trainer import Qwen2VLGRPOVLLMTrainer 3 | 4 | __all__ = ["Qwen2VLGRPOTrainer", "Qwen2VLGRPOVLLMTrainer"] 5 | -------------------------------------------------------------------------------- /Visual-RFT/test.sh: -------------------------------------------------------------------------------- 1 | export MASTER_ADDR=127.0.0.1 2 | export MASTER_PORT=21231 3 | export WORLD_SIZE=1 4 | export RANK=0 5 | export GPUS=2 6 | 7 | timestamp=$(date "+%Y%m%d%H%M%S") 8 | 9 | # OMP_NUM_THREADS=4 bash ./src/scripts/2B_base65cate_6k.sh ${timestamp} 10 | 11 | # OMP_NUM_THREADS=4 bash ./src/scripts/2B_aircraft_4_shot.sh ${timestamp} 12 | 13 | OMP_NUM_THREADS=4 bash ./src/scripts/2B_lisa_grounding.sh ${timestamp} 14 | -------------------------------------------------------------------------------- /VisualThinker-R1-Zero/.gitignore: -------------------------------------------------------------------------------- 1 | *.log 2 | *.tmp 3 | *.csv 4 | *.json 5 | *.parquet 6 | *.png 7 | *.jpg 8 | 9 | # dependency directories 10 | src/open-r1-multimodal/src/open_r1/__pycache__/ 11 | 12 | # Python cache 13 | __pycache__/ 14 | 15 | # Egg info 16 | *.egg-info/ 17 | 18 | # wandb 19 | src/open-r1-multimodal/wandb/ 20 | 21 | # folder 22 | src/open-r1-multimodal/trajectories/ 23 | 24 | # outputs 25 | src/open-r1-multimodal/output/ 26 | output 27 | trajectories 28 | model -------------------------------------------------------------------------------- /VisualThinker-R1-Zero/requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate>=1.2.1 2 | bitsandbytes>=0.43.0 3 | black>=24.4.2 4 | datasets>=3.2.0 5 | deepspeed==0.15.4 6 | distilabel[vllm,ray,openai]>=1.5.2 7 | einops>=0.8.0 8 | flake8>=6.0.0 9 | hf_transfer>=0.1.4 10 | huggingface-hub[cli]>=0.19.2,<1.0 11 | isort>=5.12.0 12 | liger_kernel==0.5.2 13 | # lighteval @ git+https://githubfast.com/huggingface/lighteval.git@4f381b352c0e467b5870a97d41cb66b487a2c503#egg=lighteval[math] 14 | math-verify 15 | packaging>=23.0 16 | parameterized>=0.9.0 17 | pytest 18 | safetensors>=0.3.3 19 | sentencepiece>=0.1.99 20 | torch>=2.5.1 21 | # transformers @ git+https://githubfast.com/huggingface/transformers.git@main 22 | trl==0.14.0 23 | vllm==0.6.6.post1 24 | wandb>=0.19.1 25 | pillow 26 | timm -------------------------------------------------------------------------------- /VisualThinker-R1-Zero/setup.sh: -------------------------------------------------------------------------------- 1 | conda create -n visual_thinker python=3.11 2 | conda activate visual_thinker 3 | 4 | # Install the packages in open-r1-multimodal . 
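# A minimal pre-check one might run before the flash-attn step below (a hedged
# sketch, not one of the original setup commands): flash-attn is installed with
# --no-build-isolation, which assumes torch is already importable in the active
# visual_thinker env, e.g.
#   python -c "import torch; print(torch.__version__, torch.cuda.is_available())"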
5 | cd src/open-r1-multimodal 6 | pip install -e ".[dev]" 7 | 8 | # Additional modules 9 | pip install wandb==0.18.3 10 | pip install tensorboardx tensorboard 11 | pip install qwen_vl_utils torchvision 12 | pip install flash-attn --no-build-isolation 13 | 14 | pip install transformers==4.49.0 # for correct DeepSpeed support 15 | pip install duckdb 16 | pip install opencv-python 17 | pip install pandas 18 | pip install math_verify==0.5.2 19 | pip install datasets 20 | pip install accelerate 21 | pip install deepspeed 22 | -------------------------------------------------------------------------------- /VisualThinker-R1-Zero/src/data/SAT/prepare_dataset.sh: -------------------------------------------------------------------------------- 1 | # Download the dataset parquet and rename it 2 | wget -O SAT_train.parquet "https://hf-mirror.com/datasets/array/SAT/resolve/main/SAT_train.parquet?download=true" 3 | 4 | # Create the dataset directory 5 | mkdir -p SAT_images_train 6 | 7 | # Process the dataset 8 | python process_dataset.py -------------------------------------------------------------------------------- /VisualThinker-R1-Zero/src/open-r1-multimodal/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: style quality 2 | 3 | # make sure to test the local checkout in scripts and not the pre-installed one (don't use quotes!) 4 | export PYTHONPATH = src 5 | 6 | check_dirs := src 7 | 8 | style: 9 | black --line-length 119 --target-version py310 $(check_dirs) setup.py 10 | isort $(check_dirs) setup.py 11 | 12 | quality: 13 | black --check --line-length 119 --target-version py310 $(check_dirs) setup.py 14 | isort --check-only $(check_dirs) setup.py 15 | flake8 --max-line-length 119 $(check_dirs) setup.py 16 | 17 | 18 | # Evaluation 19 | 20 | evaluate: 21 | -------------------------------------------------------------------------------- /VisualThinker-R1-Zero/src/open-r1-multimodal/configs/ddp.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | distributed_type: MULTI_GPU 4 | downcast_bf16: 'no' 5 | gpu_ids: all 6 | machine_rank: 0 7 | main_training_function: main 8 | mixed_precision: bf16 9 | num_machines: 1 10 | num_processes: 8 11 | rdzv_backend: static 12 | same_network: true 13 | tpu_env: [] 14 | tpu_use_cluster: false 15 | tpu_use_sudo: false 16 | use_cpu: false 17 | -------------------------------------------------------------------------------- /VisualThinker-R1-Zero/src/open-r1-multimodal/configs/zero2.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | deepspeed_multinode_launcher: standard 5 | offload_optimizer_device: none 6 | offload_param_device: none 7 | zero3_init_flag: false 8 | zero_stage: 2 9 | distributed_type: DEEPSPEED 10 | downcast_bf16: 'no' 11 | # machine_rank: 0 12 | main_training_function: main 13 | mixed_precision: bf16 14 | # num_machines: 1 15 | # num_processes: 2 16 | # main_process_port: 44326 17 | rdzv_backend: static 18 | same_network: true 19 | tpu_env: [] 20 | tpu_use_cluster: false 21 | tpu_use_sudo: false 22 | use_cpu: false -------------------------------------------------------------------------------- /VisualThinker-R1-Zero/src/open-r1-multimodal/configs/zero3.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 |
deepspeed_config: 4 | deepspeed_multinode_launcher: standard 5 | offload_optimizer_device: none 6 | offload_param_device: none 7 | zero3_init_flag: true 8 | zero3_save_16bit_model: true 9 | zero_stage: 3 10 | distributed_type: DEEPSPEED 11 | downcast_bf16: 'no' 12 | machine_rank: 0 13 | main_training_function: main 14 | mixed_precision: bf16 15 | num_machines: 1 16 | num_processes: 4 17 | main_process_port: 22316 18 | rdzv_backend: static 19 | same_network: true 20 | tpu_env: [] 21 | tpu_use_cluster: false 22 | tpu_use_sudo: false 23 | use_cpu: false 24 | -------------------------------------------------------------------------------- /VisualThinker-R1-Zero/src/open-r1-multimodal/prepare_2B_base.sh: -------------------------------------------------------------------------------- 1 | # Prepare base model with chat template for SFT training 2 | git lfs install 3 | git clone https://huggingface.co/Qwen/Qwen2-VL-2B 4 | mv Qwen2-VL-2B Qwen2-VL-2B-Base 5 | 6 | huggingface-cli download Qwen/Qwen2-VL-2B-Instruct chat_template.json tokenizer_config.json --local-dir ./Qwen2-VL-2B-Base 7 | 8 | -------------------------------------------------------------------------------- /VisualThinker-R1-Zero/src/open-r1-multimodal/run_grpo.sh: -------------------------------------------------------------------------------- 1 | export DEBUG_MODE="true" 2 | export LOG_PATH="./debug_log_2b.txt" 3 | 4 | 5 | 6 | torchrun --nproc_per_node="8" \ 7 | --nnodes="1" \ 8 | --node_rank="0" \ 9 | --master_addr="127.0.0.1" \ 10 | --master_port="12345" \ 11 | src/open_r1/grpo.py \ 12 | --output_dir \ 13 | --model_name_or_path \ 14 | --dataset_name \ 15 | --max_prompt_length 1024 \ 16 | --per_device_train_batch_size 1 \ 17 | --gradient_accumulation_steps 2 \ 18 | --logging_steps 1 \ 19 | --bf16 \ 20 | --report_to wandb \ 21 | --gradient_checkpointing false \ 22 | --attn_implementation flash_attention_2 \ 23 | --max_pixels 401408 \ 24 | --num_train_epochs 2 \ 25 | --run_name Qwen2-VL-2B-GRPO-CLEVR-70k \ 26 | --save_steps 100 \ 27 | --save_only_model true -------------------------------------------------------------------------------- /VisualThinker-R1-Zero/src/open-r1-multimodal/run_grpo_SAT.sh: -------------------------------------------------------------------------------- 1 | timestamp=$1 2 | echo "timestamp: ${timestamp}" 3 | pg_name="gpg" 4 | adjust_gd="true" 5 | min_inverse_alpha="0.4" 6 | 7 | # Wandb 8 | export WANDB_PROJECT="VisualThinker-R1-Zero" 9 | 10 | DATA_PATH=SAT 11 | CKPT_PATH=Qwen2-VL-2B 12 | 13 | RUN_NAME=${DATA_PATH##*/}_${CKPT_PATH##*/}_${timestamp} 14 | SAVE_PATH="./output/${pg_name}/${RUN_NAME}" 15 | mkdir -p ${SAVE_PATH} 16 | export DEBUG_MODE="true" # Enable Debug if you want to see the rollout of model during RL 17 | export LOG_PATH="./${SAVE_PATH}/debug_log.txt" 18 | # export CUDA_VISIBLE_DEVICES=$(seq -s, 0 $((GPUS-1)) | sed 's/,$//g') 19 | 20 | accelerate launch --config_file=src/open-r1-multimodal/configs/zero2.yaml \ 21 | --main_process_ip ${MASTER_ADDR} --main_process_port ${MASTER_PORT} \ 22 | --num_machines ${WORLD_SIZE} --machine_rank ${RANK} --num_processes ${GPUS} \ 23 | src/open-r1-multimodal/src/open_r1/grpo.py \ 24 | --pg_name ${pg_name} \ 25 | --adjust_gd ${adjust_gd} \ 26 | --min_inverse_alpha ${min_inverse_alpha} \ 27 | --output_dir ${SAVE_PATH} \ 28 | --model_name_or_path ${CKPT_PATH} \ 29 | --dataset_name ${DATA_PATH} \ 30 | --max_prompt_length 1024 \ 31 | --max_completion_length 700 \ 32 | --per_device_train_batch_size 1 \ 33 | --gradient_accumulation_steps 1 \ 34 | --logging_steps 1 \ 
35 | --bf16 \ 36 | --gradient_checkpointing 1 \ 37 | --attn_implementation flash_attention_2 \ 38 | --max_pixels 401408 \ 39 | --num_train_epochs 2 \ 40 | --run_name ${RUN_NAME} \ 41 | --save_steps 100 \ 42 | --save_only_model true \ 43 | --report_to wandb \ 44 | 2>&1 | tee -a "./${SAVE_PATH}/training_log.log" 45 | -------------------------------------------------------------------------------- /VisualThinker-R1-Zero/src/open-r1-multimodal/run_sft.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0,1,2,3 2 | 3 | accelerate launch --config_file=configs/zero3.yaml src/open_r1/sft.py \ 4 | --model_name_or_path \ 5 | --dataset_name \ 6 | --learning_rate 2.0e-5 \ 7 | --num_train_epochs 2 \ 8 | --packing True \ 9 | --max_seq_length 1024 \ 10 | --per_device_train_batch_size 1 \ 11 | --per_device_eval_batch_size 4 \ 12 | --gradient_accumulation_steps 2 \ 13 | --gradient_checkpointing True \ 14 | --report_to wandb \ 15 | --bf16 True \ 16 | --logging_steps 5 \ 17 | --eval_strategy no \ 18 | --output_dir \ 19 | --run_name -------------------------------------------------------------------------------- /VisualThinker-R1-Zero/src/open-r1-multimodal/run_sft_SAT.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0,1,2,3 2 | 3 | accelerate launch --config_file=configs/zero3.yaml src/open_r1/sft.py \ 4 | --model_name_or_path Qwen2-VL-2B-Base \ 5 | --dataset_name SAT \ 6 | --learning_rate 2.0e-5 \ 7 | --num_train_epochs 2 \ 8 | --packing True \ 9 | --max_seq_length 1024 \ 10 | --per_device_train_batch_size 1 \ 11 | --per_device_eval_batch_size 4 \ 12 | --gradient_accumulation_steps 2 \ 13 | --gradient_checkpointing True \ 14 | --report_to wandb \ 15 | --bf16 True \ 16 | --logging_steps 5 \ 17 | --eval_strategy no \ 18 | --save_steps 300 \ 19 | --output_dir outputs/Qwen2_VL-2B-SFT \ 20 | --run_name Qwen2_VL-2B-SFT-SAT -------------------------------------------------------------------------------- /VisualThinker-R1-Zero/src/open-r1-multimodal/setup.cfg: -------------------------------------------------------------------------------- 1 | [isort] 2 | default_section = FIRSTPARTY 3 | ensure_newline_before_comments = True 4 | force_grid_wrap = 0 5 | include_trailing_comma = True 6 | known_first_party = open_r1 7 | known_third_party = 8 | transformers 9 | datasets 10 | fugashi 11 | git 12 | h5py 13 | matplotlib 14 | nltk 15 | numpy 16 | packaging 17 | pandas 18 | psutil 19 | pytest 20 | rouge_score 21 | sacrebleu 22 | seqeval 23 | sklearn 24 | streamlit 25 | torch 26 | tqdm 27 | 28 | line_length = 119 29 | lines_after_imports = 2 30 | multi_line_output = 3 31 | use_parentheses = True 32 | 33 | [flake8] 34 | ignore = E203, E501, E741, W503, W605 35 | max-line-length = 119 36 | per-file-ignores = 37 | # imported but unused 38 | __init__.py: F401 39 | 40 | [tool:pytest] 41 | doctest_optionflags=NUMBER NORMALIZE_WHITESPACE ELLIPSIS -------------------------------------------------------------------------------- /VisualThinker-R1-Zero/src/open-r1-multimodal/src/open_r1/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AMAP-ML/GPG/e096da8ce527a1636272469cee383ba71ec4a511/VisualThinker-R1-Zero/src/open-r1-multimodal/src/open_r1/__init__.py -------------------------------------------------------------------------------- /VisualThinker-R1-Zero/src/open-r1-multimodal/src/open_r1/trainer/__init__.py: 
-------------------------------------------------------------------------------- 1 | from .grpo_trainer import Qwen2VLGRPOTrainer 2 | 3 | 4 | __all__ = ["Qwen2VLGRPOTrainer"] 5 | -------------------------------------------------------------------------------- /VisualThinker-R1-Zero/test.sh: -------------------------------------------------------------------------------- 1 | export MASTER_ADDR=127.0.0.1 2 | export MASTER_PORT=21232 3 | export WORLD_SIZE=1 4 | export RANK=0 5 | export GPUS=2 6 | 7 | timestamp=$(date "+%Y%m%d%H%M%S") 8 | 9 | # OMP_NUM_THREADS=4 bash ./src/scripts/2B_base65cate_6k.sh ${timestamp} 10 | 11 | # OMP_NUM_THREADS=4 bash ./src/scripts/2B_aircraft_4_shot.sh ${timestamp} 12 | 13 | # OMP_NUM_THREADS=4 bash ./src/scripts/2B_lisa_grounding.sh ${timestamp} 14 | OMP_NUM_THREADS=4 bash ./src/open-r1-multimodal/run_grpo_SAT.sh ${timestamp} 15 | -------------------------------------------------------------------------------- /docs/images/GPG.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AMAP-ML/GPG/e096da8ce527a1636272469cee383ba71ec4a511/docs/images/GPG.png -------------------------------------------------------------------------------- /open-r1/.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "pip" 4 | directory: "/" 5 | schedule: 6 | interval: "weekly" 7 | - package-ecosystem: "github-actions" 8 | directory: "/" 9 | schedule: 10 | interval: "weekly" 11 | -------------------------------------------------------------------------------- /open-r1/.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - v*-release 8 | pull_request: 9 | branches: 10 | - main 11 | 12 | jobs: 13 | 14 | tests: 15 | name: Run tests and quality checks 16 | runs-on: ubuntu-latest 17 | steps: 18 | - name: Checkout code 19 | uses: actions/checkout@v4 20 | - name: Setup Python environment 21 | uses: actions/setup-python@v5 22 | with: 23 | python-version: 3.10.10 24 | - name: Install dependencies 25 | run: | 26 | python -m pip install --upgrade pip 27 | python -m pip install ".[quality,tests]" 28 | - name: Code quality 29 | run: | 30 | make quality 31 | - name: Run tests 32 | run: | 33 | make test 34 | 35 | -------------------------------------------------------------------------------- /open-r1/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: style quality 2 | 3 | # make sure to test the local checkout in scripts and not the pre-installed one (don't use quotes!) 4 | export PYTHONPATH = src 5 | 6 | check_dirs := src tests 7 | 8 | 9 | # dev dependencies 10 | install: 11 | uv venv openr1 --python 3.11 && . 
openr1/bin/activate && uv pip install --upgrade pip 12 | uv pip install vllm==0.7.2 13 | uv pip install setuptools 14 | uv pip install flash-attn --no-build-isolation 15 | GIT_LFS_SKIP_SMUDGE=1 uv pip install -e ".[dev]" 16 | 17 | style: 18 | ruff format --line-length 119 --target-version py310 $(check_dirs) setup.py 19 | isort $(check_dirs) setup.py 20 | 21 | quality: 22 | ruff check --line-length 119 --target-version py310 $(check_dirs) setup.py 23 | isort --check-only $(check_dirs) setup.py 24 | flake8 --max-line-length 119 $(check_dirs) setup.py 25 | 26 | test: 27 | pytest -sv --ignore=tests/slow/ tests/ 28 | 29 | slow_test: 30 | pytest -sv -vv tests/slow/ 31 | 32 | # Evaluation 33 | 34 | evaluate: 35 | $(eval PARALLEL_ARGS := $(if $(PARALLEL),$(shell \ 36 | if [ "$(PARALLEL)" = "data" ]; then \ 37 | echo "data_parallel_size=$(NUM_GPUS)"; \ 38 | elif [ "$(PARALLEL)" = "tensor" ]; then \ 39 | echo "tensor_parallel_size=$(NUM_GPUS)"; \ 40 | fi \ 41 | ),)) 42 | $(if $(filter tensor,$(PARALLEL)),export VLLM_WORKER_MULTIPROC_METHOD=spawn &&,) \ 43 | MODEL_ARGS="pretrained=$(MODEL),dtype=bfloat16,$(PARALLEL_ARGS),max_model_length=32768,gpu_memory_utilization=0.8,generation_parameters={max_new_tokens:32768,temperature:0.6,top_p:0.95}" && \ 44 | if [ "$(TASK)" = "lcb" ]; then \ 45 | lighteval vllm $$MODEL_ARGS "extended|lcb:codegeneration|0|0" \ 46 | --use-chat-template \ 47 | --output-dir data/evals/$(MODEL); \ 48 | else \ 49 | lighteval vllm $$MODEL_ARGS "custom|$(TASK)|0|0" \ 50 | --custom-tasks src/open_r1/evaluate.py \ 51 | --use-chat-template \ 52 | --output-dir data/evals/$(MODEL); \ 53 | fi 54 | -------------------------------------------------------------------------------- /open-r1/assets/plan-of-attack.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AMAP-ML/GPG/e096da8ce527a1636272469cee383ba71ec4a511/open-r1/assets/plan-of-attack.png -------------------------------------------------------------------------------- /open-r1/recipes/Mistral-Small-24B-Instruct-2501/sft/config_openr1_math.yaml: -------------------------------------------------------------------------------- 1 | # To start the training, run the following command: 2 | # sbatch -N 4 --job-name=mistral_sft slurm/train.slurm Mistral-Small-24B-Instruct-2501 sft numina zero3 3 | 4 | model_name_or_path: mistralai/Mistral-Small-24B-Instruct-2501 5 | model_revision: main 6 | torch_dtype: bfloat16 7 | attn_implementation: flash_attention_2 8 | 9 | # Data training arguments 10 | # dataset_name: yentinglin/s1K-1.1-trl-format 11 | dataset_name: yentinglin/OpenR1-Math-220k-trl-format 12 | preprocessing_num_workers: 8 13 | 14 | # SFT trainer config 15 | bf16: true 16 | do_eval: true 17 | eval_strategy: no 18 | gradient_accumulation_steps: 4 19 | gradient_checkpointing: true 20 | gradient_checkpointing_kwargs: 21 | use_reentrant: false 22 | hub_model_id: Mistral-Small-24B-Instruct-2501-Open-R1-Distill 23 | hub_strategy: every_save 24 | learning_rate: 2.0e-05 25 | log_level: info 26 | logging_steps: 1 27 | logging_strategy: steps 28 | lr_scheduler_type: cosine 29 | packing: true 30 | max_length: 32768 31 | max_steps: -1 32 | num_train_epochs: 5 33 | output_dir: data/Mistral-Small-24B-Instruct-2501-Open-R1-Distill 34 | overwrite_output_dir: true 35 | per_device_eval_batch_size: 1 36 | per_device_train_batch_size: 1 37 | push_to_hub: true 38 | report_to: 39 | - wandb 40 | save_strategy: epoch 41 | seed: 42 42 | warmup_ratio: 0.1 43 | 
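The recipe YAMLs in this section (the Mistral SFT config above and the OlympicCoder / OpenR1-Qwen / Qwen2.5 configs that follow) are plain config files consumed by open_r1's sft.py or grpo.py through TRL's config parser. A minimal single-node launch sketch, hedged: the accelerate config path below follows the usual open-r1 layout and is an assumption here, and the recipe's own header points at slurm/train.slurm for the actual 4-node run:

ACCELERATE_LOG_LEVEL=info accelerate launch \
    --config_file recipes/accelerate_configs/zero3.yaml \
    src/open_r1/sft.py \
    --config recipes/Mistral-Small-24B-Instruct-2501/sft/config_openr1_math.yaml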
-------------------------------------------------------------------------------- /open-r1/recipes/OlympicCoder-32B/sft/config_v00.00.yaml: -------------------------------------------------------------------------------- 1 | # Config for 16 nodes of 8 H100s with FSDP1 2 | # Model arguments 3 | model_name_or_path: Qwen/Qwen2.5-Coder-32B-Instruct 4 | model_revision: main 5 | torch_dtype: bfloat16 6 | attn_implementation: flash_attention_2 7 | 8 | # Data training arguments 9 | dataset_name: open-r1/codeforces-cots 10 | dataset_config: solutions_decontaminated 11 | dataset_num_proc: 12 12 | 13 | # SFT trainer config 14 | bf16: true 15 | do_eval: false 16 | eval_strategy: 'no' 17 | gradient_accumulation_steps: 1 18 | gradient_checkpointing: true 19 | gradient_checkpointing_kwargs: 20 | use_reentrant: false 21 | hub_always_push: true 22 | hub_model_id: OlympicCoder-32B 23 | hub_strategy: every_save 24 | learning_rate: 4.0e-05 25 | log_level: info 26 | logging_steps: 1 27 | logging_strategy: steps 28 | lr_scheduler_type: cosine_with_min_lr 29 | lr_scheduler_kwargs: 30 | min_lr_rate: 0.1 31 | packing: false 32 | max_grad_norm: 0.2 33 | max_length: 22528 # we were unable to train at 32k due to OOM. See https://github.com/huggingface/transformers/issues/35983 for context parallelism support. 34 | max_steps: -1 35 | num_train_epochs: 10 36 | optim: paged_adamw_8bit 37 | output_dir: data/OlympicCoder-32B 38 | overwrite_output_dir: true 39 | per_device_eval_batch_size: 1 40 | per_device_train_batch_size: 1 41 | push_to_hub: true 42 | report_to: 43 | - wandb 44 | save_only_model: true # needed to bypass FSDP errors with saving paged optimizers 45 | save_strategy: epoch 46 | save_total_limit: 1 47 | seed: 42 48 | use_liger: false # fails on multi-node 49 | warmup_ratio: 0.03 -------------------------------------------------------------------------------- /open-r1/recipes/OlympicCoder-7B/sft/config_v00.00.yaml: -------------------------------------------------------------------------------- 1 | # Config for 1 node of 8 H100s with DeepSpeed ZeRO-3 2 | # Model arguments 3 | model_name_or_path: Qwen/Qwen2.5-Coder-7B-Instruct 4 | model_revision: main 5 | torch_dtype: bfloat16 6 | attn_implementation: flash_attention_2 7 | 8 | # Data training arguments 9 | dataset_name: open-r1/codeforces-cots 10 | dataset_config: solutions_decontaminated 11 | dataset_num_proc: 48 12 | 13 | # SFT trainer config 14 | bf16: true 15 | do_eval: false 16 | eval_strategy: 'no' 17 | gradient_accumulation_steps: 8 18 | gradient_checkpointing: true 19 | gradient_checkpointing_kwargs: 20 | use_reentrant: false 21 | hub_model_id: open-r1/OlympicCoder-7B 22 | hub_strategy: every_save 23 | learning_rate: 1.0e-05 24 | log_level: info 25 | logging_steps: 1 26 | logging_strategy: steps 27 | lr_scheduler_type: cosine_with_min_lr 28 | lr_scheduler_kwargs: 29 | min_lr_rate: 0.1 30 | packing: false 31 | max_grad_norm: 0.2 32 | max_length: 32768 33 | max_steps: -1 34 | num_train_epochs: 10 35 | output_dir: data/OlympicCoder-7B 36 | overwrite_output_dir: true 37 | per_device_eval_batch_size: 1 38 | per_device_train_batch_size: 2 39 | push_to_hub: true 40 | report_to: 41 | - wandb 42 | save_strategy: epoch 43 | save_total_limit: 1 44 | seed: 42 45 | use_liger: true 46 | warmup_ratio: 0.03 -------------------------------------------------------------------------------- /open-r1/recipes/OpenR1-Qwen-7B/gpg/config_v0.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: 
data/OpenR1-Qwen-7B-SFT-2nodes/checkpoint-1611 #deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: datas/open-rs/ #datas/OpenR1-Math-220k/ #knoveleng/open-rs 9 | system_prompt: "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer, and put your final answer within \\boxed{{}} . The reasoning process and answer are enclosed within and tags, respectively, i.e., reasoning process here answer here . Note that respond by English, NOT use other languages." 10 | 11 | # GRPO trainer config 12 | bf16: true 13 | use_vllm: true 14 | vllm_device: auto 15 | vllm_enforce_eager: true 16 | vllm_gpu_memory_utilization: 0.7 17 | vllm_max_model_len: 4608 18 | do_eval: false 19 | gradient_accumulation_steps: 2 20 | gradient_checkpointing: true 21 | gradient_checkpointing_kwargs: 22 | use_reentrant: false 23 | hub_model_id: OpenRS-GRPO 24 | hub_strategy: every_save 25 | learning_rate: 1.0e-06 26 | log_completions: false 27 | log_level: info 28 | logging_first_step: true 29 | logging_steps: 1 30 | logging_strategy: steps 31 | lr_scheduler_type: cosine_with_min_lr 32 | lr_scheduler_kwargs: 33 | min_lr_rate: 0.1 34 | max_prompt_length: 512 35 | max_completion_length: 3584 36 | max_steps: 500 37 | num_generations: 7 38 | num_train_epochs: 1 39 | output_dir: data/OpenRS-GPG-7B 40 | overwrite_output_dir: true 41 | per_device_eval_batch_size: 6 42 | per_device_train_batch_size: 2 43 | push_to_hub: false 44 | report_to: 45 | - tensorboard 46 | reward_funcs: 47 | - format 48 | - cosine 49 | reward_weights: 50 | - 1.0 51 | - 2.0 52 | save_strategy: "steps" 53 | save_steps: 50 54 | seed: 42 55 | temperature: 0.7 56 | warmup_ratio: 0.1 -------------------------------------------------------------------------------- /open-r1/recipes/OpenR1-Qwen-7B/gpg/config_v0_ds.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: models/DeepSeek-R1-Distill-Qwen-7B #deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: datas/open-rs/ #datas/OpenR1-Math-220k/ #knoveleng/open-rs 9 | system_prompt: "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer, and put your final answer within \\boxed{{}} . The reasoning process and answer are enclosed within and tags, respectively, i.e., reasoning process here answer here . Note that respond by English, NOT use other languages." 
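# Hedged note on the generation batch arithmetic assumed by this recipe, for a single
# 8-GPU node (not part of the original file): with use_vllm: true, TRL's GRPO/GPG
# trainer typically dedicates one GPU to vLLM, leaving 7 training processes, and
#   7 processes * per_device_train_batch_size (2) = 14 prompts per step,
# which must be evenly divisible by num_generations -- hence 7 generations per prompt
# below rather than 8.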
10 | 11 | # GRPO trainer config 12 | bf16: true 13 | use_vllm: true 14 | vllm_device: auto 15 | vllm_enforce_eager: true 16 | vllm_gpu_memory_utilization: 0.7 17 | vllm_max_model_len: 4608 18 | do_eval: false 19 | gradient_accumulation_steps: 2 20 | gradient_checkpointing: true 21 | gradient_checkpointing_kwargs: 22 | use_reentrant: false 23 | hub_model_id: OpenRS-GRPO 24 | hub_strategy: every_save 25 | learning_rate: 1.0e-06 26 | log_completions: false 27 | log_level: info 28 | logging_first_step: true 29 | logging_steps: 1 30 | logging_strategy: steps 31 | lr_scheduler_type: cosine_with_min_lr 32 | lr_scheduler_kwargs: 33 | min_lr_rate: 0.1 34 | max_prompt_length: 512 35 | max_completion_length: 3584 36 | max_steps: 500 37 | num_generations: 7 38 | num_train_epochs: 1 39 | output_dir: data/OpenRS-GPG-Qwen-7B-DS 40 | overwrite_output_dir: true 41 | per_device_eval_batch_size: 6 42 | per_device_train_batch_size: 2 43 | push_to_hub: false 44 | report_to: 45 | - tensorboard 46 | reward_funcs: 47 | - format 48 | - cosine 49 | reward_weights: 50 | - 1.0 51 | - 2.0 52 | save_strategy: "steps" 53 | save_steps: 50 54 | seed: 42 55 | temperature: 0.7 56 | warmup_ratio: 0.1 -------------------------------------------------------------------------------- /open-r1/recipes/OpenR1-Qwen-7B/gpg/config_v1.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: data/OpenR1-Qwen-7B-SFT-2nodes/checkpoint-1611 #deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: datas/open-rs/ #datas/OpenR1-Math-220k/ #knoveleng/open-rs 9 | system_prompt: "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer, and put your final answer within \\boxed{{}} . The reasoning process and answer are enclosed within and tags, respectively, i.e., reasoning process here answer here . Note that respond by English, NOT use other languages." 
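# Hedged note (not part of the original file): this "wo-std" variant appears to differ
# from config_v0 mainly via scale_rewards: false at the end of the file. With reward
# scaling off, the per-completion advantage is roughly
#   A_i = r_i - mean(r)   over the group,
# instead of (r_i - mean(r)) / std(r); group rewards are centred but not divided by
# their standard deviation.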
10 | 11 | # GRPO trainer config 12 | bf16: true 13 | use_vllm: true 14 | vllm_device: auto 15 | vllm_enforce_eager: true 16 | vllm_gpu_memory_utilization: 0.7 17 | vllm_max_model_len: 4608 18 | do_eval: false 19 | gradient_accumulation_steps: 2 20 | gradient_checkpointing: true 21 | gradient_checkpointing_kwargs: 22 | use_reentrant: false 23 | hub_model_id: OpenRS-GRPO 24 | hub_strategy: every_save 25 | learning_rate: 1.0e-06 26 | log_completions: false 27 | log_level: info 28 | logging_first_step: true 29 | logging_steps: 1 30 | logging_strategy: steps 31 | lr_scheduler_type: cosine_with_min_lr 32 | lr_scheduler_kwargs: 33 | min_lr_rate: 0.1 34 | max_prompt_length: 512 35 | max_completion_length: 3584 36 | max_steps: 500 37 | num_generations: 7 38 | num_train_epochs: 1 39 | output_dir: data/OpenRS-GPG-7B-wo-std 40 | overwrite_output_dir: true 41 | per_device_eval_batch_size: 6 42 | per_device_train_batch_size: 2 43 | push_to_hub: false 44 | report_to: 45 | - tensorboard 46 | reward_funcs: 47 | - format 48 | - cosine 49 | reward_weights: 50 | - 1.0 51 | - 2.0 52 | save_strategy: "steps" 53 | save_steps: 50 54 | seed: 42 55 | temperature: 0.7 56 | warmup_ratio: 0.1 57 | scale_rewards: false -------------------------------------------------------------------------------- /open-r1/recipes/OpenR1-Qwen-7B/gpg/config_v1_ds.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: models/DeepSeek-R1-Distill-Qwen-7B #deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: datas/open-rs/ #datas/OpenR1-Math-220k/ #knoveleng/open-rs 9 | system_prompt: "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer, and put your final answer within \\boxed{{}} . The reasoning process and answer are enclosed within and tags, respectively, i.e., reasoning process here answer here . Note that respond by English, NOT use other languages." 
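# Hedged note (not part of the original file): "cosine" among the reward_funcs below
# refers to open-r1's length-aware cosine-scaled reward, which (as assumed here)
# rewards correct completions more when they are shorter and penalises incorrect ones
# less when they are longer, interpolating with a cosine schedule up to the maximum
# completion length.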
10 | 11 | # GRPO trainer config 12 | bf16: true 13 | use_vllm: true 14 | vllm_device: auto 15 | vllm_enforce_eager: true 16 | vllm_gpu_memory_utilization: 0.7 17 | vllm_max_model_len: 4608 18 | do_eval: false 19 | gradient_accumulation_steps: 2 20 | gradient_checkpointing: true 21 | gradient_checkpointing_kwargs: 22 | use_reentrant: false 23 | hub_model_id: OpenRS-GRPO 24 | hub_strategy: every_save 25 | learning_rate: 1.0e-06 26 | log_completions: false 27 | log_level: info 28 | logging_first_step: true 29 | logging_steps: 1 30 | logging_strategy: steps 31 | lr_scheduler_type: constant_with_warmup 32 | max_prompt_length: 512 33 | max_completion_length: 4096 34 | num_generations: 6 35 | num_train_epochs: 1 36 | output_dir: data/OpenRS-GPG-Qwen-7B-DS-lr 37 | overwrite_output_dir: true 38 | per_device_eval_batch_size: 6 39 | per_device_train_batch_size: 2 40 | push_to_hub: false 41 | report_to: 42 | - tensorboard 43 | reward_funcs: 44 | - format_v2 45 | - cosine 46 | reward_weights: 47 | - 1.0 48 | - 2.0 49 | save_strategy: "steps" 50 | save_steps: 50 51 | seed: 42 52 | temperature: 0.7 53 | warmup_ratio: 0.1 -------------------------------------------------------------------------------- /open-r1/recipes/OpenR1-Qwen-7B/gpg/config_v2_ds.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: models/DeepSeek-R1-Distill-Qwen-7B #deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: datas/open-rs/ #datas/OpenR1-Math-220k/ #knoveleng/open-rs 9 | system_prompt: "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer, and put your final answer within \\boxed{{}} . The reasoning process and answer are enclosed within and tags, respectively, i.e., reasoning process here answer here . Note that respond by English, NOT use other languages." 
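# Hedged note (not part of the original file): with the reward_funcs and reward_weights
# listed further down, the scalar reward for each completion is the weighted sum of the
# individual reward functions, i.e. roughly
#   reward = 0.2 * format_v2 + 1.0 * accuracy
# so formatting acts as a small shaping term next to answer correctness.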
10 | 11 | # GRPO trainer config 12 | bf16: true 13 | use_vllm: false 14 | vllm_device: auto 15 | vllm_enforce_eager: true 16 | vllm_gpu_memory_utilization: 0.7 17 | vllm_max_model_len: 4608 18 | do_eval: false 19 | gradient_accumulation_steps: 1 20 | gradient_checkpointing: true 21 | gradient_checkpointing_kwargs: 22 | use_reentrant: false 23 | hub_model_id: OpenRS-GRPO 24 | hub_strategy: every_save 25 | learning_rate: 1.0e-06 26 | log_completions: false 27 | log_level: info 28 | logging_first_step: true 29 | logging_steps: 1 30 | logging_strategy: steps 31 | lr_scheduler_type: constant_with_warmup 32 | max_prompt_length: 512 33 | max_completion_length: 3584 34 | num_generations: 8 35 | num_train_epochs: 1 36 | output_dir: data/OpenRS-GPG-Qwen-7B-DS-v2 37 | overwrite_output_dir: true 38 | per_device_eval_batch_size: 6 39 | per_device_train_batch_size: 2 40 | push_to_hub: false 41 | report_to: 42 | - tensorboard 43 | reward_funcs: 44 | - format_v2 45 | - accuracy 46 | reward_weights: 47 | - 0.2 48 | - 1.0 49 | save_strategy: "steps" 50 | save_steps: 50 51 | seed: 42 52 | temperature: 0.7 53 | warmup_ratio: 0.1 -------------------------------------------------------------------------------- /open-r1/recipes/OpenR1-Qwen-7B/sft/config.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | # You need to download the model and manually change the rope to 300k and max_position_embeddings to 32768 3 | # the config file should match https://huggingface.co/open-r1/OpenR1-Qwen-7B/blob/main/config.json 4 | model_name_or_path: Qwen/Qwen2.5-Math-7B-Instruct 5 | model_revision: main 6 | torch_dtype: bfloat16 7 | attn_implementation: sdpa 8 | 9 | # Data training arguments 10 | dataset_name: open-r1/OpenR1-Math-220k 11 | dataset_num_proc: 48 12 | 13 | #SFT hyperparam 14 | max_length: 32768 15 | weight_decay: 0.0001 16 | optim: adamw_torch 17 | lr_scheduler_type: linear 18 | warmup_ratio: 0.1 19 | learning_rate: 5.0e-05 20 | gradient_accumulation_steps: 2 21 | per_device_eval_batch_size: 1 22 | per_device_train_batch_size: 1 23 | 24 | # SFT trainer config 25 | max_steps: -1 26 | num_train_epochs: 3 27 | bf16: true 28 | do_eval: false 29 | use_liger_kernel: true 30 | eval_strategy: 'no' 31 | gradient_checkpointing: true 32 | gradient_checkpointing_kwargs: 33 | use_reentrant: false 34 | hub_model_id: OpenR1-Qwen-7B-SFT 35 | hub_strategy: every_save 36 | log_level: info 37 | logging_steps: 5 38 | logging_strategy: steps 39 | packing: true 40 | output_dir: data/OpenR1-Qwen-7B-SFT 41 | overwrite_output_dir: true 42 | push_to_hub: true 43 | report_to: 44 | - wandb 45 | save_strategy: "steps" 46 | save_steps: 500 47 | save_total_limit: 1 48 | seed: 42 -------------------------------------------------------------------------------- /open-r1/recipes/OpenR1-Qwen-7B/sft/config_v0.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | # You need to download the model and manually change the rope to 300k and max_position_embeddings to 32768 3 | # the config file should match https://huggingface.co/open-r1/OpenR1-Qwen-7B/blob/main/config.json 4 | model_name_or_path: models/Qwen2.5-Math-7B-Instruct #Qwen/Qwen2.5-Math-7B-Instruct 5 | model_revision: main 6 | torch_dtype: bfloat16 7 | attn_implementation: sdpa 8 | 9 | # Data training arguments 10 | dataset_name: datas/OpenR1-Math-220k/ #open-r1/OpenR1-Math-220k 11 | dataset_num_proc: 48 12 | 13 | #SFT hyperparam 14 | max_length: 32768 15 | weight_decay: 0.0001 
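# Hedged sketch (not part of the original file): the "rope to 300k" edit mentioned in
# the header above amounts to changing two fields in the downloaded model's
# config.json before training, roughly
#   "rope_theta": 300000.0,
#   "max_position_embeddings": 32768
# so that the packed 32768-token sequences (max_length above) fit in the context window.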
16 | optim: adamw_torch 17 | lr_scheduler_type: linear 18 | warmup_ratio: 0.1 19 | learning_rate: 5.0e-05 20 | gradient_accumulation_steps: 1 21 | per_device_eval_batch_size: 1 22 | per_device_train_batch_size: 1 23 | 24 | # SFT trainer config 25 | max_steps: -1 26 | num_train_epochs: 3 27 | bf16: true 28 | do_eval: false 29 | use_liger: false 30 | use_liger_kernel: false 31 | eval_strategy: 'no' 32 | gradient_checkpointing: true 33 | gradient_checkpointing_kwargs: 34 | use_reentrant: false 35 | hub_model_id: OpenR1-Qwen-7B-SFT 36 | hub_strategy: every_save 37 | log_level: info 38 | logging_steps: 5 39 | logging_strategy: steps 40 | packing: true 41 | output_dir: data/OpenR1-Qwen-7B-SFT 42 | overwrite_output_dir: true 43 | push_to_hub: false 44 | report_to: 45 | - tensorboard 46 | save_strategy: "steps" 47 | save_steps: 500 48 | save_total_limit: 3 49 | seed: 42 -------------------------------------------------------------------------------- /open-r1/recipes/OpenR1-Qwen-7B/sft/config_v1.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | # You need to download the model and manually change the rope to 300k and max_position_embeddings to 32768 3 | # the config file should match https://huggingface.co/open-r1/OpenR1-Qwen-7B/blob/main/config.json 4 | model_name_or_path: models/Qwen2.5-Math-7B-Instruct #Qwen/Qwen2.5-Math-7B-Instruct 5 | model_revision: main 6 | torch_dtype: bfloat16 7 | attn_implementation: sdpa 8 | 9 | # Data training arguments 10 | dataset_name: datas/OpenR1-Math-220k/ #open-r1/OpenR1-Math-220k 11 | dataset_num_proc: 48 12 | 13 | #SFT hyperparam 14 | max_length: 32768 15 | weight_decay: 0.0001 16 | optim: adamw_torch 17 | lr_scheduler_type: linear 18 | warmup_ratio: 0.1 19 | learning_rate: 1e-04 20 | gradient_accumulation_steps: 1 21 | per_device_eval_batch_size: 1 22 | per_device_train_batch_size: 1 23 | 24 | # SFT trainer config 25 | max_steps: -1 26 | num_train_epochs: 3 27 | bf16: true 28 | do_eval: false 29 | use_liger: false 30 | use_liger_kernel: false 31 | eval_strategy: 'no' 32 | gradient_checkpointing: true 33 | gradient_checkpointing_kwargs: 34 | use_reentrant: false 35 | hub_model_id: OpenR1-Qwen-7B-SFT 36 | hub_strategy: every_save 37 | log_level: info 38 | logging_steps: 5 39 | logging_strategy: steps 40 | packing: true 41 | output_dir: data/OpenR1-Qwen-7B-SFT-2nodes 42 | overwrite_output_dir: true 43 | push_to_hub: false 44 | report_to: 45 | - tensorboard 46 | save_strategy: "steps" 47 | save_steps: 500 48 | save_total_limit: 3 49 | seed: 42 -------------------------------------------------------------------------------- /open-r1/recipes/OpenR1-Qwen-7B/sft/config_v2.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | # You need to download the model and manually change the rope to 300k and max_position_embeddings to 32768 3 | # the config file should match https://huggingface.co/open-r1/OpenR1-Qwen-7B/blob/main/config.json 4 | model_name_or_path: models/Qwen2.5-Math-7B-Instruct #Qwen/Qwen2.5-Math-7B-Instruct 5 | model_revision: main 6 | torch_dtype: bfloat16 7 | attn_implementation: sdpa 8 | 9 | # Data training arguments 10 | dataset_name: datas/OpenR1-Math-220k/ #open-r1/OpenR1-Math-220k 11 | dataset_num_proc: 48 12 | 13 | #SFT hyperparam 14 | max_length: 32768 15 | weight_decay: 0.0001 16 | optim: adamw_torch 17 | lr_scheduler_type: linear 18 | warmup_ratio: 0.1 19 | learning_rate: 5e-05 20 | gradient_accumulation_steps: 1 21 | 
per_device_eval_batch_size: 1 22 | per_device_train_batch_size: 1 23 | 24 | # SFT trainer config 25 | max_steps: -1 26 | num_train_epochs: 3 27 | bf16: true 28 | do_eval: false 29 | use_liger: false 30 | use_liger_kernel: false 31 | eval_strategy: 'no' 32 | gradient_checkpointing: true 33 | gradient_checkpointing_kwargs: 34 | use_reentrant: false 35 | hub_model_id: OpenR1-Qwen-7B-SFT 36 | hub_strategy: every_save 37 | log_level: info 38 | logging_steps: 5 39 | logging_strategy: steps 40 | packing: true 41 | output_dir: data/OpenR1-Qwen-7B-SFT-1nodes 42 | overwrite_output_dir: true 43 | push_to_hub: false 44 | report_to: 45 | - tensorboard 46 | save_strategy: "steps" 47 | save_steps: 500 48 | save_total_limit: 6 49 | seed: 42 -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-1.5B-Instruct/gpg/config_demo_v1.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: models/Qwen2.5-1.5B-Instruct 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: datas/OpenR1-Math-220k/ 9 | system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: \n...\n\n\n...\n" 10 | 11 | # GRPO trainer config 12 | bf16: true 13 | use_vllm: false 14 | vllm_device: auto 15 | vllm_gpu_memory_utilization: 0.7 16 | do_eval: false 17 | gradient_accumulation_steps: 1 18 | gradient_checkpointing: true 19 | gradient_checkpointing_kwargs: 20 | use_reentrant: false 21 | hub_model_id: Qwen2.5-1.5B-Open-R1-GRPO 22 | hub_strategy: every_save 23 | learning_rate: 2.0e-05 24 | log_completions: false 25 | log_level: info 26 | logging_first_step: true 27 | logging_steps: 1 28 | logging_strategy: steps 29 | lr_scheduler_type: cosine 30 | max_prompt_length: 512 31 | max_completion_length: 1024 32 | max_steps: -1 33 | # change gen to 8, so that we can train faster. 34 | num_generations: 8 35 | num_train_epochs: 1 36 | output_dir: data/Qwen2.5-1.5B-Open-R1-GPG-WSTD 37 | overwrite_output_dir: true 38 | per_device_eval_batch_size: 16 39 | per_device_train_batch_size: 16 40 | push_to_hub: false 41 | report_to: 42 | - tensorboard 43 | reward_funcs: 44 | - accuracy 45 | - format 46 | - tag_count 47 | reward_weights: 48 | - 1.0 49 | - 1.0 50 | - 1.0 51 | #save_strategy: "epoch" 52 | #save_total_limit: 1 53 | save_strategy: "steps" 54 | save_steps: 200 55 | save_total_limit: 7 56 | seed: 42 57 | warmup_ratio: 0.1 58 | -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-1.5B-Instruct/gpg/config_demo_v2.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: models/Qwen2.5-1.5B-Instruct 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: datas/OpenR1-Math-220k/ 9 | system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" 10 | 11 | # GRPO trainer config 12 | bf16: true 13 | use_vllm: false 14 | vllm_device: auto 15 | vllm_gpu_memory_utilization: 0.7 16 | do_eval: false 17 | gradient_accumulation_steps: 1 18 | gradient_checkpointing: true 19 | gradient_checkpointing_kwargs: 20 | use_reentrant: false 21 | hub_model_id: Qwen2.5-1.5B-Open-R1-GRPO 22 | hub_strategy: every_save 23 | learning_rate: 2.0e-05 24 | log_completions: false 25 | log_level: info 26 | logging_first_step: true 27 | logging_steps: 1 28 | logging_strategy: steps 29 | lr_scheduler_type: cosine 30 | max_prompt_length: 512 31 | max_completion_length: 1024 32 | max_steps: -1 33 | # change gen to 8, so that we can train faster. 34 | num_generations: 8 35 | num_train_epochs: 1 36 | output_dir: data/Qwen2.5-1.5B-Open-R1-GPG-a1f1 37 | overwrite_output_dir: true 38 | per_device_eval_batch_size: 16 39 | per_device_train_batch_size: 16 40 | push_to_hub: false 41 | report_to: 42 | - tensorboard 43 | reward_funcs: 44 | - accuracy 45 | - format 46 | reward_weights: 47 | - 1.0 48 | - 1.0 49 | #save_strategy: "epoch" 50 | #save_total_limit: 1 51 | save_strategy: "steps" 52 | save_steps: 200 53 | save_total_limit: 7 54 | seed: 42 55 | warmup_ratio: 0.1 56 | -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-1.5B-Instruct/gpg/config_demo_v3.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: models/Qwen2.5-1.5B-Instruct 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: datas/OpenR1-Math-220k/ 9 | system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: \n...\n\n\n...\n" 10 | 11 | # GRPO trainer config 12 | bf16: true 13 | use_vllm: false 14 | vllm_device: auto 15 | vllm_gpu_memory_utilization: 0.7 16 | do_eval: false 17 | gradient_accumulation_steps: 1 18 | gradient_checkpointing: true 19 | gradient_checkpointing_kwargs: 20 | use_reentrant: false 21 | hub_model_id: Qwen2.5-1.5B-Open-R1-GRPO 22 | hub_strategy: every_save 23 | learning_rate: 2.0e-05 24 | log_completions: false 25 | log_level: info 26 | logging_first_step: true 27 | logging_steps: 1 28 | logging_strategy: steps 29 | lr_scheduler_type: cosine 30 | max_prompt_length: 512 31 | max_completion_length: 1024 32 | max_steps: -1 33 | # change gen to 8, so that we can train faster. 34 | num_generations: 8 35 | num_train_epochs: 1 36 | output_dir: data/Qwen2.5-1.5B-Open-R1-GPG-v3 37 | overwrite_output_dir: true 38 | remove_unused_columns: False # avoid failure of parsing gold. 
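# Hedged note (not part of the original file): remove_unused_columns: False above keeps
# extra dataset columns (e.g. the gold solution) in the batch so they reach the reward
# functions as keyword arguments; with the default True, the Trainer would drop them and
# the accuracy reward could not parse the gold answer. The assumed reward signature is
# along the lines of
#   def accuracy_reward(completions, solution, **kwargs): ...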
39 | per_device_eval_batch_size: 16 40 | per_device_train_batch_size: 16 41 | push_to_hub: false 42 | report_to: 43 | - tensorboard 44 | reward_funcs: 45 | - accuracy 46 | - format 47 | reward_weights: 48 | - 1.0 49 | - 0.2 50 | #save_strategy: "epoch" 51 | #save_total_limit: 1 52 | save_strategy: "steps" 53 | save_steps: 200 54 | save_total_limit: 7 55 | seed: 42 56 | warmup_ratio: 0.1 57 | -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-1.5B-Instruct/gpg/config_demo_v4.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: models/Qwen2.5-1.5B-Instruct 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: datas/OpenR1-Math-220k/ 9 | system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: \n...\n\n\n...\n" 10 | 11 | # GRPO trainer config 12 | bf16: true 13 | use_vllm: false 14 | vllm_device: auto 15 | vllm_gpu_memory_utilization: 0.7 16 | do_eval: false 17 | gradient_accumulation_steps: 2 18 | gradient_checkpointing: true 19 | gradient_checkpointing_kwargs: 20 | use_reentrant: false 21 | hub_model_id: Qwen2.5-1.5B-Open-R1-GRPO 22 | hub_strategy: every_save 23 | learning_rate: 1.0e-06 24 | log_completions: false 25 | log_level: info 26 | logging_first_step: true 27 | logging_steps: 1 28 | logging_strategy: steps 29 | lr_scheduler_type: constant_with_warmup 30 | max_grad_norm: 0.2 31 | max_prompt_length: 512 32 | max_completion_length: 2048 #1024 33 | max_steps: -1 34 | # change gen to 8, so that we can train faster. 35 | num_generations: 8 36 | num_train_epochs: 1 37 | output_dir: data/Qwen2.5-1.5B-Open-R1-GPG-v4-increase-2048 38 | overwrite_output_dir: true 39 | remove_unused_columns: False # avoid failure of parsing gold. 40 | per_device_eval_batch_size: 16 41 | per_device_train_batch_size: 8 42 | push_to_hub: false 43 | report_to: 44 | - tensorboard 45 | reward_funcs: 46 | - accuracy 47 | - format 48 | reward_weights: 49 | - 1.0 50 | - 0.2 51 | #save_strategy: "epoch" 52 | #save_total_limit: 1 53 | save_strategy: "steps" 54 | save_steps: 100 55 | save_total_limit: 10 56 | seed: 42 57 | warmup_ratio: 0.1 58 | -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-1.5B-Instruct/gpg/config_demo_v5.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: models/Qwen2.5-1.5B-Instruct 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: datas/OpenR1-Math-220k/ 9 | system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" 10 | 11 | # GRPO trainer config 12 | bf16: true 13 | use_vllm: false 14 | vllm_device: auto 15 | vllm_gpu_memory_utilization: 0.7 16 | do_eval: false 17 | gradient_accumulation_steps: 1 18 | gradient_checkpointing: true 19 | gradient_checkpointing_kwargs: 20 | use_reentrant: false 21 | hub_model_id: Qwen2.5-1.5B-Open-R1-GRPO 22 | hub_strategy: every_save 23 | learning_rate: 1.0e-06 24 | log_completions: false 25 | log_level: info 26 | logging_first_step: true 27 | logging_steps: 1 28 | logging_strategy: steps 29 | lr_scheduler_type: constant_with_warmup 30 | max_grad_norm: 0.2 31 | max_prompt_length: 512 32 | max_completion_length: 1024 33 | max_steps: -1 34 | # change gen to 8, so that we can train faster. 35 | num_generations: 8 36 | num_train_epochs: 1 37 | output_dir: data/Qwen2.5-1.5B-Open-R1-GPG-v5 38 | overwrite_output_dir: true 39 | remove_unused_columns: False # avoid failure of parsing gold. 40 | per_device_eval_batch_size: 16 41 | per_device_train_batch_size: 16 42 | push_to_hub: false 43 | report_to: 44 | - tensorboard 45 | reward_funcs: 46 | - accuracy 47 | - format 48 | reward_weights: 49 | - 1.0 50 | - 0.1 51 | #save_strategy: "epoch" 52 | #save_total_limit: 1 53 | save_strategy: "steps" 54 | save_steps: 100 55 | save_total_limit: 10 56 | seed: 42 57 | warmup_ratio: 0.1 58 | -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-1.5B-Instruct/gpg/config_demo_woSTD.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: models/Qwen2.5-1.5B-Instruct 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: datas/OpenR1-Math-220k/ 9 | system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: \n...\n\n\n...\n" 10 | 11 | # GRPO trainer config 12 | bf16: true 13 | use_vllm: false 14 | vllm_device: auto 15 | vllm_gpu_memory_utilization: 0.7 16 | do_eval: false 17 | gradient_accumulation_steps: 1 18 | gradient_checkpointing: true 19 | gradient_checkpointing_kwargs: 20 | use_reentrant: false 21 | hub_model_id: Qwen2.5-1.5B-Open-R1-GRPO 22 | hub_strategy: every_save 23 | learning_rate: 1.0e-06 24 | log_completions: false 25 | log_level: info 26 | logging_first_step: true 27 | logging_steps: 1 28 | logging_strategy: steps 29 | lr_scheduler_type: constant_with_warmup 30 | max_grad_norm: 0.2 31 | max_prompt_length: 512 32 | max_completion_length: 1024 33 | max_steps: -1 34 | # change gen to 8, so that we can train faster. 35 | num_generations: 8 36 | num_train_epochs: 1 37 | output_dir: data/Qwen2.5-1.5B-Open-R1-GPG-v5-WoSTD 38 | overwrite_output_dir: true 39 | remove_unused_columns: False # avoid failure of parsing gold. 
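The -WoSTD suffix in the output directory above marks the variant that drops the per-group standard-deviation scaling of rewards; other recipes in this collection express the same choice with scale_rewards: false. A minimal sketch, not the trainer's actual code, of what that toggle changes when group advantages are formed:

import torch

def group_advantages(rewards, num_generations, scale_by_std=True):
    # rewards: flat tensor with num_prompts * num_generations entries
    grouped = rewards.view(-1, num_generations)
    advantages = grouped - grouped.mean(dim=1, keepdim=True)
    if scale_by_std:  # standard GRPO-style normalization
        advantages = advantages / (grouped.std(dim=1, keepdim=True) + 1e-4)
    return advantages.view(-1)  # the "woSTD" runs skip the division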
40 | per_device_eval_batch_size: 16 41 | per_device_train_batch_size: 16 42 | push_to_hub: false 43 | report_to: 44 | - tensorboard 45 | reward_funcs: 46 | - accuracy 47 | - format 48 | reward_weights: 49 | - 1.0 50 | - 0.1 51 | #save_strategy: "epoch" 52 | #save_total_limit: 1 53 | save_strategy: "steps" 54 | save_steps: 100 55 | save_total_limit: 10 56 | seed: 42 57 | warmup_ratio: 0.1 58 | -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-1.5B-Instruct/gpg/config_v0.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: models/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B #deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: datas/open-rs/ #datas/OpenR1-Math-220k/ #knoveleng/open-rs 9 | system_prompt: "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer, and put your final answer within \\boxed{{}} . The reasoning process and answer are enclosed within and tags, respectively, i.e., reasoning process here answer here . Note that respond by English, NOT use other languages." 10 | 11 | # GRPO trainer config 12 | bf16: true 13 | use_vllm: false 14 | vllm_device: auto 15 | vllm_enforce_eager: true 16 | vllm_gpu_memory_utilization: 0.7 17 | vllm_max_model_len: 4608 18 | do_eval: false 19 | gradient_accumulation_steps: 1 20 | gradient_checkpointing: true 21 | gradient_checkpointing_kwargs: 22 | use_reentrant: false 23 | hub_model_id: OpenRS-GRPO 24 | hub_strategy: every_save 25 | learning_rate: 1.0e-06 26 | log_completions: true 27 | log_level: info 28 | logging_first_step: true 29 | logging_steps: 1 30 | logging_strategy: steps 31 | lr_scheduler_type: cosine_with_min_lr 32 | lr_scheduler_kwargs: 33 | min_lr_rate: 0.1 34 | max_prompt_length: 512 35 | max_completion_length: 3584 36 | max_steps: 500 37 | num_generations: 6 38 | num_train_epochs: 1 39 | output_dir: data/Qwen2.5-1.5B-OpenRS-GPG-v2 40 | overwrite_output_dir: true 41 | per_device_eval_batch_size: 6 42 | per_device_train_batch_size: 6 43 | push_to_hub: false 44 | report_to: 45 | - tensorboard 46 | reward_funcs: 47 | - format_v2 48 | - cosine 49 | reward_weights: 50 | - 1.0 51 | - 2.0 52 | save_strategy: "steps" 53 | save_steps: 50 54 | seed: 42 55 | temperature: 0.7 56 | warmup_ratio: 0.1 -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-1.5B-Instruct/gpg/config_v0_nostd.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: models/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B #deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: datas/open-rs/ #datas/OpenR1-Math-220k/ #knoveleng/open-rs 9 | system_prompt: "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer, and put your final answer within \\boxed{{}} . 
The reasoning process and answer are enclosed within and tags, respectively, i.e., reasoning process here answer here . Note that respond by English, NOT use other languages." 10 | 11 | # GRPO trainer config 12 | bf16: true 13 | use_vllm: false 14 | vllm_device: auto 15 | vllm_enforce_eager: true 16 | vllm_gpu_memory_utilization: 0.7 17 | vllm_max_model_len: 4608 18 | do_eval: false 19 | gradient_accumulation_steps: 1 20 | gradient_checkpointing: true 21 | gradient_checkpointing_kwargs: 22 | use_reentrant: false 23 | hub_model_id: OpenRS-GRPO 24 | hub_strategy: every_save 25 | learning_rate: 1.0e-06 26 | log_completions: false 27 | log_level: info 28 | logging_first_step: true 29 | logging_steps: 1 30 | logging_strategy: steps 31 | lr_scheduler_type: cosine_with_min_lr 32 | lr_scheduler_kwargs: 33 | min_lr_rate: 0.1 34 | max_prompt_length: 512 35 | max_completion_length: 3584 36 | max_steps: 500 37 | num_generations: 6 38 | num_train_epochs: 1 39 | output_dir: data/Qwen2.5-1.5B-OpenRS-GPG-wostd 40 | overwrite_output_dir: true 41 | per_device_eval_batch_size: 6 42 | per_device_train_batch_size: 6 43 | push_to_hub: false 44 | report_to: 45 | - tensorboard 46 | reward_funcs: 47 | - format 48 | - cosine 49 | reward_weights: 50 | - 1.0 51 | - 2.0 52 | save_strategy: "steps" 53 | save_steps: 50 54 | seed: 42 55 | temperature: 0.7 56 | warmup_ratio: 0.1 57 | scale_rewards: false -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-1.5B-Instruct/gpg/config_v0_open22k.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: models/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B #deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: datas/OpenR1-Math-220k/ #knoveleng/open-rs 9 | system_prompt: "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer, and put your final answer within \\boxed{{}} . The reasoning process and answer are enclosed within and tags, respectively, i.e., reasoning process here answer here . Note that respond by English, NOT use other languages." 
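Several recipes in this collection list a format reward alongside accuracy. A rough sketch of such a check; the <think>/<answer> tag layout is an assumption here, and the repository's exact pattern may differ:

import re

FORMAT_RE = re.compile(r"^<think>.*?</think>\s*<answer>.*?</answer>$", re.DOTALL)

def format_reward(completions, **kwargs):
    texts = [c[0]["content"] if isinstance(c, list) else c for c in completions]
    return [1.0 if FORMAT_RE.match(t.strip()) else 0.0 for t in texts]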
10 | 11 | # GRPO trainer config 12 | bf16: true 13 | use_vllm: false 14 | vllm_device: auto 15 | vllm_enforce_eager: true 16 | vllm_gpu_memory_utilization: 0.7 17 | vllm_max_model_len: 4608 18 | do_eval: false 19 | gradient_accumulation_steps: 1 20 | gradient_checkpointing: true 21 | gradient_checkpointing_kwargs: 22 | use_reentrant: false 23 | hub_model_id: OpenRS-GRPO 24 | hub_strategy: every_save 25 | learning_rate: 1.0e-06 26 | log_completions: false 27 | log_level: info 28 | logging_first_step: true 29 | logging_steps: 1 30 | logging_strategy: steps 31 | lr_scheduler_type: cosine_with_min_lr 32 | lr_scheduler_kwargs: 33 | min_lr_rate: 0.1 34 | max_prompt_length: 512 35 | max_completion_length: 3584 36 | max_steps: 500 37 | num_generations: 6 38 | num_train_epochs: 1 39 | output_dir: data/Qwen2.5-1.5B-OpenRS-GPG-open220k 40 | overwrite_output_dir: true 41 | per_device_eval_batch_size: 6 42 | per_device_train_batch_size: 6 43 | push_to_hub: false 44 | report_to: 45 | - tensorboard 46 | reward_funcs: 47 | - format 48 | - cosine 49 | reward_weights: 50 | - 1.0 51 | - 2.0 52 | save_strategy: "steps" 53 | save_steps: 50 54 | seed: 42 55 | temperature: 0.7 56 | warmup_ratio: 0.1 -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: Qwen/Qwen2.5-1.5B-Instruct 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: open-r1/OpenR1-Math-220k 9 | system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" 10 | 11 | # GRPO trainer config 12 | bf16: true 13 | use_vllm: true 14 | do_eval: false 15 | gradient_accumulation_steps: 4 16 | gradient_checkpointing: true 17 | gradient_checkpointing_kwargs: 18 | use_reentrant: false 19 | hub_model_id: Qwen2.5-1.5B-Open-R1-GRPO 20 | hub_strategy: every_save 21 | learning_rate: 2.0e-05 22 | log_completions: true 23 | log_level: info 24 | logging_first_step: true 25 | logging_steps: 1 26 | logging_strategy: steps 27 | lr_scheduler_type: cosine 28 | max_prompt_length: 512 29 | max_completion_length: 1024 30 | max_steps: -1 31 | num_generations: 16 32 | num_train_epochs: 1 33 | output_dir: data/Qwen2.5-1.5B-Open-R1-GRPO 34 | overwrite_output_dir: true 35 | per_device_eval_batch_size: 16 36 | per_device_train_batch_size: 16 37 | push_to_hub: true 38 | report_to: 39 | - wandb 40 | reward_funcs: 41 | - accuracy 42 | - format 43 | - tag_count 44 | reward_weights: 45 | - 1.0 46 | - 1.0 47 | - 1.0 48 | save_strategy: "epoch" 49 | save_total_limit: 1 50 | seed: 42 51 | warmup_ratio: 0.1 52 | -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: Qwen/Qwen2.5-1.5B-Instruct 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: open-r1/verifiable-coding-problems-python 9 | system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: \n...\n\n\n...\n" 10 | 11 | # GRPO trainer config 12 | beta: 0.01 13 | bf16: true 14 | use_vllm: true 15 | do_eval: false 16 | gradient_accumulation_steps: 4 17 | gradient_checkpointing: true 18 | gradient_checkpointing_kwargs: 19 | use_reentrant: false 20 | hub_model_id: Qwen2.5-1.5B-Open-R1-Code-GRPO 21 | hub_strategy: every_save 22 | learning_rate: 5.0e-06 23 | log_completions: true 24 | log_level: info 25 | logging_first_step: true 26 | logging_steps: 1 27 | logging_strategy: steps 28 | lr_scheduler_type: cosine_with_min_lr 29 | lr_scheduler_kwargs: 30 | min_lr_rate: 0.1 31 | max_prompt_length: 1024 32 | max_completion_length: 2048 33 | max_steps: 500 34 | num_generations: 14 35 | num_train_epochs: 1 36 | output_dir: data/Qwen2.5-1.5B-Open-R1-Code-GRPO 37 | overwrite_output_dir: true 38 | per_device_train_batch_size: 16 39 | push_to_hub: true 40 | report_to: 41 | - wandb 42 | reward_funcs: 43 | - code 44 | - format 45 | reward_weights: 46 | - 1.0 47 | - 0.1 48 | save_strategy: "steps" 49 | save_steps: 50 50 | save_total_limit: 1 51 | seed: 42 52 | temperature: 1.0 53 | warmup_ratio: 0.03 -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code_ioi.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: Qwen/Qwen2.5-1.5B-Instruct 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: open-r1/ioi 9 | system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. 
You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: \n...\n\n\n...\n" 10 | 11 | # GRPO trainer config 12 | beta: 0.01 13 | bf16: true 14 | use_vllm: true 15 | do_eval: false 16 | gradient_accumulation_steps: 4 17 | gradient_checkpointing: true 18 | gradient_checkpointing_kwargs: 19 | use_reentrant: false 20 | hub_model_id: Qwen2.5-1.5B-Open-R1-Code-GRPO 21 | hub_strategy: every_save 22 | learning_rate: 5.0e-06 23 | log_completions: true 24 | log_level: info 25 | logging_first_step: true 26 | logging_steps: 1 27 | logging_strategy: steps 28 | lr_scheduler_type: cosine_with_min_lr 29 | lr_scheduler_kwargs: 30 | min_lr_rate: 0.1 31 | max_prompt_length: 1024 32 | max_completion_length: 2048 33 | max_steps: 500 34 | num_generations: 14 35 | num_train_epochs: 1 36 | output_dir: data/Qwen2.5-1.5B-Open-R1-Code-GRPO 37 | overwrite_output_dir: true 38 | per_device_train_batch_size: 16 39 | push_to_hub: true 40 | report_to: 41 | - wandb 42 | save_strategy: "steps" 43 | save_steps: 50 44 | save_total_limit: 1 45 | seed: 42 46 | temperature: 1.0 47 | warmup_ratio: 0.03 48 | # ioi specific config 49 | code_language: cpp 50 | reward_funcs: 51 | - ioi_code 52 | - code_format 53 | - format 54 | reward_weights: 55 | - 1.0 56 | - 0.1 57 | - 0.1 58 | # for each generation, evaluate these many test cases in parallel, then check if any of them failed (0 score): if so stop evaluating 59 | # otherwise continue with the next batch of test cases. Useful to avoid overloading the eval server + save time on wrong solutions 60 | code_eval_test_batch_size: 3 -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_v1.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: models/Qwen2.5-1.5B-Instruct 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: datas/OpenR1-Math-220k/ 9 | system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: \n...\n\n\n...\n" 10 | 11 | # GRPO trainer config 12 | bf16: true 13 | use_vllm: false 14 | vllm_device: auto 15 | vllm_gpu_memory_utilization: 0.7 16 | do_eval: false 17 | gradient_accumulation_steps: 1 18 | gradient_checkpointing: true 19 | gradient_checkpointing_kwargs: 20 | use_reentrant: false 21 | hub_model_id: Qwen2.5-1.5B-Open-R1-GRPO 22 | hub_strategy: every_save 23 | learning_rate: 2.0e-05 24 | #disable log complete 25 | log_completions: false 26 | log_level: info 27 | logging_first_step: true 28 | logging_steps: 1 29 | logging_strategy: steps 30 | lr_scheduler_type: cosine 31 | max_prompt_length: 512 32 | max_completion_length: 1024 33 | max_steps: -1 34 | # change gen to 8, so that we can train faster. 
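The comment above trades samples per prompt for speed. Back-of-the-envelope arithmetic, assuming a single 8-GPU node (the recipe itself does not state the world size): each group of num_generations completions in the global batch shares one prompt, so halving num_generations roughly doubles the number of distinct prompts seen per optimizer step.

# Assumed launch: 8 processes, other values taken from this recipe.
per_device_train_batch_size = 16
gradient_accumulation_steps = 1
world_size = 8
num_generations = 8

global_batch = per_device_train_batch_size * gradient_accumulation_steps * world_size  # 128 completions
assert global_batch % num_generations == 0
prompts_per_step = global_batch // num_generations
print(prompts_per_step)  # 16 distinct prompts, 8 sampled completions each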
35 | num_generations: 8 36 | num_train_epochs: 1 37 | output_dir: data/Qwen2.5-1.5B-Open-R1-GRPO 38 | overwrite_output_dir: true 39 | per_device_eval_batch_size: 16 40 | per_device_train_batch_size: 16 41 | push_to_hub: false 42 | report_to: 43 | - tensorboard 44 | reward_funcs: 45 | - accuracy 46 | - format 47 | - tag_count 48 | reward_weights: 49 | - 1.0 50 | - 1.0 51 | - 1.0 52 | #save_strategy: "epoch" 53 | #save_total_limit: 1 54 | save_strategy: "steps" 55 | save_steps: 200 56 | save_total_limit: 7 57 | seed: 42 58 | warmup_ratio: 0.1 59 | -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_v3.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: models/Qwen2.5-1.5B-Instruct 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: datas/OpenR1-Math-220k/ 9 | system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: \n...\n\n\n...\n" 10 | 11 | # GRPO trainer config 12 | bf16: true 13 | use_vllm: false 14 | vllm_device: auto 15 | vllm_gpu_memory_utilization: 0.7 16 | do_eval: false 17 | gradient_accumulation_steps: 1 18 | gradient_checkpointing: true 19 | gradient_checkpointing_kwargs: 20 | use_reentrant: false 21 | hub_model_id: Qwen2.5-1.5B-Open-R1-GRPO 22 | hub_strategy: every_save 23 | learning_rate: 2.0e-05 24 | #disable log complete 25 | log_completions: false 26 | log_level: info 27 | logging_first_step: true 28 | logging_steps: 1 29 | logging_strategy: steps 30 | lr_scheduler_type: cosine 31 | max_prompt_length: 512 32 | max_completion_length: 1024 33 | max_steps: -1 34 | # change gen to 8, so that we can train faster. 35 | num_generations: 8 36 | num_train_epochs: 1 37 | output_dir: data/Qwen2.5-1.5B-Open-R1-GRPO-v3 38 | overwrite_output_dir: true 39 | remove_unused_columns: False # avoid failure of parsing gold. 
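As far as these configs show, the scalar reward each completion receives is a weighted sum of the listed reward functions; this v3 recipe puts weight 1.0 on accuracy and 0.2 on format (see the weights below). A small sketch of that combination:

def total_reward(per_func_rewards, weights):
    # per_func_rewards: one list of rewards per reward function, aligned by completion
    return [sum(w * r for w, r in zip(weights, rewards_at_i))
            for rewards_at_i in zip(*per_func_rewards)]

accuracy = [1.0, 0.0, 1.0]
fmt = [1.0, 1.0, 0.0]
print(total_reward([accuracy, fmt], weights=[1.0, 0.2]))  # [1.2, 0.2, 1.0]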
40 | per_device_eval_batch_size: 16 41 | per_device_train_batch_size: 16 42 | push_to_hub: false 43 | report_to: 44 | - tensorboard 45 | reward_funcs: 46 | - accuracy 47 | - format 48 | reward_weights: 49 | - 1.0 50 | - 0.2 51 | #save_strategy: "epoch" 52 | #save_total_limit: 1 53 | save_strategy: "steps" 54 | save_steps: 200 55 | save_total_limit: 7 56 | seed: 42 57 | warmup_ratio: 0.1 58 | -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-1.5B-Instruct/sft/config_demo.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: Qwen/Qwen2.5-1.5B-Instruct 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: open-r1/OpenR1-Math-220k 9 | dataset_num_proc: 48 10 | 11 | # SFT trainer config 12 | bf16: true 13 | do_eval: false 14 | eval_strategy: 'no' 15 | gradient_accumulation_steps: 1 16 | gradient_checkpointing: true 17 | gradient_checkpointing_kwargs: 18 | use_reentrant: false 19 | hub_model_id: Qwen2.5-1.5B-Open-R1-Distill 20 | hub_strategy: every_save 21 | learning_rate: 5.0e-05 22 | log_level: info 23 | logging_steps: 5 24 | logging_strategy: steps 25 | lr_scheduler_type: cosine_with_min_lr 26 | lr_scheduler_kwargs: 27 | min_lr_rate: 0.1 28 | packing: true 29 | max_length: 16384 30 | max_steps: -1 31 | num_train_epochs: 1 32 | output_dir: data/Qwen2.5-1.5B-Open-R1-Distill 33 | overwrite_output_dir: true 34 | per_device_eval_batch_size: 16 35 | per_device_train_batch_size: 16 36 | push_to_hub: true 37 | report_to: 38 | - wandb 39 | save_strategy: "steps" 40 | save_steps: 100 41 | save_total_limit: 1 42 | seed: 42 43 | use_liger: true 44 | warmup_ratio: 0.05 -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-1.5B-Instruct/sft/config_demo_v1.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: models/Qwen2.5-1.5B-Instruct 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: datas/OpenR1-Math-220k/ 9 | dataset_num_proc: 48 10 | 11 | # SFT trainer config 12 | bf16: true 13 | do_eval: false 14 | eval_strategy: 'no' 15 | #eval_steps: 200 16 | 17 | gradient_accumulation_steps: 2 # 1 18 | gradient_checkpointing: true 19 | gradient_checkpointing_kwargs: 20 | use_reentrant: false 21 | hub_model_id: Qwen2.5-1.5B-Open-R1-Distill 22 | hub_strategy: every_save 23 | learning_rate: 5.0e-05 24 | log_level: info 25 | logging_steps: 5 26 | logging_strategy: steps 27 | lr_scheduler_type: cosine_with_min_lr 28 | lr_scheduler_kwargs: 29 | min_lr_rate: 0.1 30 | packing: true 31 | max_length: 16384 32 | max_steps: -1 33 | num_train_epochs: 1 34 | output_dir: data/Qwen2.5-1.5B-Open-SFT 35 | overwrite_output_dir: true 36 | per_device_eval_batch_size: 8 #16 37 | per_device_train_batch_size: 2 # default 16 38 | push_to_hub: false 39 | report_to: 40 | - tensorboard 41 | save_strategy: "steps" 42 | save_steps: 100 43 | save_total_limit: 5 44 | seed: 42 45 | use_liger: false #true 46 | warmup_ratio: 0.05 -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-1.5B-Instruct/sft/config_v1.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | # You need to download the 
model and manually change the rope to 300k and max_position_embeddings to 32768 3 | # the config file should match https://huggingface.co/open-r1/OpenR1-Qwen-7B/blob/main/config.json 4 | model_name_or_path: models/Qwen2.5-1.5B-Instruct #models/Qwen2.5-Math-7B-Instruct #Qwen/Qwen2.5-Math-7B-Instruct 5 | model_revision: main 6 | torch_dtype: bfloat16 7 | attn_implementation: flash_attention_2 8 | 9 | # Data training arguments 10 | dataset_name: datas/OpenR1-Math-220k/ #open-r1/OpenR1-Math-220k 11 | dataset_num_proc: 48 12 | 13 | #SFT hyperparam 14 | max_length: 32768 15 | weight_decay: 0.0001 16 | optim: adamw_torch 17 | lr_scheduler_type: linear 18 | warmup_ratio: 0.1 19 | learning_rate: 1e-04 20 | gradient_accumulation_steps: 1 21 | per_device_eval_batch_size: 1 22 | per_device_train_batch_size: 1 23 | 24 | # SFT trainer config 25 | max_steps: -1 26 | num_train_epochs: 3 27 | bf16: true 28 | do_eval: false 29 | use_liger: false 30 | use_liger_kernel: false 31 | eval_strategy: 'no' 32 | gradient_checkpointing: true 33 | gradient_checkpointing_kwargs: 34 | use_reentrant: false 35 | hub_model_id: OpenR1-Qwen-1.5B-SFT 36 | hub_strategy: every_save 37 | log_level: info 38 | logging_steps: 5 39 | logging_strategy: steps 40 | packing: true 41 | output_dir: data/OpenR1-Qwen-1.5B-SFT 42 | overwrite_output_dir: true 43 | push_to_hub: false 44 | report_to: 45 | - tensorboard 46 | save_strategy: "steps" 47 | save_steps: 500 48 | save_total_limit: 4 49 | seed: 42 -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-7B-Instruct/grpo/config_demo.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: Qwen/Qwen2.5-7B-Instruct 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: open-r1/OpenR1-Math-cn_k12-86k 9 | system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" 10 | 11 | # GRPO trainer config 12 | beta: 0.001 13 | bf16: true 14 | do_eval: false 15 | eval_strategy: "no" 16 | use_vllm: true 17 | do_eval: false 18 | gradient_accumulation_steps: 16 19 | gradient_checkpointing: true 20 | gradient_checkpointing_kwargs: 21 | use_reentrant: false 22 | hub_model_id: Qwen2.5-7B-Instruct-GRPO 23 | hub_strategy: every_save 24 | learning_rate: 1.0e-06 25 | log_completions: true 26 | log_level: info 27 | logging_first_step: true 28 | logging_steps: 1 29 | logging_strategy: steps 30 | lr_scheduler_type: constant_with_warmup 31 | max_grad_norm: 0.2 32 | max_prompt_length: 1024 33 | max_completion_length: 4096 34 | max_steps: -1 35 | num_generations: 16 36 | num_train_epochs: 1 37 | output_dir: data/Qwen2.5-7B-Instruct-GRPO 38 | overwrite_output_dir: true 39 | per_device_train_batch_size: 4 40 | push_to_hub: true 41 | report_to: 42 | - wandb 43 | reward_funcs: 44 | - accuracy 45 | - format 46 | reward_weights: 47 | - 1.0 48 | - 0.2 49 | save_strategy: "steps" 50 | save_steps: 0.1 51 | save_total_limit: 1 52 | seed: 42 53 | temperature: 0.7 54 | warmup_ratio: 0.1 -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-7B-Instruct/grpo/config_demo_v1.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: Qwen/Qwen2.5-7B-Instruct 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: open-r1/OpenR1-Math-cn_k12-86k 9 | system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" 10 | 11 | # GRPO trainer config 12 | beta: 0.001 13 | bf16: true 14 | do_eval: false 15 | eval_strategy: "no" 16 | use_vllm: true 17 | do_eval: false 18 | gradient_accumulation_steps: 16 19 | gradient_checkpointing: true 20 | gradient_checkpointing_kwargs: 21 | use_reentrant: false 22 | hub_model_id: Qwen2.5-7B-Instruct-GRPO 23 | hub_strategy: every_save 24 | learning_rate: 1.0e-06 25 | log_completions: true 26 | log_level: info 27 | logging_first_step: true 28 | logging_steps: 1 29 | logging_strategy: steps 30 | lr_scheduler_type: constant_with_warmup 31 | max_grad_norm: 0.2 32 | max_prompt_length: 1024 33 | max_completion_length: 4096 34 | max_steps: -1 35 | num_generations: 16 36 | num_train_epochs: 1 37 | output_dir: data/Qwen2.5-7B-Instruct-GRPO 38 | overwrite_output_dir: true 39 | per_device_train_batch_size: 4 40 | push_to_hub: true 41 | report_to: 42 | - wandb 43 | reward_funcs: 44 | - accuracy 45 | - format 46 | reward_weights: 47 | - 1.0 48 | - 0.2 49 | save_strategy: "steps" 50 | save_steps: 0.1 51 | save_total_limit: 1 52 | seed: 42 53 | temperature: 0.7 54 | warmup_ratio: 0.1 -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | #model_name_or_path: Qwen/Qwen2.5-Math-7B 3 | model_name_or_path: models/Qwen2.5-Math-7B 4 | 5 | model_revision: main 6 | torch_dtype: bfloat16 7 | attn_implementation: flash_attention_2 8 | 9 | # Data training arguments 10 | #dataset_name: DigitalLearningGmbH/MATH-lighteval 11 | dataset_name: "datas/MATH-lighteval" 12 | dataset_config: default 13 | system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags." 
14 | 15 | # GRPO trainer config 16 | bf16: true 17 | use_vllm: false 18 | do_eval: true 19 | eval_strategy: steps 20 | eval_steps: 100 21 | gradient_accumulation_steps: 4 22 | gradient_checkpointing: true 23 | gradient_checkpointing_kwargs: 24 | use_reentrant: false 25 | hub_model_id: Qwen-2.5-7B-Simple-RL 26 | hub_strategy: every_save 27 | learning_rate: 3.0e-06 28 | log_completions: false 29 | log_level: info 30 | logging_first_step: true 31 | logging_steps: 5 32 | logging_strategy: steps 33 | lr_scheduler_type: cosine 34 | max_prompt_length: 512 35 | max_completion_length: 1024 36 | max_steps: -1 37 | num_generations: 8 38 | num_train_epochs: 1 39 | output_dir: data/Qwen-2.5-7B-Simple-RL-TRPO 40 | overwrite_output_dir: true 41 | per_device_eval_batch_size: 16 42 | per_device_train_batch_size: 16 43 | push_to_hub: false 44 | report_to: 45 | - tensorboard 46 | reward_funcs: 47 | - accuracy 48 | - format 49 | reward_weights: 50 | - 1.0 51 | - 1.0 52 | save_strategy: "no" 53 | seed: 42 54 | warmup_ratio: 0.1 -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl_dgrpo.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | #model_name_or_path: Qwen/Qwen2.5-Math-7B 3 | model_name_or_path: models/Qwen2.5-Math-7B 4 | 5 | model_revision: main 6 | torch_dtype: bfloat16 7 | attn_implementation: flash_attention_2 8 | 9 | # Data training arguments 10 | #dataset_name: DigitalLearningGmbH/MATH-lighteval 11 | dataset_name: "datas/MATH-lighteval" 12 | dataset_config: default 13 | system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags." 
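This config_simple_rl_dgrpo.yaml recipe sets beta: 0.0 (no KL penalty against the reference model) and, further down, scale_rewards: false. A per-token loss sketch under the usual clipped GRPO-style objective, written out here as an assumed form rather than code copied from the trainer:

import torch

def per_token_loss(ratio, advantages, ref_kl, beta=0.0, eps=0.2):
    # ratio: new/old policy probability ratio per token; ref_kl: per-token KL to the reference model
    unclipped = ratio * advantages
    clipped = torch.clamp(ratio, 1 - eps, 1 + eps) * advantages
    loss = -torch.minimum(unclipped, clipped)
    if beta > 0:  # beta: 0.0 in this recipe drops the KL term entirely
        loss = loss + beta * ref_kl
    return loss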
14 | 15 | # GRPO trainer config 16 | beta: 0.0 17 | bf16: true 18 | use_vllm: false 19 | do_eval: true 20 | eval_strategy: steps 21 | eval_steps: 100 22 | gradient_accumulation_steps: 4 23 | gradient_checkpointing: true 24 | gradient_checkpointing_kwargs: 25 | use_reentrant: false 26 | hub_model_id: Qwen-2.5-7B-Simple-RL 27 | hub_strategy: every_save 28 | learning_rate: 3.0e-06 29 | log_completions: false 30 | log_level: info 31 | logging_first_step: true 32 | logging_steps: 5 33 | logging_strategy: steps 34 | lr_scheduler_type: cosine 35 | max_prompt_length: 512 36 | max_completion_length: 1024 37 | max_steps: -1 38 | num_generations: 8 39 | num_train_epochs: 1 40 | output_dir: data/Qwen-2.5-7B-Simple-RL-DrTRPO 41 | overwrite_output_dir: true 42 | per_device_eval_batch_size: 16 43 | per_device_train_batch_size: 16 44 | push_to_hub: false 45 | report_to: 46 | - tensorboard 47 | reward_funcs: 48 | - accuracy 49 | - format 50 | reward_weights: 51 | - 1.0 52 | - 1.0 53 | save_strategy: "no" 54 | seed: 42 55 | warmup_ratio: 0.1 56 | scale_rewards: false -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl_gpg.yaml: -------------------------------------------------------------------------------- 1 | 2 | # Model arguments 3 | model_name_or_path: models/Qwen2.5-Math-7B 4 | #model_name_or_path: Qwen/Qwen2.5-Math-7B 5 | model_revision: main 6 | torch_dtype: bfloat16 7 | attn_implementation: flash_attention_2 8 | 9 | # Data training arguments 10 | #dataset_name: DigitalLearningGmbH/MATH-lighteval 11 | dataset_name: "datas/MATH-lighteval" 12 | dataset_config: default 13 | #system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags." 14 | system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. Please reason step by step, and put your final answer within \\boxed{}." 
15 | 16 | #"<|im_start|>system\nPlease reason step by step, and put your final answer within \\boxed{}.<|im_end|>\n" 17 | # GRPO trainer config 18 | beta: 0.0 19 | bf16: true 20 | use_vllm: false 21 | vllm_device: auto 22 | vllm_gpu_memory_utilization: 0.7 23 | do_eval: true 24 | eval_strategy: epoch # steps 25 | #eval_steps: 100 26 | gradient_accumulation_steps: 4 27 | gradient_checkpointing: true 28 | gradient_checkpointing_kwargs: 29 | use_reentrant: false 30 | hub_model_id: Qwen-2.5-7B-Simple-RL 31 | hub_strategy: every_save 32 | learning_rate: 2.0e-06 33 | log_completions: false 34 | log_level: info 35 | logging_first_step: true 36 | logging_steps: 1 37 | logging_strategy: steps 38 | lr_scheduler_type: constant_with_warmup 39 | max_prompt_length: 512 40 | max_completion_length: 1024 41 | max_steps: -1 42 | num_generations: 8 43 | num_train_epochs: 1 44 | output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-E1 45 | overwrite_output_dir: true 46 | per_device_eval_batch_size: 16 47 | per_device_train_batch_size: 16 48 | push_to_hub: false 49 | report_to: 50 | - tensorboard 51 | reward_funcs: 52 | - accuracy 53 | reward_weights: 54 | - 1.0 55 | save_strategy: epoch 56 | save_total_limit: 1 57 | seed: 42 58 | warmup_ratio: 0.1 59 | -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl_gpg_3k.yaml: -------------------------------------------------------------------------------- 1 | 2 | # Model arguments 3 | model_name_or_path: models/Qwen2.5-Math-7B 4 | #model_name_or_path: Qwen/Qwen2.5-Math-7B 5 | model_revision: main 6 | torch_dtype: bfloat16 7 | attn_implementation: flash_attention_2 8 | 9 | # Data training arguments 10 | #dataset_name: DigitalLearningGmbH/MATH-lighteval 11 | dataset_name: "datas/MATH-lighteval" 12 | dataset_config: default 13 | system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags." 
14 | 15 | # GRPO trainer config 16 | beta: 0.0 17 | bf16: true 18 | use_vllm: false 19 | vllm_device: auto 20 | vllm_gpu_memory_utilization: 0.7 21 | do_eval: true 22 | eval_strategy: steps 23 | eval_steps: 100 24 | gradient_accumulation_steps: 4 25 | gradient_checkpointing: true 26 | gradient_checkpointing_kwargs: 27 | use_reentrant: false 28 | hub_model_id: Qwen-2.5-7B-Simple-RL 29 | hub_strategy: every_save 30 | learning_rate: 3.0e-06 31 | log_completions: false 32 | log_level: info 33 | logging_first_step: true 34 | logging_steps: 5 35 | logging_strategy: steps 36 | lr_scheduler_type: cosine 37 | max_prompt_length: 512 38 | max_completion_length: 3804 39 | max_steps: -1 40 | num_generations: 8 41 | num_train_epochs: 1 42 | output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-3804 43 | overwrite_output_dir: true 44 | per_device_eval_batch_size: 16 45 | per_device_train_batch_size: 2 46 | push_to_hub: false 47 | report_to: 48 | - tensorboard 49 | reward_funcs: 50 | - accuracy 51 | - format 52 | reward_weights: 53 | - 1.0 54 | - 1.0 55 | save_strategy: "no" 56 | seed: 42 57 | warmup_ratio: 0.1 58 | -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl_gpg_3k_2nodes.yaml: -------------------------------------------------------------------------------- 1 | 2 | # Model arguments 3 | model_name_or_path: models/Qwen2.5-Math-7B 4 | #model_name_or_path: Qwen/Qwen2.5-Math-7B 5 | model_revision: main 6 | torch_dtype: bfloat16 7 | attn_implementation: flash_attention_2 8 | 9 | # Data training arguments 10 | #dataset_name: DigitalLearningGmbH/MATH-lighteval 11 | dataset_name: "datas/MATH-lighteval" 12 | dataset_config: default 13 | system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags." 
14 | 15 | # GRPO trainer config 16 | beta: 0.0 17 | bf16: true 18 | use_vllm: false 19 | vllm_device: auto 20 | vllm_gpu_memory_utilization: 0.7 21 | do_eval: true 22 | eval_strategy: steps 23 | eval_steps: 100 24 | gradient_accumulation_steps: 2 25 | gradient_checkpointing: true 26 | gradient_checkpointing_kwargs: 27 | use_reentrant: false 28 | hub_model_id: Qwen-2.5-7B-Simple-RL 29 | hub_strategy: every_save 30 | learning_rate: 3.0e-06 31 | log_completions: false 32 | log_level: info 33 | logging_first_step: true 34 | logging_steps: 5 35 | logging_strategy: steps 36 | lr_scheduler_type: cosine 37 | max_prompt_length: 512 38 | max_completion_length: 3804 39 | max_steps: -1 40 | num_generations: 8 41 | num_train_epochs: 1 42 | output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-3804-2nodes 43 | overwrite_output_dir: true 44 | per_device_eval_batch_size: 16 45 | per_device_train_batch_size: 2 46 | push_to_hub: false 47 | report_to: 48 | - tensorboard 49 | reward_funcs: 50 | - accuracy 51 | - format 52 | reward_weights: 53 | - 1.0 54 | - 1.0 55 | save_strategy: "no" 56 | seed: 42 57 | warmup_ratio: 0.1 58 | -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl_gpg_n16.yaml: -------------------------------------------------------------------------------- 1 | 2 | # Model arguments 3 | model_name_or_path: models/Qwen2.5-Math-7B 4 | #model_name_or_path: Qwen/Qwen2.5-Math-7B 5 | model_revision: main 6 | torch_dtype: bfloat16 7 | attn_implementation: flash_attention_2 8 | 9 | # Data training arguments 10 | #dataset_name: DigitalLearningGmbH/MATH-lighteval 11 | dataset_name: "datas/MATH-lighteval" 12 | dataset_config: default 13 | system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags." 
14 | 15 | # GRPO trainer config 16 | beta: 0.0 17 | bf16: true 18 | use_vllm: false 19 | vllm_device: auto 20 | vllm_gpu_memory_utilization: 0.7 21 | do_eval: true 22 | eval_strategy: steps 23 | eval_steps: 100 24 | gradient_accumulation_steps: 4 25 | gradient_checkpointing: true 26 | gradient_checkpointing_kwargs: 27 | use_reentrant: false 28 | hub_model_id: Qwen-2.5-7B-Simple-RL 29 | hub_strategy: every_save 30 | learning_rate: 3.0e-06 31 | log_completions: false 32 | log_level: info 33 | logging_first_step: true 34 | logging_steps: 5 35 | logging_strategy: steps 36 | lr_scheduler_type: cosine 37 | max_prompt_length: 512 38 | max_completion_length: 1024 39 | max_steps: -1 40 | num_generations: 16 41 | num_train_epochs: 1 42 | output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-n16 43 | overwrite_output_dir: true 44 | per_device_eval_batch_size: 16 45 | per_device_train_batch_size: 16 46 | push_to_hub: false 47 | report_to: 48 | - tensorboard 49 | reward_funcs: 50 | - accuracy 51 | - format 52 | reward_weights: 53 | - 1.0 54 | - 1.0 55 | save_strategy: "no" 56 | seed: 42 57 | warmup_ratio: 0.1 58 | -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl_gpg_n16_wostd.yaml: -------------------------------------------------------------------------------- 1 | 2 | # Model arguments 3 | model_name_or_path: models/Qwen2.5-Math-7B 4 | #model_name_or_path: Qwen/Qwen2.5-Math-7B 5 | model_revision: main 6 | torch_dtype: bfloat16 7 | attn_implementation: flash_attention_2 8 | 9 | # Data training arguments 10 | #dataset_name: DigitalLearningGmbH/MATH-lighteval 11 | dataset_name: "datas/MATH-lighteval" 12 | dataset_config: default 13 | system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags." 
14 | 15 | # GRPO trainer config 16 | beta: 0.0 17 | bf16: true 18 | use_vllm: false 19 | vllm_device: auto 20 | vllm_gpu_memory_utilization: 0.7 21 | do_eval: true 22 | eval_strategy: steps 23 | eval_steps: 100 24 | gradient_accumulation_steps: 4 25 | gradient_checkpointing: true 26 | gradient_checkpointing_kwargs: 27 | use_reentrant: false 28 | hub_model_id: Qwen-2.5-7B-Simple-RL 29 | hub_strategy: every_save 30 | learning_rate: 3.0e-06 31 | log_completions: false 32 | log_level: info 33 | logging_first_step: true 34 | logging_steps: 5 35 | logging_strategy: steps 36 | lr_scheduler_type: cosine 37 | max_prompt_length: 512 38 | max_completion_length: 1024 39 | max_steps: -1 40 | num_generations: 16 41 | num_train_epochs: 1 42 | output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-n16-wostd 43 | overwrite_output_dir: true 44 | per_device_eval_batch_size: 16 45 | per_device_train_batch_size: 16 46 | push_to_hub: false 47 | report_to: 48 | - tensorboard 49 | reward_funcs: 50 | - accuracy 51 | - format 52 | reward_weights: 53 | - 1.0 54 | - 1.0 55 | save_strategy: "no" 56 | seed: 42 57 | warmup_ratio: 0.1 58 | scale_rewards: false -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl_gpg_n2.yaml: -------------------------------------------------------------------------------- 1 | 2 | # Model arguments 3 | model_name_or_path: models/Qwen2.5-Math-7B 4 | #model_name_or_path: Qwen/Qwen2.5-Math-7B 5 | model_revision: main 6 | torch_dtype: bfloat16 7 | attn_implementation: flash_attention_2 8 | 9 | # Data training arguments 10 | #dataset_name: DigitalLearningGmbH/MATH-lighteval 11 | dataset_name: "datas/MATH-lighteval" 12 | dataset_config: default 13 | system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags." 
14 | 15 | # GRPO trainer config 16 | beta: 0.0 17 | bf16: true 18 | use_vllm: false 19 | vllm_device: auto 20 | vllm_gpu_memory_utilization: 0.7 21 | do_eval: true 22 | eval_strategy: steps 23 | eval_steps: 100 24 | gradient_accumulation_steps: 4 25 | gradient_checkpointing: true 26 | gradient_checkpointing_kwargs: 27 | use_reentrant: false 28 | hub_model_id: Qwen-2.5-7B-Simple-RL 29 | hub_strategy: every_save 30 | learning_rate: 3.0e-06 31 | log_completions: false 32 | log_level: info 33 | logging_first_step: true 34 | logging_steps: 5 35 | logging_strategy: steps 36 | lr_scheduler_type: cosine 37 | max_prompt_length: 512 38 | max_completion_length: 1024 39 | max_steps: -1 40 | num_generations: 2 41 | num_train_epochs: 1 42 | output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-n2 43 | overwrite_output_dir: true 44 | per_device_eval_batch_size: 16 45 | per_device_train_batch_size: 16 46 | push_to_hub: false 47 | report_to: 48 | - tensorboard 49 | reward_funcs: 50 | - accuracy 51 | - format 52 | reward_weights: 53 | - 1.0 54 | - 1.0 55 | save_strategy: "no" 56 | seed: 42 57 | warmup_ratio: 0.1 58 | -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl_gpg_n2_wostd.yaml: -------------------------------------------------------------------------------- 1 | 2 | # Model arguments 3 | model_name_or_path: models/Qwen2.5-Math-7B 4 | #model_name_or_path: Qwen/Qwen2.5-Math-7B 5 | model_revision: main 6 | torch_dtype: bfloat16 7 | attn_implementation: flash_attention_2 8 | 9 | # Data training arguments 10 | #dataset_name: DigitalLearningGmbH/MATH-lighteval 11 | dataset_name: "datas/MATH-lighteval" 12 | dataset_config: default 13 | system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags." 
14 | 15 | # GRPO trainer config 16 | beta: 0.0 17 | bf16: true 18 | use_vllm: false 19 | vllm_device: auto 20 | vllm_gpu_memory_utilization: 0.7 21 | do_eval: true 22 | eval_strategy: steps 23 | eval_steps: 100 24 | gradient_accumulation_steps: 4 25 | gradient_checkpointing: true 26 | gradient_checkpointing_kwargs: 27 | use_reentrant: false 28 | hub_model_id: Qwen-2.5-7B-Simple-RL 29 | hub_strategy: every_save 30 | learning_rate: 3.0e-06 31 | log_completions: false 32 | log_level: info 33 | logging_first_step: true 34 | logging_steps: 5 35 | logging_strategy: steps 36 | lr_scheduler_type: cosine 37 | max_prompt_length: 512 38 | max_completion_length: 1024 39 | max_steps: -1 40 | num_generations: 2 41 | num_train_epochs: 1 42 | output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-n2-wostd 43 | overwrite_output_dir: true 44 | per_device_eval_batch_size: 16 45 | per_device_train_batch_size: 16 46 | push_to_hub: false 47 | report_to: 48 | - tensorboard 49 | reward_funcs: 50 | - accuracy 51 | - format 52 | reward_weights: 53 | - 1.0 54 | - 1.0 55 | save_strategy: "no" 56 | seed: 42 57 | warmup_ratio: 0.1 58 | scale_rewards: false -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl_gpg_n4.yaml: -------------------------------------------------------------------------------- 1 | 2 | # Model arguments 3 | model_name_or_path: models/Qwen2.5-Math-7B 4 | #model_name_or_path: Qwen/Qwen2.5-Math-7B 5 | model_revision: main 6 | torch_dtype: bfloat16 7 | attn_implementation: flash_attention_2 8 | 9 | # Data training arguments 10 | #dataset_name: DigitalLearningGmbH/MATH-lighteval 11 | dataset_name: "datas/MATH-lighteval" 12 | dataset_config: default 13 | system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags." 
14 | 15 | # GRPO trainer config 16 | beta: 0.0 17 | bf16: true 18 | use_vllm: false 19 | vllm_device: auto 20 | vllm_gpu_memory_utilization: 0.7 21 | do_eval: true 22 | eval_strategy: steps 23 | eval_steps: 100 24 | gradient_accumulation_steps: 4 25 | gradient_checkpointing: true 26 | gradient_checkpointing_kwargs: 27 | use_reentrant: false 28 | hub_model_id: Qwen-2.5-7B-Simple-RL 29 | hub_strategy: every_save 30 | learning_rate: 3.0e-06 31 | log_completions: false 32 | log_level: info 33 | logging_first_step: true 34 | logging_steps: 5 35 | logging_strategy: steps 36 | lr_scheduler_type: cosine 37 | max_prompt_length: 512 38 | max_completion_length: 1024 39 | max_steps: -1 40 | num_generations: 4 41 | num_train_epochs: 1 42 | output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-n4 43 | overwrite_output_dir: true 44 | per_device_eval_batch_size: 16 45 | per_device_train_batch_size: 16 46 | push_to_hub: false 47 | report_to: 48 | - tensorboard 49 | reward_funcs: 50 | - accuracy 51 | - format 52 | reward_weights: 53 | - 1.0 54 | - 1.0 55 | save_strategy: "no" 56 | seed: 42 57 | warmup_ratio: 0.1 58 | -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl_gpg_n4_wostd.yaml: -------------------------------------------------------------------------------- 1 | 2 | # Model arguments 3 | model_name_or_path: models/Qwen2.5-Math-7B 4 | #model_name_or_path: Qwen/Qwen2.5-Math-7B 5 | model_revision: main 6 | torch_dtype: bfloat16 7 | attn_implementation: flash_attention_2 8 | 9 | # Data training arguments 10 | #dataset_name: DigitalLearningGmbH/MATH-lighteval 11 | dataset_name: "datas/MATH-lighteval" 12 | dataset_config: default 13 | system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags." 
14 | 15 | # GRPO trainer config 16 | beta: 0.0 17 | bf16: true 18 | use_vllm: false 19 | vllm_device: auto 20 | vllm_gpu_memory_utilization: 0.7 21 | do_eval: true 22 | eval_strategy: steps 23 | eval_steps: 100 24 | gradient_accumulation_steps: 4 25 | gradient_checkpointing: true 26 | gradient_checkpointing_kwargs: 27 | use_reentrant: false 28 | hub_model_id: Qwen-2.5-7B-Simple-RL 29 | hub_strategy: every_save 30 | learning_rate: 3.0e-06 31 | log_completions: false 32 | log_level: info 33 | logging_first_step: true 34 | logging_steps: 5 35 | logging_strategy: steps 36 | lr_scheduler_type: cosine 37 | max_prompt_length: 512 38 | max_completion_length: 1024 39 | max_steps: -1 40 | num_generations: 4 41 | num_train_epochs: 1 42 | output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-n4-wostd 43 | overwrite_output_dir: true 44 | per_device_eval_batch_size: 16 45 | per_device_train_batch_size: 16 46 | push_to_hub: false 47 | report_to: 48 | - tensorboard 49 | reward_funcs: 50 | - accuracy 51 | - format 52 | reward_weights: 53 | - 1.0 54 | - 1.0 55 | save_strategy: "no" 56 | seed: 42 57 | warmup_ratio: 0.1 58 | scale_rewards: false -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl_gpg_n8.yaml: -------------------------------------------------------------------------------- 1 | 2 | # Model arguments 3 | model_name_or_path: models/Qwen2.5-Math-7B 4 | #model_name_or_path: Qwen/Qwen2.5-Math-7B 5 | model_revision: main 6 | torch_dtype: bfloat16 7 | attn_implementation: flash_attention_2 8 | 9 | # Data training arguments 10 | #dataset_name: DigitalLearningGmbH/MATH-lighteval 11 | dataset_name: "datas/MATH-lighteval" 12 | dataset_config: default 13 | system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags." 
14 | 15 | # GRPO trainer config 16 | beta: 0.0 17 | bf16: true 18 | use_vllm: false 19 | vllm_device: auto 20 | vllm_gpu_memory_utilization: 0.7 21 | do_eval: true 22 | eval_strategy: steps 23 | eval_steps: 100 24 | gradient_accumulation_steps: 4 25 | gradient_checkpointing: true 26 | gradient_checkpointing_kwargs: 27 | use_reentrant: false 28 | hub_model_id: Qwen-2.5-7B-Simple-RL 29 | hub_strategy: every_save 30 | learning_rate: 3.0e-06 31 | log_completions: false 32 | log_level: info 33 | logging_first_step: true 34 | logging_steps: 5 35 | logging_strategy: steps 36 | lr_scheduler_type: cosine 37 | max_prompt_length: 512 38 | max_completion_length: 1024 39 | max_steps: -1 40 | num_generations: 8 41 | num_train_epochs: 1 42 | output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-n8 43 | overwrite_output_dir: true 44 | per_device_eval_batch_size: 16 45 | per_device_train_batch_size: 16 46 | push_to_hub: false 47 | report_to: 48 | - tensorboard 49 | reward_funcs: 50 | - accuracy 51 | reward_weights: 52 | - 1.0 53 | save_strategy: "epoch" 54 | seed: 42 55 | warmup_ratio: 0.1 56 | 57 | -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl_gpg_scale_batch.yaml: -------------------------------------------------------------------------------- 1 | 2 | # Model arguments 3 | model_name_or_path: models/Qwen2.5-Math-7B 4 | #model_name_or_path: Qwen/Qwen2.5-Math-7B 5 | model_revision: main 6 | torch_dtype: bfloat16 7 | attn_implementation: flash_attention_2 8 | 9 | # Data training arguments 10 | #dataset_name: DigitalLearningGmbH/MATH-lighteval 11 | dataset_name: "datas/MATH-lighteval" 12 | dataset_config: default 13 | system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags." 
14 | 15 | # GRPO trainer config 16 | beta: 0.0 17 | bf16: true 18 | use_vllm: false 19 | vllm_device: auto 20 | vllm_gpu_memory_utilization: 0.7 21 | do_eval: true 22 | eval_strategy: steps 23 | eval_steps: 100 24 | gradient_accumulation_steps: 4 25 | gradient_checkpointing: true 26 | gradient_checkpointing_kwargs: 27 | use_reentrant: false 28 | hub_model_id: Qwen-2.5-7B-Simple-RL 29 | hub_strategy: every_save 30 | learning_rate: 3.0e-06 31 | log_completions: false 32 | log_level: info 33 | logging_first_step: true 34 | logging_steps: 5 35 | logging_strategy: steps 36 | lr_scheduler_type: cosine 37 | max_prompt_length: 512 38 | max_completion_length: 1024 39 | max_steps: -1 40 | num_generations: 8 41 | num_train_epochs: 1 42 | output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-scale-batch 43 | overwrite_output_dir: true 44 | per_device_eval_batch_size: 16 45 | per_device_train_batch_size: 16 46 | push_to_hub: false 47 | report_to: 48 | - tensorboard 49 | reward_funcs: 50 | - accuracy 51 | - format 52 | reward_weights: 53 | - 1.0 54 | - 1.0 55 | save_strategy: "no" 56 | seed: 42 57 | warmup_ratio: 0.1 58 | scale_batch: true 59 | -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl_math_l35.yaml: -------------------------------------------------------------------------------- 1 | 2 | # Model arguments 3 | model_name_or_path: models/Qwen2.5-Math-7B 4 | #model_name_or_path: Qwen/Qwen2.5-Math-7B 5 | model_revision: main 6 | torch_dtype: bfloat16 7 | attn_implementation: flash_attention_2 8 | 9 | # Data training arguments 10 | #dataset_name: DigitalLearningGmbH/MATH-lighteval 11 | dataset_name: "datas/SimpleRL-Zoo-Data/simplelr_qwen_level3to5" 12 | dataset_config: "train.parquet" 13 | system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags." 
14 | # GRPO trainer config 15 | beta: 0.0 16 | bf16: true 17 | use_vllm: false 18 | vllm_device: auto 19 | vllm_gpu_memory_utilization: 0.7 20 | do_eval: true 21 | eval_strategy: steps 22 | eval_steps: 100 23 | gradient_accumulation_steps: 4 24 | gradient_checkpointing: true 25 | gradient_checkpointing_kwargs: 26 | use_reentrant: false 27 | hub_model_id: Qwen-2.5-7B-Simple-RL 28 | hub_strategy: every_save 29 | learning_rate: 3.0e-06 30 | log_completions: false 31 | log_level: info 32 | logging_first_step: true 33 | logging_steps: 5 34 | logging_strategy: steps 35 | lr_scheduler_type: cosine 36 | max_prompt_length: 512 37 | max_completion_length: 3000 38 | max_steps: -1 39 | num_generations: 8 40 | num_train_epochs: 1 41 | output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-math35 42 | overwrite_output_dir: true 43 | per_device_eval_batch_size: 16 44 | per_device_train_batch_size: 16 45 | push_to_hub: false 46 | report_to: 47 | - tensorboard 48 | reward_funcs: 49 | - accuracy_lv35 50 | - format 51 | reward_weights: 52 | - 1.0 53 | - 1.0 54 | save_strategy: "no" 55 | seed: 42 56 | warmup_ratio: 0.1 57 | -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl_math_l35_v1.yaml: -------------------------------------------------------------------------------- 1 | 2 | # Model arguments 3 | model_name_or_path: models/Qwen2.5-Math-7B 4 | #model_name_or_path: Qwen/Qwen2.5-Math-7B 5 | model_revision: main 6 | torch_dtype: bfloat16 7 | attn_implementation: flash_attention_2 8 | 9 | # Data training arguments 10 | #dataset_name: DigitalLearningGmbH/MATH-lighteval 11 | dataset_name: "datas/SimpleRL-Zoo-Data/simplelr_qwen_level3to5" 12 | dataset_config: "train.parquet" 13 | system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags." 
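The `format` reward in the recipe above only checks the shape of the output, not its correctness. A minimal sketch, assuming the usual `<think>…</think><answer>…</answer>` convention of R1-style recipes; the exact tags and regex are assumptions, not the repo's code:

```python
import re

# Minimal sketch of a format reward: 1.0 if the completion matches the expected
# <think>...</think><answer>...</answer> layout, else 0.0. Tag convention and
# regex are assumptions for illustration, not the repo's exact implementation.
FORMAT_RE = re.compile(r"^<think>.*?</think>\s*<answer>.*?</answer>$", re.DOTALL)

def format_reward(completions):
    return [1.0 if FORMAT_RE.match(c.strip()) else 0.0 for c in completions]

print(format_reward(["<think>2+2=4</think><answer>4</answer>", "just 4"]))  # [1.0, 0.0]
```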
14 | # GRPO trainer config 15 | beta: 0.0 16 | bf16: true 17 | use_vllm: false 18 | vllm_device: auto 19 | vllm_gpu_memory_utilization: 0.7 20 | do_eval: true 21 | eval_strategy: steps 22 | eval_steps: 33 23 | gradient_accumulation_steps: 1 24 | gradient_checkpointing: true 25 | gradient_checkpointing_kwargs: 26 | use_reentrant: false 27 | hub_model_id: Qwen-2.5-7B-Simple-RL 28 | hub_strategy: every_save 29 | learning_rate: 1.0e-06 30 | log_completions: false 31 | log_level: info 32 | logging_first_step: true 33 | logging_steps: 1 34 | logging_strategy: steps 35 | lr_scheduler_type: constant_with_warmup 36 | max_prompt_length: 1024 37 | max_completion_length: 3000 38 | max_steps: -1 39 | num_generations: 8 40 | num_train_epochs: 3 41 | output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-math35-v1 42 | overwrite_output_dir: true 43 | per_device_eval_batch_size: 16 44 | per_device_train_batch_size: 8 45 | push_to_hub: false 46 | report_to: 47 | - tensorboard 48 | reward_funcs: 49 | - accuracy_lv35 50 | reward_weights: 51 | - 1.0 52 | save_strategy: "steps" 53 | save_steps: 33 54 | seed: 42 55 | warmup_ratio: 0.03 56 | temperature: 1.0 57 | top_p : 1.0 58 | 59 | -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl_math_l35_v2.yaml: -------------------------------------------------------------------------------- 1 | 2 | # Model arguments 3 | model_name_or_path: models/Qwen2.5-Math-7B 4 | #model_name_or_path: Qwen/Qwen2.5-Math-7B 5 | model_revision: main 6 | torch_dtype: bfloat16 7 | attn_implementation: flash_attention_2 8 | 9 | # Data training arguments 10 | #dataset_name: DigitalLearningGmbH/MATH-lighteval 11 | dataset_name: "datas/SimpleRL-Zoo-Data/simplelr_qwen_level3to5" 12 | dataset_config: "train.parquet" 13 | system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags." 
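With `num_generations: 8` and `per_device_train_batch_size: 8`, each device holds exactly one prompt group per forward pass, and the number of distinct prompts per optimizer step follows directly. A worked check, assuming the 8-process accelerate configs shipped later in this repo:

```python
# Worked check of the v1 recipe's batch shape (8 processes assumed, as in the
# accelerate_configs further down this listing).
num_processes = 8
per_device_train_batch_size = 8
gradient_accumulation_steps = 1
num_generations = 8

completions_per_step = num_processes * per_device_train_batch_size * gradient_accumulation_steps
assert completions_per_step % num_generations == 0  # the trainer requires divisibility
unique_prompts_per_step = completions_per_step // num_generations
print(unique_prompts_per_step)  # 8 prompt groups per optimizer step
```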
14 | # GRPO trainer config 15 | beta: 0.0 16 | bf16: true 17 | use_vllm: false 18 | vllm_device: auto 19 | vllm_gpu_memory_utilization: 0.7 20 | do_eval: true 21 | eval_strategy: steps 22 | eval_steps: 100 23 | gradient_accumulation_steps: 1 24 | gradient_checkpointing: true 25 | gradient_checkpointing_kwargs: 26 | use_reentrant: false 27 | hub_model_id: Qwen-2.5-7B-Simple-RL 28 | hub_strategy: every_save 29 | learning_rate: 1.0e-06 30 | log_completions: false 31 | log_level: info 32 | logging_first_step: true 33 | logging_steps: 5 34 | logging_strategy: steps 35 | lr_scheduler_type: constant_with_warmup 36 | max_prompt_length: 1024 37 | max_completion_length: 3000 38 | max_steps: -1 39 | num_generations: 8 40 | num_train_epochs: 1 41 | output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-math35-v2 42 | overwrite_output_dir: true 43 | per_device_eval_batch_size: 16 44 | per_device_train_batch_size: 4 45 | push_to_hub: false 46 | report_to: 47 | - tensorboard 48 | reward_funcs: 49 | - accuracy_lv35 50 | - format 51 | reward_weights: 52 | - 1.0 53 | - 1.0 54 | save_strategy: "no" 55 | seed: 42 56 | warmup_ratio: 0.03 57 | temperature: 1.0 58 | top_p : 1.0 59 | scale_rewards: false 60 | 61 | -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl_math_l35_v2_g16.yaml: -------------------------------------------------------------------------------- 1 | 2 | # Model arguments 3 | model_name_or_path: models/Qwen2.5-Math-7B 4 | #model_name_or_path: Qwen/Qwen2.5-Math-7B 5 | model_revision: main 6 | torch_dtype: bfloat16 7 | attn_implementation: flash_attention_2 8 | 9 | # Data training arguments 10 | #dataset_name: DigitalLearningGmbH/MATH-lighteval 11 | dataset_name: "datas/SimpleRL-Zoo-Data/simplelr_qwen_level3to5" 12 | dataset_config: "train.parquet" 13 | system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags." 
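`scale_rewards: false` in the recipe above means group advantages are mean-centred but not divided by the group's reward standard deviation. A sketch of the difference, illustrative rather than the trainer's code:

```python
import statistics

# Sketch of group-relative advantages with and without std scaling
# (scale_rewards: true vs. false); illustrative, not the trainer's implementation.
def group_advantages(rewards, scale_rewards=False, eps=1e-4):
    mean = sum(rewards) / len(rewards)
    advantages = [r - mean for r in rewards]
    if scale_rewards:
        std = statistics.pstdev(rewards)
        advantages = [a / (std + eps) for a in advantages]
    return advantages

rewards = [1.0, 0.0, 0.0, 1.0]
print(group_advantages(rewards, scale_rewards=True))   # scaled by the group std
print(group_advantages(rewards, scale_rewards=False))  # mean-centred only
```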
14 | # GRPO trainer config 15 | beta: 0.0 16 | bf16: true 17 | use_vllm: false 18 | vllm_device: auto 19 | vllm_gpu_memory_utilization: 0.7 20 | do_eval: true 21 | eval_strategy: steps 22 | eval_steps: 100 23 | gradient_accumulation_steps: 1 24 | gradient_checkpointing: true 25 | gradient_checkpointing_kwargs: 26 | use_reentrant: false 27 | hub_model_id: Qwen-2.5-7B-Simple-RL 28 | hub_strategy: every_save 29 | learning_rate: 1.0e-06 30 | log_completions: false 31 | log_level: info 32 | logging_first_step: true 33 | logging_steps: 5 34 | logging_strategy: steps 35 | lr_scheduler_type: constant_with_warmup 36 | max_prompt_length: 1024 37 | max_completion_length: 3000 38 | max_steps: -1 39 | num_generations: 16 40 | num_train_epochs: 1 41 | output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-math35-v2-g16 42 | overwrite_output_dir: true 43 | per_device_eval_batch_size: 16 44 | per_device_train_batch_size: 8 45 | push_to_hub: false 46 | report_to: 47 | - tensorboard 48 | reward_funcs: 49 | - accuracy_lv35 50 | - format 51 | reward_weights: 52 | - 1.0 53 | - 1.0 54 | save_strategy: "no" 55 | seed: 42 56 | warmup_ratio: 0.03 57 | temperature: 1.0 58 | top_p : 1.0 59 | scale_rewards: false 60 | 61 | -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl_v1.yaml: -------------------------------------------------------------------------------- 1 | 2 | # Model arguments 3 | model_name_or_path: models/Qwen2.5-Math-7B 4 | #model_name_or_path: Qwen/Qwen2.5-Math-7B 5 | model_revision: main 6 | torch_dtype: bfloat16 7 | attn_implementation: flash_attention_2 8 | 9 | # Data training arguments 10 | #dataset_name: DigitalLearningGmbH/MATH-lighteval 11 | dataset_name: "datas/MATH-lighteval" 12 | dataset_config: default 13 | system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags." 
14 | 15 | # GRPO trainer config 16 | beta: 0.0 17 | bf16: true 18 | use_vllm: false 19 | vllm_device: auto 20 | vllm_gpu_memory_utilization: 0.7 21 | do_eval: true 22 | eval_strategy: steps 23 | eval_steps: 100 24 | gradient_accumulation_steps: 4 25 | gradient_checkpointing: true 26 | gradient_checkpointing_kwargs: 27 | use_reentrant: false 28 | hub_model_id: Qwen-2.5-7B-Simple-RL 29 | hub_strategy: every_save 30 | learning_rate: 3.0e-06 31 | log_completions: false 32 | log_level: info 33 | logging_first_step: true 34 | logging_steps: 5 35 | logging_strategy: steps 36 | lr_scheduler_type: cosine 37 | max_prompt_length: 512 38 | max_completion_length: 1024 39 | max_steps: -1 40 | num_generations: 8 41 | num_train_epochs: 1 42 | output_dir: data/Qwen-2.5-7B-Simple-RL-GPG 43 | overwrite_output_dir: true 44 | per_device_eval_batch_size: 16 45 | per_device_train_batch_size: 16 46 | push_to_hub: false 47 | report_to: 48 | - tensorboard 49 | reward_funcs: 50 | - accuracy 51 | - format 52 | reward_weights: 53 | - 1.0 54 | - 1.0 55 | save_strategy: "no" 56 | seed: 42 57 | warmup_ratio: 0.1 58 | -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl_v1_kl.yaml: -------------------------------------------------------------------------------- 1 | 2 | # Model arguments 3 | model_name_or_path: models/Qwen2.5-Math-7B 4 | #model_name_or_path: Qwen/Qwen2.5-Math-7B 5 | model_revision: main 6 | torch_dtype: bfloat16 7 | attn_implementation: flash_attention_2 8 | 9 | # Data training arguments 10 | #dataset_name: DigitalLearningGmbH/MATH-lighteval 11 | dataset_name: "datas/MATH-lighteval" 12 | dataset_config: default 13 | system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags." 
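The `accuracy` reward compares the model's final answer against the reference solution. A simplified sketch that extracts a `\boxed{}` answer and compares it as a string; the real reward relies on proper math verification, so treat this only as an illustration of the interface:

```python
import re

# Simplified accuracy reward: extract the last \boxed{...} and compare as strings.
# The actual reward uses symbolic math verification; this only illustrates the
# interface (completions + ground-truth solutions -> 0/1 rewards).
def extract_boxed(text):
    matches = re.findall(r"\\boxed\{([^{}]*)\}", text)
    return matches[-1].strip() if matches else None

def accuracy_reward(completions, solutions):
    return [
        1.0 if extract_boxed(c) is not None and extract_boxed(c) == extract_boxed(s) else 0.0
        for c, s in zip(completions, solutions)
    ]

print(accuracy_reward(["... so the answer is \\boxed{42}"], ["\\boxed{42}"]))  # [1.0]
```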
14 | 15 | # GRPO trainer config 16 | bf16: true 17 | use_vllm: false 18 | vllm_device: auto 19 | vllm_gpu_memory_utilization: 0.7 20 | do_eval: true 21 | eval_strategy: steps 22 | eval_steps: 100 23 | gradient_accumulation_steps: 4 24 | gradient_checkpointing: true 25 | gradient_checkpointing_kwargs: 26 | use_reentrant: false 27 | hub_model_id: Qwen-2.5-7B-Simple-RL 28 | hub_strategy: every_save 29 | learning_rate: 3.0e-06 30 | log_completions: false 31 | log_level: info 32 | logging_first_step: true 33 | logging_steps: 5 34 | logging_strategy: steps 35 | lr_scheduler_type: cosine 36 | max_prompt_length: 512 37 | max_completion_length: 1024 38 | max_steps: -1 39 | num_generations: 8 40 | num_train_epochs: 1 41 | output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-beta04 42 | overwrite_output_dir: true 43 | per_device_eval_batch_size: 16 44 | per_device_train_batch_size: 16 45 | push_to_hub: false 46 | report_to: 47 | - tensorboard 48 | reward_funcs: 49 | - accuracy 50 | - format 51 | reward_weights: 52 | - 1.0 53 | - 1.0 54 | save_strategy: "no" 55 | seed: 42 56 | warmup_ratio: 0.1 57 | -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl_v1_nostd.yaml: -------------------------------------------------------------------------------- 1 | 2 | # Model arguments 3 | model_name_or_path: models/Qwen2.5-Math-7B 4 | #model_name_or_path: Qwen/Qwen2.5-Math-7B 5 | model_revision: main 6 | torch_dtype: bfloat16 7 | attn_implementation: flash_attention_2 8 | 9 | # Data training arguments 10 | #dataset_name: DigitalLearningGmbH/MATH-lighteval 11 | dataset_name: "datas/MATH-lighteval" 12 | dataset_config: default 13 | system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags." 
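Unlike the other recipes here, the `_kl` variant keeps a KL penalty towards the frozen reference policy; `beta` is the coefficient on that penalty, and setting it to 0.0 (as the other recipes do) effectively removes the reference model from the objective. Schematically, per token:

```python
import math

# Schematic per-token objective with a KL penalty towards a frozen reference
# policy. `beta` corresponds to the `beta` field of these recipes; the value
# below is arbitrary. Illustrative only, not the trainer's implementation.
def per_token_loss(logp, ref_logp, advantage, beta):
    pg_term = -advantage  # policy-gradient part; on-policy importance ratio is 1
    kl_term = math.exp(ref_logp - logp) - (ref_logp - logp) - 1.0  # k3 KL estimator
    return pg_term + beta * kl_term

print(per_token_loss(logp=-1.2, ref_logp=-1.0, advantage=0.5, beta=0.04))
```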
14 | 15 | # GRPO trainer config 16 | beta: 0.0 17 | bf16: true 18 | use_vllm: false 19 | vllm_device: auto 20 | vllm_gpu_memory_utilization: 0.7 21 | do_eval: true 22 | eval_strategy: steps 23 | eval_steps: 100 24 | gradient_accumulation_steps: 4 25 | gradient_checkpointing: true 26 | gradient_checkpointing_kwargs: 27 | use_reentrant: false 28 | hub_model_id: Qwen-2.5-7B-Simple-RL 29 | hub_strategy: every_save 30 | learning_rate: 3.0e-06 31 | log_completions: false 32 | log_level: info 33 | logging_first_step: true 34 | logging_steps: 5 35 | logging_strategy: steps 36 | lr_scheduler_type: cosine 37 | max_prompt_length: 512 38 | max_completion_length: 1024 39 | max_steps: -1 40 | num_generations: 8 41 | num_train_epochs: 1 42 | output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-wostd 43 | overwrite_output_dir: true 44 | per_device_eval_batch_size: 16 45 | per_device_train_batch_size: 16 46 | push_to_hub: false 47 | report_to: 48 | - tensorboard 49 | reward_funcs: 50 | - accuracy 51 | - format 52 | reward_weights: 53 | - 1.0 54 | - 1.0 55 | save_strategy: "no" 56 | seed: 42 57 | warmup_ratio: 0.1 58 | scale_rewards: false -------------------------------------------------------------------------------- /open-r1/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl_wokl.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | #model_name_or_path: Qwen/Qwen2.5-Math-7B 3 | model_name_or_path: models/Qwen2.5-Math-7B 4 | 5 | model_revision: main 6 | torch_dtype: bfloat16 7 | attn_implementation: flash_attention_2 8 | 9 | # Data training arguments 10 | #dataset_name: DigitalLearningGmbH/MATH-lighteval 11 | dataset_name: "datas/MATH-lighteval" 12 | dataset_config: default 13 | system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags." 
14 | 15 | # GRPO trainer config 16 | bf16: true 17 | use_vllm: false 18 | do_eval: true 19 | eval_strategy: steps 20 | eval_steps: 100 21 | gradient_accumulation_steps: 2 22 | gradient_checkpointing: true 23 | gradient_checkpointing_kwargs: 24 | use_reentrant: false 25 | hub_model_id: Qwen-2.5-7B-Simple-RL 26 | hub_strategy: every_save 27 | learning_rate: 3.0e-06 28 | log_completions: false 29 | log_level: info 30 | logging_first_step: true 31 | logging_steps: 5 32 | logging_strategy: steps 33 | lr_scheduler_type: cosine 34 | max_prompt_length: 512 35 | max_completion_length: 1024 36 | max_steps: -1 37 | num_generations: 8 38 | num_train_epochs: 1 39 | output_dir: data/Qwen-2.5-7B-Simple-RL-TRPO-wokl 40 | overwrite_output_dir: true 41 | per_device_eval_batch_size: 16 42 | per_device_train_batch_size: 16 43 | push_to_hub: false 44 | report_to: 45 | - tensorboard 46 | reward_funcs: 47 | - accuracy 48 | - format 49 | reward_weights: 50 | - 1.0 51 | - 1.0 52 | save_strategy: "no" 53 | seed: 42 54 | warmup_ratio: 0.1 -------------------------------------------------------------------------------- /open-r1/recipes/README.md: -------------------------------------------------------------------------------- 1 | # Post-training recipes 2 | 3 | ## OlympicCoder 4 | 5 | To train the OlympicCoder models, run: 6 | 7 | ``` 8 | # 7B 9 | sbatch --nodes=1 slurm/train.slurm OlympicCoder-7B sft v00.00 zero3 10 | 11 | # 32B 12 | sbatch --nodes=16 slurm/train.slurm OlympicCoder-32B sft v00.00 fsdp 13 | ``` 14 | 15 | Note that we found it necessary to switch to FSDP1 and paged AdamW 8-bit for the 32B model in order to fit the largest possible context size. -------------------------------------------------------------------------------- /open-r1/recipes/SmolLM2-1.7B-Instruct/sft/config.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | # You can download the model and manually change the rope to 300k/500k and max_position_embeddings to 32768 3 | model_name_or_path: HuggingFaceTB/SmolLM2-1.7B-Instruct 4 | model_revision: main 5 | torch_dtype: bfloat16 6 | attn_implementation: sdpa 7 | 8 | # Data training arguments 9 | dataset_name: open-r1/OpenR1-Math-220k 10 | dataset_num_proc: 48 11 | 12 | #SFT hyperparam 13 | max_length: 8192 # You can set this to 32768 if you change the rope, but you need to change the config.json file 14 | weight_decay: 0.0001 15 | optim: adamw_torch 16 | lr_scheduler_type: linear 17 | warmup_ratio: 0.1 18 | learning_rate: 5.0e-05 19 | gradient_accumulation_steps: 2 20 | per_device_eval_batch_size: 4 21 | per_device_train_batch_size: 4 # Change this depending on the context length of the model to keep a 500M GBS. 
22 | 23 | # SFT trainer config 24 | max_steps: -1 25 | num_train_epochs: 3 26 | bf16: true 27 | do_eval: false 28 | eval_strategy: 'no' 29 | gradient_checkpointing: true 30 | gradient_checkpointing_kwargs: 31 | use_reentrant: false 32 | hub_model_id: OpenR1-Qwen-7B-SFT 33 | hub_strategy: every_save 34 | log_level: info 35 | logging_steps: 5 36 | logging_strategy: steps 37 | packing: true 38 | output_dir: data/OpenR1-Qwen-7B-SFT 39 | overwrite_output_dir: true 40 | push_to_hub: true 41 | report_to: 42 | - wandb 43 | save_strategy: "steps" 44 | save_steps: 500 45 | save_total_limit: 1 46 | seed: 42 47 | -------------------------------------------------------------------------------- /open-r1/recipes/SmolLM2-1.7B/sft/config.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | # You can download the model and manually change the rope to 300k/500k and max_position_embeddings to 32768 3 | model_name_or_path: HuggingFaceTB/SmolLM2-1.7B 4 | model_revision: main 5 | torch_dtype: bfloat16 6 | attn_implementation: sdpa 7 | 8 | # Data training arguments 9 | dataset_name: open-r1/OpenR1-Math-220k 10 | dataset_num_proc: 48 11 | 12 | #SFT hyperparam 13 | max_length: 8192 # You can set this to 32768 if you change the rope, but you need to change the config.json file 14 | weight_decay: 0.0001 15 | optim: adamw_torch 16 | lr_scheduler_type: linear 17 | warmup_ratio: 0.1 18 | learning_rate: 5.0e-05 19 | gradient_accumulation_steps: 2 20 | per_device_eval_batch_size: 4 21 | per_device_train_batch_size: 4 # Change this depending on the context length of the model to keep a 500M GBS. 22 | 23 | # SFT trainer config 24 | max_steps: -1 25 | num_train_epochs: 3 26 | bf16: true 27 | do_eval: false 28 | eval_strategy: 'no' 29 | gradient_checkpointing: true 30 | gradient_checkpointing_kwargs: 31 | use_reentrant: false 32 | hub_model_id: OpenR1-Qwen-7B-SFT 33 | hub_strategy: every_save 34 | log_level: info 35 | logging_steps: 5 36 | logging_strategy: steps 37 | packing: true 38 | output_dir: data/OpenR1-Qwen-7B-SFT 39 | overwrite_output_dir: true 40 | push_to_hub: true 41 | report_to: 42 | - wandb 43 | save_strategy: "steps" 44 | save_steps: 500 45 | save_total_limit: 1 46 | seed: 42 47 | -------------------------------------------------------------------------------- /open-r1/recipes/accelerate_configs/ddp.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | distributed_type: MULTI_GPU 4 | downcast_bf16: 'no' 5 | gpu_ids: all 6 | machine_rank: 0 7 | main_training_function: main 8 | mixed_precision: bf16 9 | num_machines: 1 10 | num_processes: 8 11 | rdzv_backend: static 12 | same_network: true 13 | tpu_env: [] 14 | tpu_use_cluster: false 15 | tpu_use_sudo: false 16 | use_cpu: false 17 | -------------------------------------------------------------------------------- /open-r1/recipes/accelerate_configs/fsdp.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | distributed_type: FSDP 4 | downcast_bf16: 'no' 5 | enable_cpu_affinity: false 6 | fsdp_config: 7 | fsdp_activation_checkpointing: false # Need fix from: https://github.com/huggingface/transformers/pull/36610 8 | fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP 9 | fsdp_backward_prefetch: BACKWARD_PRE 10 | fsdp_cpu_ram_efficient_loading: true 11 | fsdp_forward_prefetch: true 12 | fsdp_offload_params: false 13 | 
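The comments in the SmolLM2 recipes above about "changing the rope to 300k/500k" and raising `max_position_embeddings` to 32768 refer to editing the downloaded checkpoint's `config.json` before training. One way to do that, assuming the rope change means `rope_theta` and using an illustrative local path:

```python
import json

# Patch a locally downloaded SmolLM2 config for long-context SFT, following the
# recipe comment: raise rope_theta (e.g. 300k or 500k) and max_position_embeddings
# to 32768. Path and values are illustrative assumptions.
config_path = "models/SmolLM2-1.7B-Instruct/config.json"
with open(config_path) as f:
    cfg = json.load(f)

cfg["rope_theta"] = 500_000
cfg["max_position_embeddings"] = 32_768

with open(config_path, "w") as f:
    json.dump(cfg, f, indent=2)
```

After patching, `max_length` in the SFT recipe can be raised to 32768 as the comment suggests.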
fsdp_sharding_strategy: FULL_SHARD 14 | fsdp_state_dict_type: FULL_STATE_DICT 15 | fsdp_sync_module_states: true 16 | fsdp_use_orig_params: true 17 | machine_rank: 0 18 | main_training_function: main 19 | mixed_precision: bf16 20 | num_machines: 1 21 | num_processes: 8 22 | rdzv_backend: static 23 | same_network: true 24 | tpu_env: [] 25 | tpu_use_cluster: false 26 | tpu_use_sudo: false 27 | use_cpu: false -------------------------------------------------------------------------------- /open-r1/recipes/accelerate_configs/zero1.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | deepspeed_multinode_launcher: standard 5 | offload_optimizer_device: none 6 | offload_param_device: none 7 | zero3_init_flag: false 8 | zero_stage: 1 9 | distributed_type: DEEPSPEED 10 | downcast_bf16: 'no' 11 | machine_rank: 0 12 | main_training_function: main 13 | mixed_precision: bf16 14 | num_machines: 1 15 | num_processes: 8 16 | rdzv_backend: static 17 | same_network: true 18 | tpu_env: [] 19 | tpu_use_cluster: false 20 | tpu_use_sudo: false 21 | use_cpu: false -------------------------------------------------------------------------------- /open-r1/recipes/accelerate_configs/zero2.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | deepspeed_multinode_launcher: standard 5 | offload_optimizer_device: none 6 | offload_param_device: none 7 | zero3_init_flag: false 8 | zero_stage: 2 9 | distributed_type: DEEPSPEED 10 | downcast_bf16: 'no' 11 | machine_rank: 0 12 | main_training_function: main 13 | mixed_precision: bf16 14 | num_machines: 1 15 | num_processes: 8 16 | rdzv_backend: static 17 | same_network: true 18 | tpu_env: [] 19 | tpu_use_cluster: false 20 | tpu_use_sudo: false 21 | use_cpu: false -------------------------------------------------------------------------------- /open-r1/recipes/accelerate_configs/zero3.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | deepspeed_multinode_launcher: standard 5 | offload_optimizer_device: none 6 | offload_param_device: none 7 | zero3_init_flag: true 8 | zero3_save_16bit_model: true 9 | zero_stage: 3 10 | distributed_type: DEEPSPEED 11 | downcast_bf16: 'no' 12 | machine_rank: 0 13 | main_training_function: main 14 | mixed_precision: bf16 15 | num_machines: 1 16 | num_processes: 8 17 | rdzv_backend: static 18 | same_network: true 19 | tpu_env: [] 20 | tpu_use_cluster: false 21 | tpu_use_sudo: false 22 | use_cpu: false 23 | -------------------------------------------------------------------------------- /open-r1/scripts/get_tensor_parallel_size.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from transformers import AutoConfig 3 | from math import gcd 4 | 5 | def get_tensor_parallel_size(model_name: str, revision: str = None, default_tp: int = 8) -> int: 6 | try: 7 | config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=True) 8 | num_heads = getattr(config, 'num_attention_heads', None) 9 | 10 | if num_heads is not None and num_heads % default_tp != 0: 11 | tp = gcd(num_heads, default_tp) 12 | return max(tp, 1) 13 | else: 14 | return default_tp 15 | except Exception as e: 16 | print(f"Warning: Failed to fetch config for 
{model_name}@{revision}: {e}") 17 | return default_tp 18 | 19 | if __name__ == "__main__": 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument("--model_name", type=str, required=True, help="Hugging Face model name or path") 22 | parser.add_argument("--revision", type=str, default=None, help="Model revision if applicable") 23 | parser.add_argument("--default_tp", type=int, default=8, help="Default TP size (usually GPUs per node)") 24 | 25 | args = parser.parse_args() 26 | 27 | tp = get_tensor_parallel_size(args.model_name, args.revision, args.default_tp) 28 | print(tp) 29 | -------------------------------------------------------------------------------- /open-r1/scripts/upload_details.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2025 The HuggingFace Inc. team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ 16 | Push the details from a LightEval run to the Hub. 17 | 18 | Usage: 19 | 20 | python src/open_r1/utils/upload_details.py \ 21 | --data_files {path_to_parquet_file} \ 22 | --hub_repo_id {hub_repo_id} \ 23 | --config_name {config_name} 24 | """ 25 | 26 | from dataclasses import dataclass, field 27 | from typing import List 28 | 29 | from datasets import load_dataset 30 | from transformers import HfArgumentParser 31 | 32 | 33 | @dataclass 34 | class ScriptArguments: 35 | data_files: List[str] = field(default_factory=list) 36 | hub_repo_id: str = None 37 | config_name: str = None 38 | 39 | 40 | def main(): 41 | parser = HfArgumentParser(ScriptArguments) 42 | args = parser.parse_args_into_dataclasses()[0] 43 | 44 | if all(file.endswith(".json") for file in args.data_files): 45 | ds = load_dataset("json", data_files=args.data_files) 46 | elif all(file.endswith(".jsonl") for file in args.data_files): 47 | ds = load_dataset("json", data_files=args.data_files) 48 | else: 49 | ds = load_dataset("parquet", data_files=args.data_files) 50 | url = ds.push_to_hub(args.hub_repo_id, config_name=args.config_name, private=True) 51 | print(f"Dataset available at: {url}") 52 | 53 | 54 | if __name__ == "__main__": 55 | main() 56 | -------------------------------------------------------------------------------- /open-r1/setup.cfg: -------------------------------------------------------------------------------- 1 | [isort] 2 | default_section = FIRSTPARTY 3 | ensure_newline_before_comments = True 4 | force_grid_wrap = 0 5 | include_trailing_comma = True 6 | known_first_party = open_r1 7 | known_third_party = 8 | transformers 9 | datasets 10 | fugashi 11 | git 12 | h5py 13 | matplotlib 14 | nltk 15 | numpy 16 | packaging 17 | pandas 18 | psutil 19 | pytest 20 | rouge_score 21 | sacrebleu 22 | seqeval 23 | sklearn 24 | streamlit 25 | torch 26 | tqdm 27 | 28 | line_length = 119 29 | lines_after_imports = 2 30 | multi_line_output = 3 31 | use_parentheses = True 32 | 33 | [flake8] 34 | ignore = E203, E501, E741, W503, W605 35 | max-line-length = 119 36 | 
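For example, a model whose attention-head count is not divisible by the node's 8 GPUs gets a smaller tensor-parallel degree through the gcd fallback in the script above (the head count below is just an illustrative number):

```python
from math import gcd

# Worked example of the fallback in get_tensor_parallel_size.py:
# 28 attention heads on an 8-GPU node -> TP degree 4.
num_heads, default_tp = 28, 8
tp = gcd(num_heads, default_tp) if num_heads % default_tp != 0 else default_tp
print(tp)  # 4
```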
per-file-ignores = 37 | # imported but unused 38 | __init__.py: F401 39 | 40 | [tool:pytest] 41 | doctest_optionflags=NUMBER NORMALIZE_WHITESPACE ELLIPSIS -------------------------------------------------------------------------------- /open-r1/slurm/README.md: -------------------------------------------------------------------------------- 1 | ## Serving DeepSeek-R1 on 2x8 H100 SLURM nodes with SGLang 2 | 3 | 1. Set up the environment (adjust for your cuda version): 4 | ```bash 5 | conda create -n sglang124 python=3.11 6 | conda activate sglang124 7 | 8 | pip install torch==2.5.1 --index-url https://download.pytorch.org/whl/cu124 9 | 10 | pip install sgl-kernel --force-reinstall --no-deps 11 | pip install "sglang[all]>=0.4.2.post4" --find-links https://flashinfer.ai/whl/cu124/torch2.5/flashinfer/ 12 | ``` 13 | 14 | 2. Run the server and wait for the model to load: 15 | ```bash 16 | sbatch slurm/serve_r1.slurm -m "/fsx/deepseek-r1-checkpoint" -e "sglang124" 17 | ``` 18 | 19 | 3. Run the data generation script: 20 | ```bash 21 | python scripts/generate_reasoning.py \ 22 | --dataset-name "AI-MO/NuminaMath-1.5" \ 23 | --output-file "numinamath_r1_generations.jsonl" \ 24 | --prompt-column "problem" \ 25 | --uuid-column "problem" \ 26 | --api-addr ":39877" \ 27 | --num-generations 2 \ 28 | --max-tokens 16384 \ 29 | --max-concurrent 200 30 | ``` -------------------------------------------------------------------------------- /open-r1/slurm/piston/launch_piston_workers.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # this simple script will launch a bunch of piston workers on the HF science cluster 4 | 5 | N_INSTANCES=${1:-5} # Default to 5 instances 6 | 7 | for i in $(seq 1 $N_INSTANCES); do 8 | # Find random (hopefully) available port 9 | PORT=$(comm -23 <(seq 2000 10000 | sort) <(ss -tan | awk '{print $4}' | cut -d':' -f2 | sort -u) | shuf | head -n1) 10 | 11 | # the job name format is important for the code to then be able to get a list of workers. 
`piston-worker-` 12 | sbatch \ 13 | --job-name="piston-worker-$PORT" \ 14 | --export=ALL,PORT=$PORT \ 15 | slurm/piston/launch_single_piston.sh 16 | done -------------------------------------------------------------------------------- /open-r1/slurm/piston/launch_single_piston.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=piston_worker 3 | #SBATCH --output=/fsx/open-r1/logs/piston/worker-logs/%x-%j.out 4 | #SBATCH --error=/fsx/open-r1/logs/piston/worker-logs/%x-%j.out # Redirect error logs to .out 5 | #SBATCH --cpus-per-task=2 6 | #SBATCH --mem-per-cpu=1950M 7 | #SBATCH --partition=hopper-cpu 8 | #SBATCH --time=48:00:00 9 | 10 | # sometimes if a bunch of workers start at the same time pyxis dies 11 | sleep $(( RANDOM % 20 )) 12 | 13 | # mounting the packages folder lets us not have to manually install the package on each instance 14 | # we use 63b5654156a89c5a2ad281aface21416615d62ec056d88efe8fcd307ce73575a as the latest image requires isolate, which does not work on the HF science cluster (cgroups incompatibility) 15 | # feel free try with the latest image 16 | # the code you see below increases the very constrained piston default limits, and sets the repo url to the one hosting our IOI package 17 | srun --container-mounts=/fsx/guilherme/ioi2024/piston_files/packages:/piston/packages --container-image "ghcr.io#engineer-man/piston:sha256:63b5654156a89c5a2ad281aface21416615d62ec056d88efe8fcd307ce73575a" \ 18 | bash -c " 19 | export PISTON_COMPILE_TIMEOUT=60000 20 | export PISTON_RUN_TIMEOUT=60000 21 | export PISTON_OUTPUT_MAX_SIZE=1000000000 22 | export PISTON_MAX_FILE_SIZE=1000000000 23 | export PISTON_DISABLE_NETWORKING=true 24 | export PISTON_REPO_URL=https://github.com/guipenedo/piston/releases/download/pkgs/index 25 | 26 | sed -i '/app.use(body_parser.urlencoded/c\ app.use(body_parser.urlencoded({ extended: true, limit: \"512mb\" }));' src/index.js 27 | sed -i '/app.use(body_parser.json/c\ app.use(body_parser.json({ limit: \"512mb\" }));' src/index.js 28 | 29 | # Start server in background 30 | node src 31 | " 32 | -------------------------------------------------------------------------------- /open-r1/slurm/serve_router.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=r1-router 3 | #SBATCH --partition=hopper-cpu 4 | #SBATCH --qos=high 5 | #SBATCH --nodes=1 6 | #SBATCH --cpus-per-task=8 7 | #SBATCH --mem-per-cpu=1875m 8 | #SBATCH --output=./logs/%x_%j_%n.out 9 | #SBATCH --error=./logs/%x_%j_%n.err 10 | #SBATCH --time=30-00:00:00 11 | #SBATCH --requeue 12 | 13 | set -exuo pipefail 14 | 15 | # TODO: Adjust these variables to your cluster configuration 16 | CONDA_ENV="sglang124" 17 | ROUTER_PORT=39876 18 | 19 | trap 'scontrol requeue ${SLURM_JOB_ID}; exit 15' SIGUSR1 20 | 21 | while getopts "e:h" opt; do 22 | case $opt in 23 | e) CONDA_ENV="$OPTARG" ;; 24 | h|?) echo "Usage: sbatch $0 [-e CONDA_ENV]"; exit 1 ;; 25 | esac 26 | done 27 | 28 | # TODO: Environment setup, adjust to your cluster configuration 29 | source ~/.bashrc 30 | source "$CONDA_PREFIX/etc/profile.d/conda.sh" 31 | conda activate "$CONDA_ENV" || { echo "Failed to activate conda env $CONDA_ENV"; exit 1; } 32 | 33 | python -m sglang_router.launch_router \ 34 | --port "$ROUTER_PORT" \ 35 | --host 0.0.0.0 \ 36 | --worker-startup-timeout-secs 300 37 | 38 | # Keep the job running with health checks 39 | while true; do 40 | if ! 
curl -s -o /dev/null "http://localhost:$ROUTER_PORT/health"; then 41 | echo "Error: Router health check failed" 42 | exit 1 43 | fi 44 | sleep 300 45 | done -------------------------------------------------------------------------------- /open-r1/src/open_r1/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /open-r1/src/open_r1/test_dataset.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer 2 | import torch 3 | from open_r1.utils.data_utils import custom_loading_dataset 4 | 5 | # Load the pretrained tokenizer 6 | tokenizer = AutoTokenizer.from_pretrained("models/Qwen2.5-Math-7B") 7 | dataset = custom_loading_dataset("datas/SimpleRL-Zoo-Data/simplelr_qwen_level3to5", tokenizer=tokenizer) 8 | 9 | 10 | def make_conversation_math35(example): 11 | prompt = [] 12 | # prompt.append({"role": "user", "content": example["instruction"][0]['content']}) 13 | prompt = example["instruction"][0]['content'] 14 | # prompt.append({"role": "user", "content": example["problem"]}) 15 | return {"prompt": prompt} 16 | 17 | dataset = dataset.map(make_conversation_math35) 18 | 19 | # Initialize the running maximum length 20 | max_length = 0 21 | 22 | # Iterate over the dataset and compute the length of each sample 23 | for text in dataset['train']: 24 | # Encode the text with the tokenizer 25 | text = text['prompt'] 26 | print(text) 27 | inputs = tokenizer(text, return_tensors="pt", padding=False, truncation=False) 28 | # Get the input length 29 | length = inputs["input_ids"].shape[1] 30 | # Update the maximum length 31 | if length > max_length: 32 | max_length = length 33 | 34 | print(f"Maximum length after tokenization: {max_length}") -------------------------------------------------------------------------------- /open-r1/src/open_r1/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .import_utils import is_e2b_available 2 | from .model_utils import get_tokenizer 3 | 4 | 5 | __all__ = ["get_tokenizer", "is_e2b_available"] 6 | -------------------------------------------------------------------------------- /open-r1/src/open_r1/utils/import_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from transformers.utils.import_utils import _is_package_available 16 | 17 | 18 | # Use same as transformers.utils.import_utils 19 | _e2b_available = _is_package_available("e2b") 20 | 21 | 22 | def is_e2b_available() -> bool: 23 | return _e2b_available 24 | -------------------------------------------------------------------------------- /open-r1/src/open_r1/utils/ioi/__init__.py: -------------------------------------------------------------------------------- 1 | from .piston_client import get_piston_client_from_env, get_slurm_piston_endpoints 2 | from .scoring import SubtaskResult, score_subtask 3 | from .utils import add_includes 4 | 5 | 6 | __all__ = [ 7 | "get_piston_client_from_env", 8 | "get_slurm_piston_endpoints", 9 | "score_subtask", 10 | "add_includes", 11 | "SubtaskResult", 12 | ] 13 | -------------------------------------------------------------------------------- /open-r1/src/open_r1/utils/ioi/utils.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from functools import lru_cache 3 | from itertools import islice 4 | 5 | from datasets import load_dataset 6 | 7 | 8 | def add_includes(code: str, problem_id: str) -> str: 9 | """ 10 | Fix common compilation errors for IOI problems. 11 | """ 12 | if not code: 13 | return code 14 | # has most of the useful functions 15 | code_header = "#include \n" 16 | # include the problem header 17 | problem_header_include = f'#include "{problem_id}.h"' 18 | if problem_header_include not in code: 19 | code_header += problem_header_include + "\n" 20 | # use namespace std since models forget std:: often 21 | if "using namespace std;" not in code and "std::" not in code: 22 | code_header += "\nusing namespace std;\n\n" 23 | return code_header + code 24 | 25 | 26 | @lru_cache 27 | def load_ioi_tests_for_year(year: int) -> dict[str, dict[str, tuple[str, str]]]: 28 | """ 29 | Load IOI tests for a given year. 30 | """ 31 | tests_dataset = load_dataset("open-r1/ioi-test-cases", name=f"{year}", split="train") 32 | test_cases = defaultdict(dict) 33 | for test_case in tests_dataset: 34 | test_cases[test_case["problem_id"]][test_case["test_name"]] = test_case["test_input"], test_case["test_output"] 35 | return test_cases 36 | 37 | 38 | def load_ioi_tests(year: int, problem_id: str) -> dict[str, tuple[str, str]]: 39 | """ 40 | Load IOI tests for a given year and problem id. 41 | """ 42 | return load_ioi_tests_for_year(year)[problem_id] 43 | 44 | 45 | def batched(iterable, n): 46 | "Batch data into lists of length n. The last batch may be shorter." 
47 | # batched('ABCDEFG', 3) --> ABC DEF G 48 | if n < 1: 49 | return iterable 50 | it = iter(iterable) 51 | while batch := list(islice(it, n)): 52 | yield batch 53 | -------------------------------------------------------------------------------- /open-r1/src/open_r1/utils/model_utils.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer, PreTrainedTokenizer 2 | 3 | from trl import ModelConfig 4 | 5 | from ..configs import GRPOConfig, SFTConfig 6 | 7 | 8 | DEFAULT_CHAT_TEMPLATE = "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}" 9 | 10 | 11 | def get_tokenizer( 12 | model_args: ModelConfig, training_args: SFTConfig | GRPOConfig, auto_set_chat_template: bool = True 13 | ) -> PreTrainedTokenizer: 14 | """Get the tokenizer for the model.""" 15 | tokenizer = AutoTokenizer.from_pretrained( 16 | model_args.model_name_or_path, 17 | revision=model_args.model_revision, 18 | trust_remote_code=model_args.trust_remote_code, 19 | ) 20 | 21 | if training_args.chat_template is not None: 22 | tokenizer.chat_template = training_args.chat_template 23 | elif auto_set_chat_template and tokenizer.get_chat_template() is None: 24 | tokenizer.chat_template = DEFAULT_CHAT_TEMPLATE 25 | 26 | return tokenizer 27 | -------------------------------------------------------------------------------- /open-r1/src/open_r1/utils/wandb_logging.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | def init_wandb_training(training_args): 5 | """ 6 | Helper function for setting up Weights & Biases logging tools. 7 | """ 8 | if training_args.wandb_entity is not None: 9 | os.environ["WANDB_ENTITY"] = training_args.wandb_entity 10 | if training_args.wandb_project is not None: 11 | os.environ["WANDB_PROJECT"] = training_args.wandb_project 12 | -------------------------------------------------------------------------------- /open-r1/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AMAP-ML/GPG/e096da8ce527a1636272469cee383ba71ec4a511/open-r1/tests/__init__.py -------------------------------------------------------------------------------- /open-r1/tests/transformer_ds_qwen_15B_R1.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_params: 3 | model_args: "pretrained=models/DeepSeek-R1-Distill-Qwen-1.5B" # pretrained=model_name,trust_remote_code=boolean,revision=revision_to_use,model_parallel=True ... 4 | dtype: "bfloat16" 5 | compile: false 6 | merged_weights: # Ignore this section if you are not using PEFT models 7 | delta_weights: false # set to True of your model should be merged with a base model, also need to provide the base model name 8 | adapter_weights: false # set to True of your model has been trained with peft, also need to provide the base model name 9 | base_model: null # path to the base_model 10 | generation: 11 | # multichoice_continuations_start_space: null # If true/false, will force multiple choice continuations to start/not start with a space. 
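`get_tokenizer` above only installs the fallback `DEFAULT_CHAT_TEMPLATE` when neither the model nor the recipe supplies one; rendering then goes through the standard `apply_chat_template` call. A small usage sketch (the local model path mirrors the recipes elsewhere in this tree, the messages are illustrative):

```python
from transformers import AutoTokenizer

# Sketch: render a prompt with whatever chat template ends up installed
# (the model's own, the recipe's `chat_template`, or the fallback above).
tokenizer = AutoTokenizer.from_pretrained("models/Qwen2.5-Math-7B")
messages = [
    {"role": "system", "content": "You are a helpful AI Assistant..."},
    {"role": "user", "content": "What is 2 + 2?"},
]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)
```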
If none, will do nothing 12 | max_new_tokens: 4096 #32768 we use a small to control the infer speed. 13 | temperature: 0.6 14 | top_p: 0.95 15 | -------------------------------------------------------------------------------- /open-r1/tests/transformer_ds_qwen_15B_R1_retrain.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_params: 3 | model_args: "your_path/Qwen2.5-1.5B-Open-R1-GPG" # in fact this GRPO from scratch. 4 | dtype: "bfloat16" 5 | compile: false 6 | merged_weights: # Ignore this section if you are not using PEFT models 7 | delta_weights: false # set to True of your model should be merged with a base model, also need to provide the base model name 8 | adapter_weights: false # set to True of your model has been trained with peft, also need to provide the base model name 9 | base_model: null # path to the base_model 10 | generation: 11 | # multichoice_continuations_start_space: null # If true/false, will force multiple choice continuations to start/not start with a space. If none, will do nothing 12 | max_new_tokens: 4096 #32768 we use a small to control the infer speed. 13 | temperature: 0.6 14 | top_p: 0.95 15 | -------------------------------------------------------------------------------- /open-r1/train.sh: -------------------------------------------------------------------------------- 1 | export PYTHONPATH=src 2 | 3 | accelerate launch --config_file recipes/accelerate_configs/zero2.yaml \ 4 | --num_machines $WORLD_SIZE --machine_rank $RANK --num_processes=$GPUS --main_process_ip $MASTER_ADDR --main_process_port $MASTER_PORT \ 5 | src/open_r1/gpg.py --config recipes/Qwen2.5-Math-7B/grpo/config_simple_rl_math_l35_v1.yaml --output_dir Your_Path \ 6 | --save_strategy "epoch" --save_total_limit 5 --num_train_epochs 5 --gradient_accumulation_steps 4 --max_completion_length 2048 --max_prompt_length 768 \ 7 | --scale_rewards False --adjust_gd --min_inverse_alpha 0.5 --eval_strategy epoch \ -------------------------------------------------------------------------------- /open-rs/recipes/accelerate_configs/ddp.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | distributed_type: MULTI_GPU 4 | downcast_bf16: 'no' 5 | gpu_ids: all 6 | machine_rank: 0 7 | main_training_function: main 8 | mixed_precision: bf16 9 | num_machines: 1 10 | num_processes: 8 11 | rdzv_backend: static 12 | same_network: true 13 | tpu_env: [] 14 | tpu_use_cluster: false 15 | tpu_use_sudo: false 16 | use_cpu: false 17 | -------------------------------------------------------------------------------- /open-rs/recipes/accelerate_configs/fsdp.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | distributed_type: FSDP 4 | downcast_bf16: 'no' 5 | enable_cpu_affinity: false 6 | fsdp_config: 7 | fsdp_activation_checkpointing: false # Need fix from: https://github.com/huggingface/transformers/pull/36610 8 | fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP 9 | fsdp_backward_prefetch: BACKWARD_PRE 10 | fsdp_cpu_ram_efficient_loading: true 11 | fsdp_forward_prefetch: true 12 | fsdp_offload_params: false 13 | fsdp_sharding_strategy: FULL_SHARD 14 | fsdp_state_dict_type: FULL_STATE_DICT 15 | fsdp_sync_module_states: true 16 | fsdp_use_orig_params: true 17 | machine_rank: 0 18 | main_training_function: main 19 | mixed_precision: bf16 20 | num_machines: 1 21 | num_processes: 8 22 | 
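`train.sh` above layers CLI overrides (`--output_dir`, `--scale_rewards False`, `--adjust_gd`, `--min_inverse_alpha 0.5`, ...) on top of the YAML recipe passed via `--config`. A sketch of how that merge is typically done with TRL's parser; the dataclass choice is an assumption, and the repo's own `gpg.py` may wire this up differently:

```python
# Sketch of YAML-recipe + CLI-override parsing as invoked by train.sh
# (`... gpg.py --config recipe.yaml --output_dir ... --scale_rewards False ...`).
# The dataclasses used here are illustrative, not necessarily the repo's.
from trl import GRPOConfig, ModelConfig, TrlParser

parser = TrlParser((GRPOConfig, ModelConfig))
training_args, model_args = parser.parse_args_and_config()
print(training_args.output_dir, training_args.scale_rewards)
```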
rdzv_backend: static 23 | same_network: true 24 | tpu_env: [] 25 | tpu_use_cluster: false 26 | tpu_use_sudo: false 27 | use_cpu: false -------------------------------------------------------------------------------- /open-rs/recipes/accelerate_configs/zero2.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | deepspeed_multinode_launcher: standard 5 | offload_optimizer_device: none 6 | offload_param_device: none 7 | zero3_init_flag: false 8 | zero_stage: 2 9 | distributed_type: DEEPSPEED 10 | downcast_bf16: 'no' 11 | machine_rank: 0 12 | main_training_function: main 13 | mixed_precision: bf16 14 | num_machines: 1 15 | num_processes: 8 16 | rdzv_backend: static 17 | same_network: true 18 | tpu_env: [] 19 | tpu_use_cluster: false 20 | tpu_use_sudo: false 21 | use_cpu: false -------------------------------------------------------------------------------- /open-rs/recipes/accelerate_configs/zero3.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | deepspeed_multinode_launcher: standard 5 | offload_optimizer_device: none 6 | offload_param_device: none 7 | zero3_init_flag: true 8 | zero3_save_16bit_model: true 9 | zero_stage: 3 10 | distributed_type: DEEPSPEED 11 | downcast_bf16: 'no' 12 | machine_rank: 0 13 | main_training_function: main 14 | mixed_precision: bf16 15 | num_machines: 1 16 | num_processes: 8 17 | rdzv_backend: static 18 | same_network: true 19 | tpu_env: [] 20 | tpu_use_cluster: false 21 | tpu_use_sudo: false 22 | use_cpu: false 23 | -------------------------------------------------------------------------------- /open-rs/recipes/data_cleaner.yaml: -------------------------------------------------------------------------------- 1 | model_kwargs: 2 | model: Qwen/Qwen2.5-Math-7B-Instruct 3 | trust_remote_code: true 4 | max_model_len: 4096 5 | gpu_memory_utilization: 0.9 6 | enforce_eager: true 7 | tensor_parallel_size: 4 8 | 9 | sampling_params: 10 | temperature: 0.7 11 | top_p: 0.9 12 | max_tokens: 4096 13 | -------------------------------------------------------------------------------- /open-rs/recipes/gpg.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: models/DeepSeek-R1-Distill-Qwen-1.5B 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: datas/open-rs/open-s1 9 | system_prompt: "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer, and put your final answer within \\boxed{{}} . The reasoning process and answer are enclosed within and tags, respectively, i.e., reasoning process here answer here . Note that respond by English, NOT use other languages." 
10 | 11 | # GRPO trainer config 12 | bf16: true 13 | use_vllm: false 14 | vllm_device: auto 15 | vllm_enforce_eager: true 16 | vllm_gpu_memory_utilization: 0.7 17 | vllm_max_model_len: 4608 18 | do_eval: false 19 | gradient_accumulation_steps: 4 20 | gradient_checkpointing: true 21 | gradient_checkpointing_kwargs: 22 | use_reentrant: false 23 | hub_model_id: OpenRS-GRPO 24 | hub_strategy: every_save 25 | learning_rate: 1.0e-06 26 | log_completions: true 27 | log_level: info 28 | logging_first_step: true 29 | logging_steps: 1 30 | logging_strategy: steps 31 | lr_scheduler_type: cosine_with_min_lr 32 | lr_scheduler_kwargs: 33 | min_lr_rate: 0.1 34 | max_prompt_length: 512 35 | max_completion_length: 3584 36 | max_steps: 100 37 | num_generations: 6 38 | num_train_epochs: 1 39 | output_dir: data/OpenRS-GPG-RS1 40 | overwrite_output_dir: true 41 | per_device_eval_batch_size: 6 42 | per_device_train_batch_size: 6 43 | push_to_hub: false 44 | report_to: 45 | - tensorboard 46 | reward_funcs: 47 | - format 48 | - cosine 49 | reward_weights: 50 | - 1.0 51 | - 2.0 52 | save_strategy: "steps" 53 | save_steps: 50 54 | seed: 42 55 | temperature: 0.7 56 | warmup_ratio: 0.1 -------------------------------------------------------------------------------- /open-rs/recipes/gpg_7B.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: models/DeepSeek-R1-Distill-Qwen-7B 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: datas/open-rs/open-s1 9 | system_prompt: "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer, and put your final answer within \\boxed{{}} . The reasoning process and answer are enclosed within and tags, respectively, i.e., reasoning process here answer here . Note that respond by English, NOT use other languages." 
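The `cosine` reward used by these open-rs recipes scales the correctness signal by completion length: short correct answers score highest and short wrong answers are penalised hardest, interpolated with a cosine schedule. A minimal sketch; the bounds and maximum length are assumptions for illustration, not the repo's exact defaults:

```python
import math

# Minimal sketch of a length-aware "cosine" reward. Bounds and max_len are
# assumed values, not the repo's defaults.
def cosine_reward(is_correct, gen_len, max_len=3584,
                  min_correct=0.5, max_correct=1.0,
                  min_wrong=-0.5, max_wrong=-0.1):
    progress = min(gen_len / max_len, 1.0)
    cos = math.cos(progress * math.pi)  # 1 at len=0, -1 at len=max_len
    if is_correct:
        lo, hi = min_correct, max_correct  # short correct -> max_correct
    else:
        lo, hi = max_wrong, min_wrong      # short wrong -> min_wrong (harshest)
    return lo + 0.5 * (hi - lo) * (1.0 + cos)

print(cosine_reward(True, 200), cosine_reward(True, 3500))   # correct: short > long
print(cosine_reward(False, 200), cosine_reward(False, 3500)) # wrong: short penalised more
```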
10 | 11 | # GRPO trainer config 12 | bf16: true 13 | use_vllm: false 14 | vllm_device: auto 15 | vllm_enforce_eager: true 16 | vllm_gpu_memory_utilization: 0.7 17 | vllm_max_model_len: 4608 18 | do_eval: false 19 | gradient_accumulation_steps: 4 20 | gradient_checkpointing: true 21 | gradient_checkpointing_kwargs: 22 | use_reentrant: false 23 | hub_model_id: OpenRS-GRPO 24 | hub_strategy: every_save 25 | learning_rate: 1.0e-06 26 | log_completions: true 27 | log_level: info 28 | logging_first_step: true 29 | logging_steps: 1 30 | logging_strategy: steps 31 | lr_scheduler_type: cosine_with_min_lr 32 | lr_scheduler_kwargs: 33 | min_lr_rate: 0.1 34 | max_prompt_length: 512 35 | max_completion_length: 3584 36 | max_steps: 500 37 | num_generations: 6 38 | num_train_epochs: 1 39 | output_dir: data/OpenRS-GPG-7B 40 | overwrite_output_dir: true 41 | per_device_eval_batch_size: 6 42 | per_device_train_batch_size: 6 43 | push_to_hub: false 44 | report_to: 45 | - tensorboard 46 | reward_funcs: 47 | - format 48 | - cosine 49 | reward_weights: 50 | - 1.0 51 | - 2.0 52 | save_strategy: "steps" 53 | save_steps: 50 54 | seed: 42 55 | temperature: 0.7 56 | warmup_ratio: 0.1 -------------------------------------------------------------------------------- /open-rs/recipes/gpg_std.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: models/DeepSeek-R1-Distill-Qwen-1.5B 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: datas/open-rs/open-s1 9 | system_prompt: "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer, and put your final answer within \\boxed{{}} . The reasoning process and answer are enclosed within and tags, respectively, i.e., reasoning process here answer here . Note that respond by English, NOT use other languages." 
10 | 11 | # GRPO trainer config 12 | bf16: true 13 | use_vllm: false 14 | vllm_device: auto 15 | vllm_enforce_eager: true 16 | vllm_gpu_memory_utilization: 0.7 17 | vllm_max_model_len: 4608 18 | do_eval: false 19 | gradient_accumulation_steps: 4 20 | gradient_checkpointing: true 21 | gradient_checkpointing_kwargs: 22 | use_reentrant: false 23 | hub_model_id: OpenRS-GRPO 24 | hub_strategy: every_save 25 | learning_rate: 1.0e-06 26 | log_completions: true 27 | log_level: info 28 | logging_first_step: true 29 | logging_steps: 1 30 | logging_strategy: steps 31 | lr_scheduler_type: cosine_with_min_lr 32 | lr_scheduler_kwargs: 33 | min_lr_rate: 0.1 34 | max_prompt_length: 512 35 | max_completion_length: 3584 36 | max_steps: 500 37 | num_generations: 6 38 | num_train_epochs: 1 39 | output_dir: data/OpenRS-GPG-std-new 40 | overwrite_output_dir: true 41 | per_device_eval_batch_size: 6 42 | per_device_train_batch_size: 6 43 | push_to_hub: false 44 | report_to: 45 | - tensorboard 46 | reward_funcs: 47 | - format 48 | - cosine 49 | reward_weights: 50 | - 1.0 51 | - 2.0 52 | save_strategy: "steps" 53 | save_steps: 50 54 | seed: 42 55 | temperature: 0.7 56 | warmup_ratio: 0.1 57 | scale_rewards: false -------------------------------------------------------------------------------- /open-rs/recipes/grpo.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | # model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B 3 | model_name_or_path: models/DeepSeek-R1-Distill-Qwen-1.5B 4 | model_revision: main 5 | torch_dtype: bfloat16 6 | attn_implementation: flash_attention_2 7 | 8 | # Data training arguments 9 | dataset_name: datas/open-rs/open-s1 10 | system_prompt: "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer, and put your final answer within \\boxed{{}} . The reasoning process and answer are enclosed within and tags, respectively, i.e., reasoning process here answer here . Note that respond by English, NOT use other languages." 
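These open-rs recipes keep `max_prompt_length + max_completion_length` within `vllm_max_model_len`, presumably leaving headroom for the system prompt and chat-template tokens. A quick consistency check of that budget:

```python
# Quick consistency check of the sequence-length budget shared by these recipes.
max_prompt_length, max_completion_length, vllm_max_model_len = 512, 3584, 4608
assert max_prompt_length + max_completion_length <= vllm_max_model_len
print(vllm_max_model_len - (max_prompt_length + max_completion_length))  # 512 tokens of headroom
```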
11 | 12 | # GRPO trainer config 13 | bf16: true 14 | use_vllm: false 15 | vllm_device: auto 16 | vllm_enforce_eager: true 17 | vllm_gpu_memory_utilization: 0.7 18 | vllm_max_model_len: 4608 19 | do_eval: false 20 | gradient_accumulation_steps: 4 21 | gradient_checkpointing: true 22 | gradient_checkpointing_kwargs: 23 | use_reentrant: false 24 | hub_model_id: OpenRS-GRPO 25 | hub_strategy: every_save 26 | learning_rate: 1.0e-06 27 | log_completions: true 28 | log_level: info 29 | logging_first_step: true 30 | logging_steps: 1 31 | logging_strategy: steps 32 | lr_scheduler_type: cosine_with_min_lr 33 | lr_scheduler_kwargs: 34 | min_lr_rate: 0.1 35 | max_prompt_length: 512 36 | max_completion_length: 3584 37 | max_steps: 500 38 | num_generations: 6 39 | num_train_epochs: 1 40 | output_dir: data/OpenRS-GRPO 41 | overwrite_output_dir: true 42 | per_device_eval_batch_size: 6 43 | per_device_train_batch_size: 6 44 | push_to_hub: false 45 | report_to: 46 | - tensorboard 47 | reward_funcs: 48 | - format 49 | - cosine 50 | reward_weights: 51 | - 1.0 52 | - 2.0 53 | save_strategy: "steps" 54 | save_steps: 50 55 | seed: 42 56 | temperature: 0.7 57 | warmup_ratio: 0.1 -------------------------------------------------------------------------------- /open-rs/recipes/grpo_7B.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: models/DeepSeek-R1-Distill-Qwen-7B 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: datas/open-rs/open-s1 9 | system_prompt: "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer, and put your final answer within \\boxed{{}} . The reasoning process and answer are enclosed within and tags, respectively, i.e., reasoning process here answer here . Note that respond by English, NOT use other languages." 
# GRPO trainer config
bf16: true
use_vllm: false
vllm_device: auto
vllm_enforce_eager: true
vllm_gpu_memory_utilization: 0.7
vllm_max_model_len: 4608
do_eval: false
gradient_accumulation_steps: 4
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
hub_model_id: OpenRS-GRPO
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: cosine_with_min_lr
lr_scheduler_kwargs:
  min_lr_rate: 0.1
max_prompt_length: 512
max_completion_length: 3584
max_steps: 500
num_generations: 6
num_train_epochs: 1
output_dir: data/OpenRS-GRPO-7B
overwrite_output_dir: true
per_device_eval_batch_size: 6
per_device_train_batch_size: 6
push_to_hub: false
report_to:
- tensorboard
reward_funcs:
- format
- cosine
reward_weights:
- 1.0
- 2.0
save_strategy: "steps"
save_steps: 50
seed: 42
temperature: 0.7
warmup_ratio: 0.1
--------------------------------------------------------------------------------
/open-rs/recipes/grpo_ng.yaml:
--------------------------------------------------------------------------------
# Model arguments
model_name_or_path: models/DeepSeek-R1-Distill-Qwen-1.5B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2

# Data training arguments
dataset_name: datas/open-rs/open-s1
system_prompt: "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer, and put your final answer within \\boxed{{}}. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think> <answer> answer here </answer>. Note that respond by English, NOT use other languages."
# GRPO trainer config
bf16: true
use_vllm: false
vllm_device: auto
vllm_enforce_eager: true
vllm_gpu_memory_utilization: 0.7
vllm_max_model_len: 4608
do_eval: false
gradient_accumulation_steps: 4
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
hub_model_id: OpenRS-GRPO
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: cosine_with_min_lr
lr_scheduler_kwargs:
  min_lr_rate: 0.1
max_prompt_length: 512
max_completion_length: 3584
max_steps: 100
num_generations: 2
num_train_epochs: 1
output_dir: data/OpenRS-GRPO-rs1-ng2
overwrite_output_dir: true
per_device_eval_batch_size: 18
per_device_train_batch_size: 18
push_to_hub: false
report_to:
- tensorboard
reward_funcs:
- format
- cosine
reward_weights:
- 1.0
- 2.0
save_strategy: "steps"
save_steps: 50
seed: 42
temperature: 0.7
warmup_ratio: 0.1
--------------------------------------------------------------------------------
/open-rs/recipes/grpo_wo_vllm.yaml:
--------------------------------------------------------------------------------
# Model arguments
model_name_or_path: models/DeepSeek-R1-Distill-Qwen-1.5B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2

# Data training arguments
dataset_name: datas/open-rs/open-s1
system_prompt: "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer, and put your final answer within \\boxed{{}}. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think> <answer> answer here </answer>. Note that respond by English, NOT use other languages."
# GRPO trainer config
bf16: true
use_vllm: false
vllm_device: auto
vllm_enforce_eager: true
vllm_gpu_memory_utilization: 0.7
vllm_max_model_len: 4608
do_eval: false
gradient_accumulation_steps: 4
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
hub_model_id: OpenRS-GRPO
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: cosine_with_min_lr
lr_scheduler_kwargs:
  min_lr_rate: 0.1
max_prompt_length: 512
max_completion_length: 3584
max_steps: 100
num_generations: 6
num_train_epochs: 1
output_dir: data/OpenRS-GRPO-rs3
overwrite_output_dir: true
per_device_eval_batch_size: 6
per_device_train_batch_size: 6
push_to_hub: false
report_to:
- tensorboard
reward_funcs:
- format
- cosine
reward_weights:
- 1.0
- 2.0
save_strategy: "steps"
save_steps: 50
seed: 42
temperature: 0.7
warmup_ratio: 0.1
--------------------------------------------------------------------------------
/open-rs/setup.cfg:
--------------------------------------------------------------------------------
[isort]
default_section = FIRSTPARTY
ensure_newline_before_comments = True
force_grid_wrap = 0
include_trailing_comma = True
known_first_party = open_r1
known_third_party =
    transformers
    datasets
    fugashi
    git
    h5py
    matplotlib
    nltk
    numpy
    packaging
    pandas
    psutil
    pytest
    rouge_score
    sacrebleu
    seqeval
    sklearn
    streamlit
    torch
    tqdm

line_length = 119
lines_after_imports = 2
multi_line_output = 3
use_parentheses = True

[flake8]
ignore = E203, E501, E741, W503, W605
max-line-length = 119
per-file-ignores =
    # imported but unused
    __init__.py: F401

[tool:pytest]
doctest_optionflags=NUMBER NORMALIZE_WHITESPACE ELLIPSIS
--------------------------------------------------------------------------------
/open-rs/src/open_r1/__init__.py:
--------------------------------------------------------------------------------
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
--------------------------------------------------------------------------------
/open-rs/src/open_r1/trainer/__init__.py:
--------------------------------------------------------------------------------
from .gpg_trainer import GPGTrainer
from .gpg_std_trainer import GPGSTDTrainer

__all__ = ["GPGTrainer", "GPGSTDTrainer"]
--------------------------------------------------------------------------------
/open-rs/src/open_r1/utils/__init__.py:
--------------------------------------------------------------------------------
from .import_utils import is_e2b_available
from .model_utils import get_tokenizer


__all__ = ["get_tokenizer", "is_e2b_available"]
--------------------------------------------------------------------------------
/open-rs/src/open_r1/utils/import_utils.py:
--------------------------------------------------------------------------------
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from transformers.utils.import_utils import _is_package_available


# Use same as transformers.utils.import_utils
_e2b_available = _is_package_available("e2b")


def is_e2b_available() -> bool:
    return _e2b_available
--------------------------------------------------------------------------------
/open-rs/src/open_r1/utils/model_utils.py:
--------------------------------------------------------------------------------
from transformers import AutoTokenizer, PreTrainedTokenizer

from trl import ModelConfig

from ..configs import GRPOConfig, SFTConfig


DEFAULT_CHAT_TEMPLATE = "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"


def get_tokenizer(
    model_args: ModelConfig, training_args: SFTConfig | GRPOConfig, auto_set_chat_template: bool = True
) -> PreTrainedTokenizer:
    """Get the tokenizer for the model."""
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.model_name_or_path,
        revision=model_args.model_revision,
        trust_remote_code=model_args.trust_remote_code,
    )

    if training_args.chat_template is not None:
        tokenizer.chat_template = training_args.chat_template
    elif auto_set_chat_template and tokenizer.get_chat_template() is None:
        tokenizer.chat_template = DEFAULT_CHAT_TEMPLATE

    return tokenizer
--------------------------------------------------------------------------------
/open-rs/src/open_r1/utils/wandb_logging.py:
--------------------------------------------------------------------------------
import os


def init_wandb_training(training_args):
    """
    Helper function for setting up Weights & Biases logging tools.
    """
    if training_args.wandb_entity is not None:
        os.environ["WANDB_ENTITY"] = training_args.wandb_entity
    if training_args.wandb_project is not None:
        os.environ["WANDB_PROJECT"] = training_args.wandb_project
--------------------------------------------------------------------------------
/open-rs/train.sh:
--------------------------------------------------------------------------------
ACCELERATE_LOG_LEVEL=info accelerate launch \
    --config_file recipes/accelerate_configs/zero2.yaml \
    --num_processes=16 \
    src/open_r1/gpg.py \
    --config recipes/gpg.yaml >> open-rs1-gpg.log 2>&1 &
--------------------------------------------------------------------------------
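Note on the recipes above: every config pairs the same two reward functions (`format` and `cosine`) with `reward_weights` of 1.0 and 2.0, and `train.sh` hands one of these YAML files to `src/open_r1/gpg.py` via `--config`. As a rough illustration of how those two fields interact, here is a minimal, self-contained sketch. It is not taken from this repository: `load_reward_config` and `combine_rewards` are hypothetical helper names, and it assumes PyYAML is installed.

```python
# Hypothetical sketch: weighted combination of per-function rewards,
# mirroring the reward_funcs / reward_weights fields in the recipes above.
# Helper names are illustrative only; requires PyYAML (yaml.safe_load).
import yaml


def load_reward_config(path: str) -> tuple[list[str], list[float]]:
    """Read reward_funcs and reward_weights from a recipe YAML file."""
    with open(path) as f:
        cfg = yaml.safe_load(f)
    return cfg["reward_funcs"], cfg["reward_weights"]


def combine_rewards(per_func: dict[str, float], funcs: list[str], weights: list[float]) -> float:
    """Weighted sum of the individual reward signals, taken in recipe order."""
    return sum(w * per_func[name] for name, w in zip(funcs, weights))


if __name__ == "__main__":
    funcs, weights = load_reward_config("recipes/grpo.yaml")
    # Example per-completion scores from the format and cosine rewards.
    print(combine_rewards({"format": 1.0, "cosine": 0.35}, funcs, weights))  # 1.0*1.0 + 2.0*0.35 = 1.7
```

With the weights used in these recipes, the cosine reward contributes twice as strongly to the scalar training reward as the format reward.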