├── .gitignore ├── experiments ├── unsloth │ ├── v1_7B_dev │ │ ├── .config.yaml.swp │ │ └── config.yaml │ └── v1_dev │ │ └── config.yaml ├── grammar │ ├── sample_output.txt │ ├── unit_test_grammar.txt │ ├── unit_test_grammar.txt.bak │ └── unit_test_grammar.txt_old ├── vllm_config.yaml ├── u3_sft_7B │ └── run.sh ├── u3_sft_1e-4 │ └── run.sh ├── u3_sft_1e-5 │ └── run.sh ├── u3_sft_1e-6 │ └── run.sh ├── u3_sft_3e-4 │ └── run.sh ├── u3_sft_3e-5 │ └── run.sh ├── u1_7B │ ├── eval_config.yaml │ ├── eval_50_config.yaml │ ├── eval_pi_config.yaml │ ├── eval_pi_50_config.yaml │ ├── eval_pi_train_50_config.yaml │ ├── eval_pi_train_config.yaml │ └── run.sh ├── u2_7B │ ├── eval_config.yaml │ ├── eval_50_config.yaml │ ├── eval_pi_config.yaml │ ├── eval_pi_50_config.yaml │ ├── eval_pi_train_50_config.yaml │ ├── eval_pi_train_config.yaml │ └── run.sh ├── u1_14B │ ├── eval_50_config.yaml │ ├── eval_config.yaml │ ├── eval_pi_config.yaml │ ├── eval_pi_50_config.yaml │ ├── eval_pi_train_50_config.yaml │ ├── eval_pi_train_config.yaml │ └── run.sh ├── u2_7B_3e-5 │ ├── eval_config.yaml │ ├── eval_50_config.yaml │ ├── eval_pi_config.yaml │ ├── eval_pi_50_config.yaml │ ├── eval_pi_train_50_config.yaml │ ├── eval_pi_train_config.yaml │ └── run.sh ├── u2_7B_gram │ ├── eval_config.yaml │ ├── eval_50_config.yaml │ ├── eval_pi_config.yaml │ ├── eval_pi_50_config.yaml │ ├── eval_pi_train_50_config.yaml │ ├── eval_pi_train_config.yaml │ └── run.sh ├── u4_7B_gram │ ├── eval_config.yaml │ ├── eval_50_config.yaml │ ├── eval_pi_config.yaml │ ├── eval_pi_50_config.yaml │ ├── eval_pi_train_50_config.yaml │ ├── eval_pi_train_config.yaml │ └── run.sh ├── u5_7B_gram │ ├── eval_config.yaml │ ├── eval_50_config.yaml │ ├── eval_pi_config.yaml │ ├── eval_pi_50_config.yaml │ ├── eval_pi_train_50_config.yaml │ ├── eval_pi_train_config.yaml │ └── run.sh ├── u6_7B_form │ ├── eval_config.yaml │ ├── eval_50_config.yaml │ ├── eval_pi_config.yaml │ ├── eval_pi_50_config.yaml │ ├── eval_pi_train_50_config.yaml │ ├── eval_pi_train_config.yaml │ └── run.sh ├── u6_7B_fs │ ├── eval_config.yaml │ ├── eval_50_config.yaml │ ├── eval_pi_50_config.yaml │ ├── eval_pi_config.yaml │ ├── eval_pi_train_config.yaml │ ├── eval_pi_train_50_config.yaml │ └── run.sh ├── u6_7B_gram │ ├── eval_config.yaml │ ├── eval_50_config.yaml │ ├── eval_pi_config.yaml │ ├── eval_pi_50_config.yaml │ ├── eval_pi_train_50_config.yaml │ ├── eval_pi_train_config.yaml │ └── run.sh ├── u6_7B_sft │ ├── eval_config.yaml │ ├── eval_50_config.yaml │ ├── eval_pi_50_config.yaml │ ├── eval_pi_config.yaml │ ├── eval_pi_train_config.yaml │ ├── eval_pi_train_50_config.yaml │ └── run.sh ├── u6_colo │ ├── eval_50_config.yaml │ ├── eval_config.yaml │ ├── eval_pi_config.yaml │ ├── eval_pi_50_config.yaml │ ├── eval_pi_train_config.yaml │ ├── eval_pi_train_50_config.yaml │ └── run.sh ├── u2_7B_base_sft │ ├── eval_config.yaml │ ├── eval_50_config.yaml │ ├── eval_pi_50_config.yaml │ ├── eval_pi_config.yaml │ ├── eval_pi_train_config.yaml │ ├── eval_pi_train_50_config.yaml │ └── run.sh └── u2_7B_few_shot │ ├── eval_config.yaml │ ├── eval_50_config.yaml │ ├── eval_pi_50_config.yaml │ ├── eval_pi_config.yaml │ ├── eval_pi_train_config.yaml │ ├── eval_pi_train_50_config.yaml │ └── run.sh ├── unit_tests_server ├── add_reference_code.ipynb ├── convert_hf_to_verl_dataset.py ├── check_grammar.py ├── test_unit_test.py ├── verl_unit.py ├── train_unsloth.py ├── convert_dataset_to_chat.ipynb └── unsloth_train_patch.py ├── README.md ├── check_grammar.py └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | unit_tests_server/__pycache__/ 3 | -------------------------------------------------------------------------------- /experiments/unsloth/v1_7B_dev/.config.yaml.swp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rosmineb/unit_test_rl/HEAD/experiments/unsloth/v1_7B_dev/.config.yaml.swp -------------------------------------------------------------------------------- /experiments/grammar/sample_output.txt: -------------------------------------------------------------------------------- 1 | Here is the Python code to solve the problem: 2 | 3 | ```python 4 | def sum(): 5 | a, b = map(int, input().split()) 6 | return a + b 7 | ``` 8 | ### Unit Tests 9 | 10 | 1 13 11 | -1 10<|im_end|> 12 | -------------------------------------------------------------------------------- /experiments/grammar/unit_test_grammar.txt: -------------------------------------------------------------------------------- 1 | root ::= intro codeBlock testBlock (end)* 2 | intro ::= [^`#]* 3 | newline ::= "\n" 4 | codeBlock ::= "```python" newline [^`]* "```" newline 5 | testBlock ::= testIntro (testLine)* 6 | testIntro ::= "### Unit Tests" newline newline 7 | notBracket ::= [^<]* 8 | testLine ::= "" notBracket "" notBracket "" (newline)* 9 | end ::= "<|im_end|>" 10 | 11 | -------------------------------------------------------------------------------- /experiments/grammar/unit_test_grammar.txt.bak: -------------------------------------------------------------------------------- 1 | root ::= intro codeBlock testBlock (end)* 2 | intro ::= [^`#]* 3 | newline ::= "\n" 4 | codeBlock ::= "```python" newline [^`#]* "```" newline 5 | testBlock ::= testIntro (testLine)* 6 | testIntro ::= "### Unit Tests" newline newline 7 | notBracket ::= [^<]* 8 | testLine ::= "" notBracket "" notBracket "" (newline)* 9 | end ::= "<|im_end|>" 10 | 11 | -------------------------------------------------------------------------------- /experiments/vllm_config.yaml: -------------------------------------------------------------------------------- 1 | label_field: "verification_info" 2 | base_model_name: "Qwen/Qwen2.5-Coder-7B-Instruct" 3 | device_groups: 4 | - 1 5 | - 2 6 | - 3 7 | - 4 8 | - 5 9 | - 6 10 | vllm_config: 11 | gpu_memory_utilization: 0.9 12 | max_model_len: 12288 13 | max_num_batched_tokens: 98304 14 | generation_num_copies: 1 15 | temperature: 1.0 16 | repetition_penalty: 0.0 17 | max_tokens: 12288 18 | tensor_parallel_size: 1 19 | batch_size: 200 20 | -------------------------------------------------------------------------------- /unit_tests_server/add_reference_code.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [] 9 | } 10 | ], 11 | "metadata": { 12 | "kernelspec": { 13 | "display_name": "transformers", 14 | "language": "python", 15 | "name": "python3" 16 | }, 17 | "language_info": { 18 | "name": "python", 19 | "version": "3.11.0" 20 | } 21 | }, 22 | "nbformat": 4, 23 | "nbformat_minor": 2 24 | } 25 | -------------------------------------------------------------------------------- /experiments/grammar/unit_test_grammar.txt_old: -------------------------------------------------------------------------------- 1 | root ::= intro codeBlock testBlock (end)* 2 | intro ::= "Here is the Python code to solve the problem:" newline newline 3 | newline ::= "\n" 4 | codeBlock ::= "```python\n" pythonBlock "```" newline newline 5 | pythonBlock ::= (pythonLine)* 6 | pythonLine ::= [^\n]* newline 7 | testBlock ::= testIntro (testLine)* 8 | testIntro ::= "### Unit Tests" newline newline 9 | testLine ::= "[^<]*[^<]*" (newline)* 10 | end ::= "<|im_end|>" 11 | 12 | -------------------------------------------------------------------------------- /experiments/unsloth/v1_dev/config.yaml: -------------------------------------------------------------------------------- 1 | model_name: "Qwen/Qwen2.5-Coder-1.5B-Instruct" 2 | load_in_4bit: false 3 | load_in_8bit: false 4 | max_seq_length: 4096 5 | lora_rank: 64 6 | lora_alpha: 128 7 | learning_rate: 1.0e-5 8 | num_generations: 2 9 | gradient_accumulation_steps: 4 10 | max_prompt_length: 2048 11 | max_steps: 1000 12 | save_steps: 5 13 | output_dir: /home/rosmine/data2/rl/v1_dev 14 | vllm_mode: "colocate" 15 | dataset_path: /home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_with_unit_tests_v2_train_with_reference_no_examples_chat 16 | per_device_train_batch_size: 2 17 | generation_batch_size: 24 18 | reward_function_path: "./unit_tests_server/verl_unit.py" 19 | reward_function_name: "hf_reward_fn" 20 | -------------------------------------------------------------------------------- /experiments/unsloth/v1_7B_dev/config.yaml: -------------------------------------------------------------------------------- 1 | model_name: "Qwen/Qwen2.5-Coder-7B-Instruct" 2 | load_in_4bit: true 3 | load_in_8bit: false 4 | max_seq_length: 2560 5 | lora_rank: 64 6 | lora_alpha: 128 7 | learning_rate: 1.0e-5 8 | num_generations: 16 9 | #generation_batch_size: 512 10 | steps_per_generation: 4 11 | gradient_accumulation_steps: 4 12 | max_prompt_length: 2048 13 | max_steps: 1000 14 | save_steps: 5 15 | output_dir: /home/rosmine/data2/rl/v1_dev 16 | vllm_mode: "colocate" 17 | dataset_path: /home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_with_unit_tests_v2_train_with_reference_no_examples_chat 18 | per_device_train_batch_size: 16 19 | reward_function_path: "./unit_tests_server/verl_unit.py" 20 | reward_function_name: "hf_reward_fn" 21 | -------------------------------------------------------------------------------- /experiments/u3_sft_7B/run.sh: -------------------------------------------------------------------------------- 1 | exp_name=u3_sft_7B 2 | 3 | DATA_DIR=/home/rosmine/data2/rl/${exp_name} 4 | 5 | deepspeed --module openrlhf.cli.train_sft \ 6 | --max_len 2048 \ 7 | --dataset /home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_with_unit_tests_v2_sft_formatted \ 8 | --input_key prompt \ 9 | --output_key response \ 10 | --train_batch_size 48 \ 11 | --apply_chat_template \ 12 | --micro_train_batch_size 2 \ 13 | --max_samples 500000 \ 14 | --pretrain Qwen/Qwen2.5-Coder-7B-Instruct \ 15 | --save_path /home/rosmine/data2/rl/${exp_name}/checkpoint \ 16 | --save_steps 5 \ 17 | --logging_steps 1 \ 18 | --eval_steps -1 \ 19 | --zero_stage 2 \ 20 | --max_epochs 3 \ 21 | --bf16 \ 22 | --gradient_checkpointing \ 23 | --flash_attn \ 24 | --learning_rate 3e-6 \ 25 | --lora_rank 64 \ 26 | --lora_alpha 128 \ 27 | --use_tensorboard ${DATA_DIR}/logs/${exp_name} \ 28 | 29 | -------------------------------------------------------------------------------- /experiments/u3_sft_1e-4/run.sh: -------------------------------------------------------------------------------- 1 | exp_name=u3_sft_1e-4 2 | 3 | DATA_DIR=/home/rosmine/data2/rl/${exp_name} 4 | 5 | deepspeed --module openrlhf.cli.train_sft \ 6 | --max_len 2048 \ 7 | --dataset /home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_with_unit_tests_v2_sft_formatted \ 8 | --input_key prompt \ 9 | --output_key response \ 10 | --train_batch_size 48 \ 11 | --apply_chat_template \ 12 | --micro_train_batch_size 2 \ 13 | --max_samples 500000 \ 14 | --pretrain Qwen/Qwen2.5-Coder-7B-Instruct \ 15 | --save_path /home/rosmine/data2/rl/${exp_name}/checkpoint \ 16 | --save_steps 5 \ 17 | --logging_steps 1 \ 18 | --eval_steps -1 \ 19 | --zero_stage 2 \ 20 | --max_epochs 3 \ 21 | --bf16 \ 22 | --gradient_checkpointing \ 23 | --flash_attn \ 24 | --learning_rate 1e-4 \ 25 | --lora_rank 64 \ 26 | --lora_alpha 128 \ 27 | --use_tensorboard ${DATA_DIR}/logs/${exp_name} \ 28 | 29 | -------------------------------------------------------------------------------- /experiments/u3_sft_1e-5/run.sh: -------------------------------------------------------------------------------- 1 | exp_name=u3_sft_1e-5 2 | 3 | DATA_DIR=/home/rosmine/data2/rl/${exp_name} 4 | 5 | deepspeed --module openrlhf.cli.train_sft \ 6 | --max_len 2048 \ 7 | --dataset /home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_with_unit_tests_v2_sft_formatted \ 8 | --input_key prompt \ 9 | --output_key response \ 10 | --train_batch_size 48 \ 11 | --apply_chat_template \ 12 | --micro_train_batch_size 2 \ 13 | --max_samples 500000 \ 14 | --pretrain Qwen/Qwen2.5-Coder-7B-Instruct \ 15 | --save_path /home/rosmine/data2/rl/${exp_name}/checkpoint \ 16 | --save_steps 5 \ 17 | --logging_steps 1 \ 18 | --eval_steps -1 \ 19 | --zero_stage 2 \ 20 | --max_epochs 3 \ 21 | --bf16 \ 22 | --gradient_checkpointing \ 23 | --flash_attn \ 24 | --learning_rate 1e-5 \ 25 | --lora_rank 64 \ 26 | --lora_alpha 128 \ 27 | --use_tensorboard ${DATA_DIR}/logs/${exp_name} \ 28 | 29 | -------------------------------------------------------------------------------- /experiments/u3_sft_1e-6/run.sh: -------------------------------------------------------------------------------- 1 | exp_name=u3_sft_1e-6 2 | 3 | DATA_DIR=/home/rosmine/data2/rl/${exp_name} 4 | 5 | deepspeed --module openrlhf.cli.train_sft \ 6 | --max_len 2048 \ 7 | --dataset /home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_with_unit_tests_v2_sft_formatted \ 8 | --input_key prompt \ 9 | --output_key response \ 10 | --train_batch_size 48 \ 11 | --apply_chat_template \ 12 | --micro_train_batch_size 2 \ 13 | --max_samples 500000 \ 14 | --pretrain Qwen/Qwen2.5-Coder-7B-Instruct \ 15 | --save_path /home/rosmine/data2/rl/${exp_name}/checkpoint \ 16 | --save_steps 5 \ 17 | --logging_steps 1 \ 18 | --eval_steps -1 \ 19 | --zero_stage 2 \ 20 | --max_epochs 3 \ 21 | --bf16 \ 22 | --gradient_checkpointing \ 23 | --flash_attn \ 24 | --learning_rate 1e-6 \ 25 | --lora_rank 64 \ 26 | --lora_alpha 128 \ 27 | --use_tensorboard ${DATA_DIR}/logs/${exp_name} \ 28 | 29 | -------------------------------------------------------------------------------- /experiments/u3_sft_3e-4/run.sh: -------------------------------------------------------------------------------- 1 | exp_name=u3_sft_3e-4 2 | 3 | DATA_DIR=/home/rosmine/data2/rl/${exp_name} 4 | 5 | deepspeed --module openrlhf.cli.train_sft \ 6 | --max_len 2048 \ 7 | --dataset /home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_with_unit_tests_v2_sft_formatted \ 8 | --input_key prompt \ 9 | --output_key response \ 10 | --train_batch_size 48 \ 11 | --apply_chat_template \ 12 | --micro_train_batch_size 2 \ 13 | --max_samples 500000 \ 14 | --pretrain Qwen/Qwen2.5-Coder-7B-Instruct \ 15 | --save_path /home/rosmine/data2/rl/${exp_name}/checkpoint \ 16 | --save_steps 5 \ 17 | --logging_steps 1 \ 18 | --eval_steps -1 \ 19 | --zero_stage 2 \ 20 | --max_epochs 3 \ 21 | --bf16 \ 22 | --gradient_checkpointing \ 23 | --flash_attn \ 24 | --learning_rate 3e-4 \ 25 | --lora_rank 64 \ 26 | --lora_alpha 128 \ 27 | --use_tensorboard ${DATA_DIR}/logs/${exp_name} \ 28 | 29 | -------------------------------------------------------------------------------- /experiments/u3_sft_3e-5/run.sh: -------------------------------------------------------------------------------- 1 | exp_name=u3_sft_3e-5 2 | 3 | DATA_DIR=/home/rosmine/data2/rl/${exp_name} 4 | 5 | deepspeed --module openrlhf.cli.train_sft \ 6 | --max_len 2048 \ 7 | --dataset /home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_with_unit_tests_v2_sft_formatted \ 8 | --input_key prompt \ 9 | --output_key response \ 10 | --train_batch_size 48 \ 11 | --apply_chat_template \ 12 | --micro_train_batch_size 2 \ 13 | --max_samples 500000 \ 14 | --pretrain Qwen/Qwen2.5-Coder-7B-Instruct \ 15 | --save_path /home/rosmine/data2/rl/${exp_name}/checkpoint \ 16 | --save_steps 5 \ 17 | --logging_steps 1 \ 18 | --eval_steps -1 \ 19 | --zero_stage 2 \ 20 | --max_epochs 3 \ 21 | --bf16 \ 22 | --gradient_checkpointing \ 23 | --flash_attn \ 24 | --learning_rate 3e-5 \ 25 | --lora_rank 64 \ 26 | --lora_alpha 128 \ 27 | --use_tensorboard ${DATA_DIR}/logs/${exp_name} \ 28 | 29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # unit_test_rl 2 | Project code for training LLMs to write better unit tests + code 3 | 4 | Warning: This is experiment code and isn't necessarily production ready or easy to use (E.g. There may be hardcoded paths to local data files). I'm just sharing some experiments that I thought were fun. 5 | 6 | You may need this fork of OpenRLHF, I made some modifications to support guided decoding and pass arguments to the reward model: https://github.com/rosmineb/OpenRLHF branch custom_changes_main 7 | 8 | To run: 9 | 10 | 1. Start the reward server, `python unit_tests_server/unit_test_reward_server.py --workers 8 --port 5432` 11 | 2. Call the run.sh experiment `source experiments/u6_7B_form/run.sh` 12 | 13 | These experiments support using a grammar for guided decoding. See u6_7B_gram experiment. You can check that a grammar parses an output with the following code (but a warning: using an SFT'd model is much better than using a grammar and potentially less work) 14 | 15 | `python unit_tests_server/check_grammar.py --grammar_file experiments/unit_test/grammar/unit_test_grammar.txt --string_file experiments/unit_test/grammar/sample_output.txt --model_id Qwen/Qwen2.5-Coder-7B-Instruct` 16 | -------------------------------------------------------------------------------- /experiments/u1_7B/eval_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/humaneval" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | prompt_field: "prompt" 10 | label_field: "labels" 11 | debug: false 12 | reward_server_host: "0.0.0.0" 13 | reward_server_port: 5000 14 | response_begin_str: null 15 | code_start_str: "```python\n" 16 | code_end_str: "```" 17 | device_groups: 18 | - "cuda:0" 19 | - "cuda:1" 20 | - "cuda:2" 21 | - "cuda:3" 22 | - "cuda:4" 23 | - "cuda:5" 24 | code_format: "humaneval" 25 | success_threshold: 0 26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 27 | vllm_port: 8000 28 | max_new_tokens: 10240 29 | vllm_config: 30 | gpu_memory_utilization: 0.9 31 | max_model_len: 12288 32 | max_num_batched_tokens: 98304 33 | generation_num_copies: 1 34 | temperature: 0.5 35 | repetition_penalty: 0.0 36 | max_tokens: 12288 37 | tensor_parallel_size: 1 38 | 39 | -------------------------------------------------------------------------------- /experiments/u2_7B/eval_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/humaneval" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | prompt_field: "prompt" 10 | label_field: "labels" 11 | debug: false 12 | reward_server_host: "0.0.0.0" 13 | reward_server_port: 5000 14 | response_begin_str: null 15 | code_start_str: "```python\n" 16 | code_end_str: "```" 17 | device_groups: 18 | - "cuda:0" 19 | - "cuda:1" 20 | - "cuda:2" 21 | - "cuda:3" 22 | - "cuda:4" 23 | - "cuda:5" 24 | code_format: "humaneval" 25 | success_threshold: 0 26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 27 | vllm_port: 8000 28 | max_new_tokens: 10240 29 | vllm_config: 30 | gpu_memory_utilization: 0.9 31 | max_model_len: 12288 32 | max_num_batched_tokens: 98304 33 | generation_num_copies: 1 34 | temperature: 0.5 35 | repetition_penalty: 0.0 36 | max_tokens: 12288 37 | tensor_parallel_size: 1 38 | 39 | -------------------------------------------------------------------------------- /experiments/u1_14B/eval_50_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/humaneval" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | prompt_field: "prompt" 10 | label_field: "labels" 11 | debug: false 12 | reward_server_host: "0.0.0.0" 13 | reward_server_port: 5000 14 | response_begin_str: null 15 | code_start_str: "```python\n" 16 | code_end_str: "```" 17 | device_groups: 18 | - "cuda:0" 19 | - "cuda:1" 20 | - "cuda:2" 21 | - "cuda:3" 22 | - "cuda:4" 23 | - "cuda:5" 24 | code_format: "humaneval" 25 | success_threshold: 0 26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 27 | vllm_port: 8000 28 | max_new_tokens: 10240 29 | vllm_config: 30 | gpu_memory_utilization: 0.9 31 | max_model_len: 12288 32 | max_num_batched_tokens: 98304 33 | generation_num_copies: 1 34 | temperature: 0.5 35 | repetition_penalty: 0.0 36 | max_tokens: 12288 37 | tensor_parallel_size: 1 38 | 39 | -------------------------------------------------------------------------------- /experiments/u1_14B/eval_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/humaneval" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | prompt_field: "prompt" 10 | label_field: "labels" 11 | debug: false 12 | reward_server_host: "0.0.0.0" 13 | reward_server_port: 5000 14 | response_begin_str: null 15 | code_start_str: "```python\n" 16 | code_end_str: "```" 17 | device_groups: 18 | - "cuda:0" 19 | - "cuda:1" 20 | - "cuda:2" 21 | - "cuda:3" 22 | - "cuda:4" 23 | - "cuda:5" 24 | code_format: "humaneval" 25 | success_threshold: 0 26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 27 | vllm_port: 8000 28 | max_new_tokens: 10240 29 | vllm_config: 30 | gpu_memory_utilization: 0.9 31 | max_model_len: 12288 32 | max_num_batched_tokens: 98304 33 | generation_num_copies: 1 34 | temperature: 0.5 35 | repetition_penalty: 0.0 36 | max_tokens: 12288 37 | tensor_parallel_size: 1 38 | 39 | -------------------------------------------------------------------------------- /experiments/u1_7B/eval_50_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/humaneval" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | prompt_field: "prompt" 10 | label_field: "labels" 11 | debug: false 12 | reward_server_host: "0.0.0.0" 13 | reward_server_port: 5000 14 | response_begin_str: null 15 | code_start_str: "```python\n" 16 | code_end_str: "```" 17 | device_groups: 18 | - "cuda:0" 19 | - "cuda:1" 20 | - "cuda:2" 21 | - "cuda:3" 22 | - "cuda:4" 23 | - "cuda:5" 24 | code_format: "humaneval" 25 | success_threshold: 0 26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 27 | vllm_port: 8000 28 | max_new_tokens: 10240 29 | vllm_config: 30 | gpu_memory_utilization: 0.9 31 | max_model_len: 12288 32 | max_num_batched_tokens: 98304 33 | generation_num_copies: 1 34 | temperature: 0.5 35 | repetition_penalty: 0.0 36 | max_tokens: 12288 37 | tensor_parallel_size: 1 38 | 39 | -------------------------------------------------------------------------------- /experiments/u2_7B/eval_50_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/humaneval" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | prompt_field: "prompt" 10 | label_field: "labels" 11 | debug: false 12 | reward_server_host: "0.0.0.0" 13 | reward_server_port: 5000 14 | response_begin_str: null 15 | code_start_str: "```python\n" 16 | code_end_str: "```" 17 | device_groups: 18 | - "cuda:0" 19 | - "cuda:1" 20 | - "cuda:2" 21 | - "cuda:3" 22 | - "cuda:4" 23 | - "cuda:5" 24 | code_format: "humaneval" 25 | success_threshold: 0 26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 27 | vllm_port: 8000 28 | max_new_tokens: 10240 29 | vllm_config: 30 | gpu_memory_utilization: 0.9 31 | max_model_len: 12288 32 | max_num_batched_tokens: 98304 33 | generation_num_copies: 1 34 | temperature: 0.5 35 | repetition_penalty: 0.0 36 | max_tokens: 12288 37 | tensor_parallel_size: 1 38 | 39 | -------------------------------------------------------------------------------- /experiments/u2_7B_3e-5/eval_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/humaneval" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | prompt_field: "prompt" 10 | label_field: "labels" 11 | debug: false 12 | reward_server_host: "0.0.0.0" 13 | reward_server_port: 5000 14 | response_begin_str: null 15 | code_start_str: "```python\n" 16 | code_end_str: "```" 17 | device_groups: 18 | - "cuda:0" 19 | - "cuda:1" 20 | - "cuda:2" 21 | - "cuda:3" 22 | - "cuda:4" 23 | - "cuda:5" 24 | code_format: "humaneval" 25 | success_threshold: 0 26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 27 | vllm_port: 8000 28 | max_new_tokens: 10240 29 | vllm_config: 30 | gpu_memory_utilization: 0.9 31 | max_model_len: 12288 32 | max_num_batched_tokens: 98304 33 | generation_num_copies: 1 34 | temperature: 0.5 35 | repetition_penalty: 0.0 36 | max_tokens: 12288 37 | tensor_parallel_size: 1 38 | 39 | -------------------------------------------------------------------------------- /experiments/u2_7B_gram/eval_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/humaneval" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | prompt_field: "prompt" 10 | label_field: "labels" 11 | debug: false 12 | reward_server_host: "0.0.0.0" 13 | reward_server_port: 5000 14 | response_begin_str: null 15 | code_start_str: "```python\n" 16 | code_end_str: "```" 17 | device_groups: 18 | - "cuda:0" 19 | - "cuda:1" 20 | - "cuda:2" 21 | - "cuda:3" 22 | - "cuda:4" 23 | - "cuda:5" 24 | code_format: "humaneval" 25 | success_threshold: 0 26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 27 | vllm_port: 8000 28 | max_new_tokens: 10240 29 | vllm_config: 30 | gpu_memory_utilization: 0.9 31 | max_model_len: 12288 32 | max_num_batched_tokens: 98304 33 | generation_num_copies: 1 34 | temperature: 0.5 35 | repetition_penalty: 0.0 36 | max_tokens: 12288 37 | tensor_parallel_size: 1 38 | 39 | -------------------------------------------------------------------------------- /experiments/u4_7B_gram/eval_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/humaneval" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | prompt_field: "prompt" 10 | label_field: "labels" 11 | debug: false 12 | reward_server_host: "0.0.0.0" 13 | reward_server_port: 5000 14 | response_begin_str: null 15 | code_start_str: "```python\n" 16 | code_end_str: "```" 17 | device_groups: 18 | - "cuda:0" 19 | - "cuda:1" 20 | - "cuda:2" 21 | - "cuda:3" 22 | - "cuda:4" 23 | - "cuda:5" 24 | code_format: "humaneval" 25 | success_threshold: 0 26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 27 | vllm_port: 8000 28 | max_new_tokens: 10240 29 | vllm_config: 30 | gpu_memory_utilization: 0.9 31 | max_model_len: 12288 32 | max_num_batched_tokens: 98304 33 | generation_num_copies: 1 34 | temperature: 0.5 35 | repetition_penalty: 0.0 36 | max_tokens: 12288 37 | tensor_parallel_size: 1 38 | 39 | -------------------------------------------------------------------------------- /experiments/u5_7B_gram/eval_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/humaneval" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | prompt_field: "prompt" 10 | label_field: "labels" 11 | debug: false 12 | reward_server_host: "0.0.0.0" 13 | reward_server_port: 5000 14 | response_begin_str: null 15 | code_start_str: "```python\n" 16 | code_end_str: "```" 17 | device_groups: 18 | - "cuda:0" 19 | - "cuda:1" 20 | - "cuda:2" 21 | - "cuda:3" 22 | - "cuda:4" 23 | - "cuda:5" 24 | code_format: "humaneval" 25 | success_threshold: 0 26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 27 | vllm_port: 8000 28 | max_new_tokens: 10240 29 | vllm_config: 30 | gpu_memory_utilization: 0.9 31 | max_model_len: 12288 32 | max_num_batched_tokens: 98304 33 | generation_num_copies: 1 34 | temperature: 0.5 35 | repetition_penalty: 0.0 36 | max_tokens: 12288 37 | tensor_parallel_size: 1 38 | 39 | -------------------------------------------------------------------------------- /experiments/u6_7B_form/eval_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/humaneval" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | prompt_field: "prompt" 10 | label_field: "labels" 11 | debug: false 12 | reward_server_host: "0.0.0.0" 13 | reward_server_port: 5000 14 | response_begin_str: null 15 | code_start_str: "```python\n" 16 | code_end_str: "```" 17 | device_groups: 18 | - "cuda:0" 19 | - "cuda:1" 20 | - "cuda:2" 21 | - "cuda:3" 22 | - "cuda:4" 23 | - "cuda:5" 24 | code_format: "humaneval" 25 | success_threshold: 0 26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 27 | vllm_port: 8000 28 | max_new_tokens: 10240 29 | vllm_config: 30 | gpu_memory_utilization: 0.9 31 | max_model_len: 12288 32 | max_num_batched_tokens: 98304 33 | generation_num_copies: 1 34 | temperature: 0.5 35 | repetition_penalty: 0.0 36 | max_tokens: 12288 37 | tensor_parallel_size: 1 38 | 39 | -------------------------------------------------------------------------------- /experiments/u6_7B_fs/eval_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/humaneval" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | prompt_field: "prompt" 10 | label_field: "labels" 11 | debug: false 12 | reward_server_host: "0.0.0.0" 13 | reward_server_port: 5000 14 | response_begin_str: null 15 | code_start_str: "```python\n" 16 | code_end_str: "```" 17 | device_groups: 18 | - "cuda:0" 19 | - "cuda:1" 20 | - "cuda:2" 21 | - "cuda:3" 22 | - "cuda:4" 23 | - "cuda:5" 24 | code_format: "humaneval" 25 | success_threshold: 0 26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 27 | vllm_port: 8000 28 | max_new_tokens: 10240 29 | vllm_config: 30 | gpu_memory_utilization: 0.9 31 | max_model_len: 12288 32 | max_num_batched_tokens: 98304 33 | generation_num_copies: 1 34 | temperature: 0.5 35 | repetition_penalty: 0.0 36 | max_tokens: 12288 37 | tensor_parallel_size: 1 38 | 39 | -------------------------------------------------------------------------------- /experiments/u6_7B_gram/eval_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/humaneval" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | prompt_field: "prompt" 10 | label_field: "labels" 11 | debug: false 12 | reward_server_host: "0.0.0.0" 13 | reward_server_port: 5000 14 | response_begin_str: null 15 | code_start_str: "```python\n" 16 | code_end_str: "```" 17 | device_groups: 18 | - "cuda:0" 19 | - "cuda:1" 20 | - "cuda:2" 21 | - "cuda:3" 22 | - "cuda:4" 23 | - "cuda:5" 24 | code_format: "humaneval" 25 | success_threshold: 0 26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 27 | vllm_port: 8000 28 | max_new_tokens: 10240 29 | vllm_config: 30 | gpu_memory_utilization: 0.9 31 | max_model_len: 12288 32 | max_num_batched_tokens: 98304 33 | generation_num_copies: 1 34 | temperature: 0.5 35 | repetition_penalty: 0.0 36 | max_tokens: 12288 37 | tensor_parallel_size: 1 38 | 39 | -------------------------------------------------------------------------------- /experiments/u6_7B_sft/eval_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/humaneval" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | prompt_field: "prompt" 10 | label_field: "labels" 11 | debug: false 12 | reward_server_host: "0.0.0.0" 13 | reward_server_port: 5000 14 | response_begin_str: null 15 | code_start_str: "```python\n" 16 | code_end_str: "```" 17 | device_groups: 18 | - "cuda:0" 19 | - "cuda:1" 20 | - "cuda:2" 21 | - "cuda:3" 22 | - "cuda:4" 23 | - "cuda:5" 24 | code_format: "humaneval" 25 | success_threshold: 0 26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 27 | vllm_port: 8000 28 | max_new_tokens: 10240 29 | vllm_config: 30 | gpu_memory_utilization: 0.9 31 | max_model_len: 12288 32 | max_num_batched_tokens: 98304 33 | generation_num_copies: 1 34 | temperature: 0.5 35 | repetition_penalty: 0.0 36 | max_tokens: 12288 37 | tensor_parallel_size: 1 38 | 39 | -------------------------------------------------------------------------------- /experiments/u6_colo/eval_50_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/humaneval" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | prompt_field: "prompt" 10 | label_field: "labels" 11 | debug: false 12 | reward_server_host: "0.0.0.0" 13 | reward_server_port: 5000 14 | response_begin_str: null 15 | code_start_str: "```python\n" 16 | code_end_str: "```" 17 | device_groups: 18 | - "cuda:0" 19 | - "cuda:1" 20 | - "cuda:2" 21 | - "cuda:3" 22 | - "cuda:4" 23 | - "cuda:5" 24 | code_format: "humaneval" 25 | success_threshold: 0 26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 27 | vllm_port: 8000 28 | max_new_tokens: 10240 29 | vllm_config: 30 | gpu_memory_utilization: 0.9 31 | max_model_len: 12288 32 | max_num_batched_tokens: 98304 33 | generation_num_copies: 1 34 | temperature: 0.5 35 | repetition_penalty: 0.0 36 | max_tokens: 12288 37 | tensor_parallel_size: 1 38 | 39 | -------------------------------------------------------------------------------- /experiments/u6_colo/eval_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/humaneval" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | prompt_field: "prompt" 10 | label_field: "labels" 11 | debug: false 12 | reward_server_host: "0.0.0.0" 13 | reward_server_port: 5000 14 | response_begin_str: null 15 | code_start_str: "```python\n" 16 | code_end_str: "```" 17 | device_groups: 18 | - "cuda:0" 19 | - "cuda:1" 20 | - "cuda:2" 21 | - "cuda:3" 22 | - "cuda:4" 23 | - "cuda:5" 24 | code_format: "humaneval" 25 | success_threshold: 0 26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 27 | vllm_port: 8000 28 | max_new_tokens: 10240 29 | vllm_config: 30 | gpu_memory_utilization: 0.9 31 | max_model_len: 12288 32 | max_num_batched_tokens: 98304 33 | generation_num_copies: 1 34 | temperature: 0.5 35 | repetition_penalty: 0.0 36 | max_tokens: 12288 37 | tensor_parallel_size: 1 38 | 39 | -------------------------------------------------------------------------------- /experiments/u2_7B_3e-5/eval_50_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/humaneval" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | prompt_field: "prompt" 10 | label_field: "labels" 11 | debug: false 12 | reward_server_host: "0.0.0.0" 13 | reward_server_port: 5000 14 | response_begin_str: null 15 | code_start_str: "```python\n" 16 | code_end_str: "```" 17 | device_groups: 18 | - "cuda:0" 19 | - "cuda:1" 20 | - "cuda:2" 21 | - "cuda:3" 22 | - "cuda:4" 23 | - "cuda:5" 24 | code_format: "humaneval" 25 | success_threshold: 0 26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 27 | vllm_port: 8000 28 | max_new_tokens: 10240 29 | vllm_config: 30 | gpu_memory_utilization: 0.9 31 | max_model_len: 12288 32 | max_num_batched_tokens: 98304 33 | generation_num_copies: 1 34 | temperature: 0.5 35 | repetition_penalty: 0.0 36 | max_tokens: 12288 37 | tensor_parallel_size: 1 38 | 39 | -------------------------------------------------------------------------------- /experiments/u2_7B_base_sft/eval_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/humaneval" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | prompt_field: "prompt" 10 | label_field: "labels" 11 | debug: false 12 | reward_server_host: "0.0.0.0" 13 | reward_server_port: 5000 14 | response_begin_str: null 15 | code_start_str: "```python\n" 16 | code_end_str: "```" 17 | device_groups: 18 | - "cuda:0" 19 | - "cuda:1" 20 | - "cuda:2" 21 | - "cuda:3" 22 | - "cuda:4" 23 | - "cuda:5" 24 | code_format: "humaneval" 25 | success_threshold: 0 26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 27 | vllm_port: 8000 28 | max_new_tokens: 10240 29 | vllm_config: 30 | gpu_memory_utilization: 0.9 31 | max_model_len: 12288 32 | max_num_batched_tokens: 98304 33 | generation_num_copies: 1 34 | temperature: 0.5 35 | repetition_penalty: 0.0 36 | max_tokens: 12288 37 | tensor_parallel_size: 1 38 | 39 | -------------------------------------------------------------------------------- /experiments/u2_7B_few_shot/eval_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/humaneval" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | prompt_field: "prompt" 10 | label_field: "labels" 11 | debug: false 12 | reward_server_host: "0.0.0.0" 13 | reward_server_port: 5000 14 | response_begin_str: null 15 | code_start_str: "```python\n" 16 | code_end_str: "```" 17 | device_groups: 18 | - "cuda:0" 19 | - "cuda:1" 20 | - "cuda:2" 21 | - "cuda:3" 22 | - "cuda:4" 23 | - "cuda:5" 24 | code_format: "humaneval" 25 | success_threshold: 0 26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 27 | vllm_port: 8000 28 | max_new_tokens: 10240 29 | vllm_config: 30 | gpu_memory_utilization: 0.9 31 | max_model_len: 12288 32 | max_num_batched_tokens: 98304 33 | generation_num_copies: 1 34 | temperature: 0.5 35 | repetition_penalty: 0.0 36 | max_tokens: 12288 37 | tensor_parallel_size: 1 38 | 39 | -------------------------------------------------------------------------------- /experiments/u2_7B_gram/eval_50_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/humaneval" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | prompt_field: "prompt" 10 | label_field: "labels" 11 | debug: false 12 | reward_server_host: "0.0.0.0" 13 | reward_server_port: 5000 14 | response_begin_str: null 15 | code_start_str: "```python\n" 16 | code_end_str: "```" 17 | device_groups: 18 | - "cuda:0" 19 | - "cuda:1" 20 | - "cuda:2" 21 | - "cuda:3" 22 | - "cuda:4" 23 | - "cuda:5" 24 | code_format: "humaneval" 25 | success_threshold: 0 26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 27 | vllm_port: 8000 28 | max_new_tokens: 10240 29 | vllm_config: 30 | gpu_memory_utilization: 0.9 31 | max_model_len: 12288 32 | max_num_batched_tokens: 98304 33 | generation_num_copies: 1 34 | temperature: 0.5 35 | repetition_penalty: 0.0 36 | max_tokens: 12288 37 | tensor_parallel_size: 1 38 | 39 | -------------------------------------------------------------------------------- /experiments/u4_7B_gram/eval_50_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/humaneval" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | prompt_field: "prompt" 10 | label_field: "labels" 11 | debug: false 12 | reward_server_host: "0.0.0.0" 13 | reward_server_port: 5000 14 | response_begin_str: null 15 | code_start_str: "```python\n" 16 | code_end_str: "```" 17 | device_groups: 18 | - "cuda:0" 19 | - "cuda:1" 20 | - "cuda:2" 21 | - "cuda:3" 22 | - "cuda:4" 23 | - "cuda:5" 24 | code_format: "humaneval" 25 | success_threshold: 0 26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 27 | vllm_port: 8000 28 | max_new_tokens: 10240 29 | vllm_config: 30 | gpu_memory_utilization: 0.9 31 | max_model_len: 12288 32 | max_num_batched_tokens: 98304 33 | generation_num_copies: 1 34 | temperature: 0.5 35 | repetition_penalty: 0.0 36 | max_tokens: 12288 37 | tensor_parallel_size: 1 38 | 39 | -------------------------------------------------------------------------------- /experiments/u5_7B_gram/eval_50_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/humaneval" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | prompt_field: "prompt" 10 | label_field: "labels" 11 | debug: false 12 | reward_server_host: "0.0.0.0" 13 | reward_server_port: 5000 14 | response_begin_str: null 15 | code_start_str: "```python\n" 16 | code_end_str: "```" 17 | device_groups: 18 | - "cuda:0" 19 | - "cuda:1" 20 | - "cuda:2" 21 | - "cuda:3" 22 | - "cuda:4" 23 | - "cuda:5" 24 | code_format: "humaneval" 25 | success_threshold: 0 26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 27 | vllm_port: 8000 28 | max_new_tokens: 10240 29 | vllm_config: 30 | gpu_memory_utilization: 0.9 31 | max_model_len: 12288 32 | max_num_batched_tokens: 98304 33 | generation_num_copies: 1 34 | temperature: 0.5 35 | repetition_penalty: 0.0 36 | max_tokens: 12288 37 | tensor_parallel_size: 1 38 | 39 | -------------------------------------------------------------------------------- /experiments/u6_7B_form/eval_50_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/humaneval" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | prompt_field: "prompt" 10 | label_field: "labels" 11 | debug: false 12 | reward_server_host: "0.0.0.0" 13 | reward_server_port: 5000 14 | response_begin_str: null 15 | code_start_str: "```python\n" 16 | code_end_str: "```" 17 | device_groups: 18 | - "cuda:0" 19 | - "cuda:1" 20 | - "cuda:2" 21 | - "cuda:3" 22 | - "cuda:4" 23 | - "cuda:5" 24 | code_format: "humaneval" 25 | success_threshold: 0 26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 27 | vllm_port: 8000 28 | max_new_tokens: 10240 29 | vllm_config: 30 | gpu_memory_utilization: 0.9 31 | max_model_len: 12288 32 | max_num_batched_tokens: 98304 33 | generation_num_copies: 1 34 | temperature: 0.5 35 | repetition_penalty: 0.0 36 | max_tokens: 12288 37 | tensor_parallel_size: 1 38 | 39 | -------------------------------------------------------------------------------- /experiments/u6_7B_fs/eval_50_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/humaneval" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | prompt_field: "prompt" 10 | label_field: "labels" 11 | debug: false 12 | reward_server_host: "0.0.0.0" 13 | reward_server_port: 5000 14 | response_begin_str: null 15 | code_start_str: "```python\n" 16 | code_end_str: "```" 17 | device_groups: 18 | - "cuda:0" 19 | - "cuda:1" 20 | - "cuda:2" 21 | - "cuda:3" 22 | - "cuda:4" 23 | - "cuda:5" 24 | code_format: "humaneval" 25 | success_threshold: 0 26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 27 | vllm_port: 8000 28 | max_new_tokens: 10240 29 | vllm_config: 30 | gpu_memory_utilization: 0.9 31 | max_model_len: 12288 32 | max_num_batched_tokens: 98304 33 | generation_num_copies: 1 34 | temperature: 0.5 35 | repetition_penalty: 0.0 36 | max_tokens: 12288 37 | tensor_parallel_size: 1 38 | 39 | -------------------------------------------------------------------------------- /experiments/u6_7B_gram/eval_50_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/humaneval" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | prompt_field: "prompt" 10 | label_field: "labels" 11 | debug: false 12 | reward_server_host: "0.0.0.0" 13 | reward_server_port: 5000 14 | response_begin_str: null 15 | code_start_str: "```python\n" 16 | code_end_str: "```" 17 | device_groups: 18 | - "cuda:0" 19 | - "cuda:1" 20 | - "cuda:2" 21 | - "cuda:3" 22 | - "cuda:4" 23 | - "cuda:5" 24 | code_format: "humaneval" 25 | success_threshold: 0 26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 27 | vllm_port: 8000 28 | max_new_tokens: 10240 29 | vllm_config: 30 | gpu_memory_utilization: 0.9 31 | max_model_len: 12288 32 | max_num_batched_tokens: 98304 33 | generation_num_copies: 1 34 | temperature: 0.5 35 | repetition_penalty: 0.0 36 | max_tokens: 12288 37 | tensor_parallel_size: 1 38 | 39 | -------------------------------------------------------------------------------- /experiments/u6_7B_sft/eval_50_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/humaneval" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | prompt_field: "prompt" 10 | label_field: "labels" 11 | debug: false 12 | reward_server_host: "0.0.0.0" 13 | reward_server_port: 5000 14 | response_begin_str: null 15 | code_start_str: "```python\n" 16 | code_end_str: "```" 17 | device_groups: 18 | - "cuda:0" 19 | - "cuda:1" 20 | - "cuda:2" 21 | - "cuda:3" 22 | - "cuda:4" 23 | - "cuda:5" 24 | code_format: "humaneval" 25 | success_threshold: 0 26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 27 | vllm_port: 8000 28 | max_new_tokens: 10240 29 | vllm_config: 30 | gpu_memory_utilization: 0.9 31 | max_model_len: 12288 32 | max_num_batched_tokens: 98304 33 | generation_num_copies: 1 34 | temperature: 0.5 35 | repetition_penalty: 0.0 36 | max_tokens: 12288 37 | tensor_parallel_size: 1 38 | 39 | -------------------------------------------------------------------------------- /experiments/u2_7B_base_sft/eval_50_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/humaneval" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | prompt_field: "prompt" 10 | label_field: "labels" 11 | debug: false 12 | reward_server_host: "0.0.0.0" 13 | reward_server_port: 5000 14 | response_begin_str: null 15 | code_start_str: "```python\n" 16 | code_end_str: "```" 17 | device_groups: 18 | - "cuda:0" 19 | - "cuda:1" 20 | - "cuda:2" 21 | - "cuda:3" 22 | - "cuda:4" 23 | - "cuda:5" 24 | code_format: "humaneval" 25 | success_threshold: 0 26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 27 | vllm_port: 8000 28 | max_new_tokens: 10240 29 | vllm_config: 30 | gpu_memory_utilization: 0.9 31 | max_model_len: 12288 32 | max_num_batched_tokens: 98304 33 | generation_num_copies: 1 34 | temperature: 0.5 35 | repetition_penalty: 0.0 36 | max_tokens: 12288 37 | tensor_parallel_size: 1 38 | 39 | -------------------------------------------------------------------------------- /experiments/u2_7B_few_shot/eval_50_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/humaneval" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | prompt_field: "prompt" 10 | label_field: "labels" 11 | debug: false 12 | reward_server_host: "0.0.0.0" 13 | reward_server_port: 5000 14 | response_begin_str: null 15 | code_start_str: "```python\n" 16 | code_end_str: "```" 17 | device_groups: 18 | - "cuda:0" 19 | - "cuda:1" 20 | - "cuda:2" 21 | - "cuda:3" 22 | - "cuda:4" 23 | - "cuda:5" 24 | code_format: "humaneval" 25 | success_threshold: 0 26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 27 | vllm_port: 8000 28 | max_new_tokens: 10240 29 | vllm_config: 30 | gpu_memory_utilization: 0.9 31 | max_model_len: 12288 32 | max_num_batched_tokens: 98304 33 | generation_num_copies: 1 34 | temperature: 0.5 35 | repetition_penalty: 0.0 36 | max_tokens: 12288 37 | tensor_parallel_size: 1 38 | 39 | -------------------------------------------------------------------------------- /experiments/u1_14B/eval_pi_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_test" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | prompt_field: "prompt" 10 | label_field: "verification_info" 11 | debug: false 12 | reward_server_host: "0.0.0.0" 13 | reward_server_port: 5000 14 | response_begin_str: null 15 | code_start_str: "```python\n" 16 | code_end_str: "```" 17 | device_groups: 18 | - "cuda:0" 19 | - "cuda:1" 20 | - "cuda:2" 21 | - "cuda:3" 22 | - "cuda:4" 23 | - "cuda:5" 24 | code_format: "pi_verifiable" 25 | success_threshold: 0 26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 27 | vllm_port: 8000 28 | max_new_tokens: 10240 29 | vllm_config: 30 | gpu_memory_utilization: 0.9 31 | max_model_len: 12288 32 | max_num_batched_tokens: 98304 33 | generation_num_copies: 1 34 | temperature: 0.5 35 | repetition_penalty: 0.0 36 | max_tokens: 12288 37 | tensor_parallel_size: 1 38 | 39 | -------------------------------------------------------------------------------- /experiments/u1_7B/eval_pi_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_test" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | prompt_field: "prompt" 10 | label_field: "verification_info" 11 | debug: false 12 | reward_server_host: "0.0.0.0" 13 | reward_server_port: 5000 14 | response_begin_str: null 15 | code_start_str: "```python\n" 16 | code_end_str: "```" 17 | device_groups: 18 | - "cuda:0" 19 | - "cuda:1" 20 | - "cuda:2" 21 | - "cuda:3" 22 | - "cuda:4" 23 | - "cuda:5" 24 | code_format: "pi_verifiable" 25 | success_threshold: 0 26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 27 | vllm_port: 8000 28 | max_new_tokens: 10240 29 | vllm_config: 30 | gpu_memory_utilization: 0.9 31 | max_model_len: 12288 32 | max_num_batched_tokens: 98304 33 | generation_num_copies: 1 34 | temperature: 0.5 35 | repetition_penalty: 0.0 36 | max_tokens: 12288 37 | tensor_parallel_size: 1 38 | 39 | -------------------------------------------------------------------------------- /experiments/u2_7B/eval_pi_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_test" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | prompt_field: "prompt" 10 | label_field: "verification_info" 11 | debug: false 12 | reward_server_host: "0.0.0.0" 13 | reward_server_port: 5000 14 | response_begin_str: null 15 | code_start_str: "```python\n" 16 | code_end_str: "```" 17 | device_groups: 18 | - "cuda:0" 19 | - "cuda:1" 20 | - "cuda:2" 21 | - "cuda:3" 22 | - "cuda:4" 23 | - "cuda:5" 24 | code_format: "pi_verifiable" 25 | success_threshold: 0 26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 27 | vllm_port: 8000 28 | max_new_tokens: 10240 29 | vllm_config: 30 | gpu_memory_utilization: 0.9 31 | max_model_len: 12288 32 | max_num_batched_tokens: 98304 33 | generation_num_copies: 1 34 | temperature: 0.5 35 | repetition_penalty: 0.0 36 | max_tokens: 12288 37 | tensor_parallel_size: 1 38 | 39 | -------------------------------------------------------------------------------- /experiments/u6_colo/eval_pi_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_test" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | prompt_field: "prompt" 10 | label_field: "verification_info" 11 | debug: false 12 | reward_server_host: "0.0.0.0" 13 | reward_server_port: 5000 14 | response_begin_str: null 15 | code_start_str: "```python\n" 16 | code_end_str: "```" 17 | device_groups: 18 | - "cuda:0" 19 | - "cuda:1" 20 | - "cuda:2" 21 | - "cuda:3" 22 | - "cuda:4" 23 | - "cuda:5" 24 | code_format: "pi_verifiable" 25 | success_threshold: 0 26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 27 | vllm_port: 8000 28 | max_new_tokens: 10240 29 | vllm_config: 30 | gpu_memory_utilization: 0.9 31 | max_model_len: 12288 32 | max_num_batched_tokens: 98304 33 | generation_num_copies: 1 34 | temperature: 0.5 35 | repetition_penalty: 0.0 36 | max_tokens: 12288 37 | tensor_parallel_size: 1 38 | 39 | -------------------------------------------------------------------------------- /experiments/u1_14B/eval_pi_50_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_test" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | prompt_field: "prompt" 10 | label_field: "verification_info" 11 | debug: false 12 | reward_server_host: "0.0.0.0" 13 | reward_server_port: 5000 14 | response_begin_str: null 15 | code_start_str: "```python\n" 16 | code_end_str: "```" 17 | device_groups: 18 | - "cuda:0" 19 | - "cuda:1" 20 | - "cuda:2" 21 | - "cuda:3" 22 | - "cuda:4" 23 | - "cuda:5" 24 | code_format: "pi_verifiable" 25 | success_threshold: 0 26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 27 | vllm_port: 8000 28 | max_new_tokens: 10240 29 | vllm_config: 30 | gpu_memory_utilization: 0.9 31 | max_model_len: 12288 32 | max_num_batched_tokens: 98304 33 | generation_num_copies: 1 34 | temperature: 0.5 35 | repetition_penalty: 0.0 36 | max_tokens: 12288 37 | tensor_parallel_size: 1 38 | 39 | -------------------------------------------------------------------------------- /experiments/u1_7B/eval_pi_50_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_test" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | prompt_field: "prompt" 10 | label_field: "verification_info" 11 | debug: false 12 | reward_server_host: "0.0.0.0" 13 | reward_server_port: 5000 14 | response_begin_str: null 15 | code_start_str: "```python\n" 16 | code_end_str: "```" 17 | device_groups: 18 | - "cuda:0" 19 | - "cuda:1" 20 | - "cuda:2" 21 | - "cuda:3" 22 | - "cuda:4" 23 | - "cuda:5" 24 | code_format: "pi_verifiable" 25 | success_threshold: 0 26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 27 | vllm_port: 8000 28 | max_new_tokens: 10240 29 | vllm_config: 30 | gpu_memory_utilization: 0.9 31 | max_model_len: 12288 32 | max_num_batched_tokens: 98304 33 | generation_num_copies: 1 34 | temperature: 0.5 35 | repetition_penalty: 0.0 36 | max_tokens: 12288 37 | tensor_parallel_size: 1 38 | 39 | -------------------------------------------------------------------------------- /experiments/u2_7B/eval_pi_50_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_test" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | prompt_field: "prompt" 10 | label_field: "verification_info" 11 | debug: false 12 | reward_server_host: "0.0.0.0" 13 | reward_server_port: 5000 14 | response_begin_str: null 15 | code_start_str: "```python\n" 16 | code_end_str: "```" 17 | device_groups: 18 | - "cuda:0" 19 | - "cuda:1" 20 | - "cuda:2" 21 | - "cuda:3" 22 | - "cuda:4" 23 | - "cuda:5" 24 | code_format: "pi_verifiable" 25 | success_threshold: 0 26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 27 | vllm_port: 8000 28 | max_new_tokens: 10240 29 | vllm_config: 30 | gpu_memory_utilization: 0.9 31 | max_model_len: 12288 32 | max_num_batched_tokens: 98304 33 | generation_num_copies: 1 34 | temperature: 0.5 35 | repetition_penalty: 0.0 36 | max_tokens: 12288 37 | tensor_parallel_size: 1 38 | 39 | -------------------------------------------------------------------------------- /experiments/u2_7B_3e-5/eval_pi_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_test" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | prompt_field: "prompt" 10 | label_field: "verification_info" 11 | debug: false 12 | reward_server_host: "0.0.0.0" 13 | reward_server_port: 5000 14 | response_begin_str: null 15 | code_start_str: "```python\n" 16 | code_end_str: "```" 17 | device_groups: 18 | - "cuda:0" 19 | - "cuda:1" 20 | - "cuda:2" 21 | - "cuda:3" 22 | - "cuda:4" 23 | - "cuda:5" 24 | code_format: "pi_verifiable" 25 | success_threshold: 0 26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 27 | vllm_port: 8000 28 | max_new_tokens: 10240 29 | vllm_config: 30 | gpu_memory_utilization: 0.9 31 | max_model_len: 12288 32 | max_num_batched_tokens: 98304 33 | generation_num_copies: 1 34 | temperature: 0.5 35 | repetition_penalty: 0.0 36 | max_tokens: 12288 37 | tensor_parallel_size: 1 38 | 39 | -------------------------------------------------------------------------------- /experiments/u2_7B_gram/eval_pi_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_test" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | prompt_field: "prompt" 10 | label_field: "verification_info" 11 | debug: false 12 | reward_server_host: "0.0.0.0" 13 | reward_server_port: 5000 14 | response_begin_str: null 15 | code_start_str: "```python\n" 16 | code_end_str: "```" 17 | device_groups: 18 | - "cuda:0" 19 | - "cuda:1" 20 | - "cuda:2" 21 | - "cuda:3" 22 | - "cuda:4" 23 | - "cuda:5" 24 | code_format: "pi_verifiable" 25 | success_threshold: 0 26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 27 | vllm_port: 8000 28 | max_new_tokens: 10240 29 | vllm_config: 30 | gpu_memory_utilization: 0.9 31 | max_model_len: 12288 32 | max_num_batched_tokens: 98304 33 | generation_num_copies: 1 34 | temperature: 0.5 35 | repetition_penalty: 0.0 36 | max_tokens: 12288 37 | tensor_parallel_size: 1 38 | 39 | -------------------------------------------------------------------------------- /experiments/u4_7B_gram/eval_pi_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_test" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | prompt_field: "prompt" 10 | label_field: "verification_info" 11 | debug: false 12 | reward_server_host: "0.0.0.0" 13 | reward_server_port: 5000 14 | response_begin_str: null 15 | code_start_str: "```python\n" 16 | code_end_str: "```" 17 | device_groups: 18 | - "cuda:0" 19 | - "cuda:1" 20 | - "cuda:2" 21 | - "cuda:3" 22 | - "cuda:4" 23 | - "cuda:5" 24 | code_format: "pi_verifiable" 25 | success_threshold: 0 26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 27 | vllm_port: 8000 28 | max_new_tokens: 10240 29 | vllm_config: 30 | gpu_memory_utilization: 0.9 31 | max_model_len: 12288 32 | max_num_batched_tokens: 98304 33 | generation_num_copies: 1 34 | temperature: 0.5 35 | repetition_penalty: 0.0 36 | max_tokens: 12288 37 | tensor_parallel_size: 1 38 | 39 | -------------------------------------------------------------------------------- /experiments/u5_7B_gram/eval_pi_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_test" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | prompt_field: "prompt" 10 | label_field: "verification_info" 11 | debug: false 12 | reward_server_host: "0.0.0.0" 13 | reward_server_port: 5000 14 | response_begin_str: null 15 | code_start_str: "```python\n" 16 | code_end_str: "```" 17 | device_groups: 18 | - "cuda:0" 19 | - "cuda:1" 20 | - "cuda:2" 21 | - "cuda:3" 22 | - "cuda:4" 23 | - "cuda:5" 24 | code_format: "pi_verifiable" 25 | success_threshold: 0 26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 27 | vllm_port: 8000 28 | max_new_tokens: 10240 29 | vllm_config: 30 | gpu_memory_utilization: 0.9 31 | max_model_len: 12288 32 | max_num_batched_tokens: 98304 33 | generation_num_copies: 1 34 | temperature: 0.5 35 | repetition_penalty: 0.0 36 | max_tokens: 12288 37 | tensor_parallel_size: 1 38 | 39 | -------------------------------------------------------------------------------- /experiments/u6_7B_form/eval_pi_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_test" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | prompt_field: "prompt" 10 | label_field: "verification_info" 11 | debug: false 12 | reward_server_host: "0.0.0.0" 13 | reward_server_port: 5000 14 | response_begin_str: null 15 | code_start_str: "```python\n" 16 | code_end_str: "```" 17 | device_groups: 18 | - "cuda:0" 19 | - "cuda:1" 20 | - "cuda:2" 21 | - "cuda:3" 22 | - "cuda:4" 23 | - "cuda:5" 24 | code_format: "pi_verifiable" 25 | success_threshold: 0 26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 27 | vllm_port: 8000 28 | max_new_tokens: 10240 29 | vllm_config: 30 | gpu_memory_utilization: 0.9 31 | max_model_len: 12288 32 | max_num_batched_tokens: 98304 33 | generation_num_copies: 1 34 | temperature: 0.5 35 | repetition_penalty: 0.0 36 | max_tokens: 12288 37 | tensor_parallel_size: 1 38 | 39 | -------------------------------------------------------------------------------- /experiments/u6_7B_fs/eval_pi_50_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_test" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | prompt_field: "prompt" 10 | label_field: "verification_info" 11 | debug: false 12 | reward_server_host: "0.0.0.0" 13 | reward_server_port: 5000 14 | response_begin_str: null 15 | code_start_str: "```python\n" 16 | code_end_str: "```" 17 | device_groups: 18 | - "cuda:0" 19 | - "cuda:1" 20 | - "cuda:2" 21 | - "cuda:3" 22 | - "cuda:4" 23 | - "cuda:5" 24 | code_format: "pi_verifiable" 25 | success_threshold: 0 26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 27 | vllm_port: 8000 28 | max_new_tokens: 10240 29 | vllm_config: 30 | gpu_memory_utilization: 0.9 31 | max_model_len: 12288 32 | max_num_batched_tokens: 98304 33 | generation_num_copies: 1 34 | temperature: 0.5 35 | repetition_penalty: 0.0 36 | max_tokens: 12288 37 | tensor_parallel_size: 1 38 | 39 | -------------------------------------------------------------------------------- /experiments/u6_7B_fs/eval_pi_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_test" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | prompt_field: "prompt" 10 | label_field: "verification_info" 11 | debug: false 12 | reward_server_host: "0.0.0.0" 13 | reward_server_port: 5000 14 | response_begin_str: null 15 | code_start_str: "```python\n" 16 | code_end_str: "```" 17 | device_groups: 18 | - "cuda:0" 19 | - "cuda:1" 20 | - "cuda:2" 21 | - "cuda:3" 22 | - "cuda:4" 23 | - "cuda:5" 24 | code_format: "pi_verifiable" 25 | success_threshold: 0 26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 27 | vllm_port: 8000 28 | max_new_tokens: 10240 29 | vllm_config: 30 | gpu_memory_utilization: 0.9 31 | max_model_len: 12288 32 | max_num_batched_tokens: 98304 33 | generation_num_copies: 1 34 | temperature: 0.5 35 | repetition_penalty: 0.0 36 | max_tokens: 12288 37 | tensor_parallel_size: 1 38 | 39 | -------------------------------------------------------------------------------- /experiments/u6_7B_gram/eval_pi_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_test" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | prompt_field: "prompt" 10 | label_field: "verification_info" 11 | debug: false 12 | reward_server_host: "0.0.0.0" 13 | reward_server_port: 5000 14 | response_begin_str: null 15 | code_start_str: "```python\n" 16 | code_end_str: "```" 17 | device_groups: 18 | - "cuda:0" 19 | - "cuda:1" 20 | - "cuda:2" 21 | - "cuda:3" 22 | - "cuda:4" 23 | - "cuda:5" 24 | code_format: "pi_verifiable" 25 | success_threshold: 0 26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 27 | vllm_port: 8000 28 | max_new_tokens: 10240 29 | vllm_config: 30 | gpu_memory_utilization: 0.9 31 | max_model_len: 12288 32 | max_num_batched_tokens: 98304 33 | generation_num_copies: 1 34 | temperature: 0.5 35 | repetition_penalty: 0.0 36 | max_tokens: 12288 37 | tensor_parallel_size: 1 38 | 39 | -------------------------------------------------------------------------------- /experiments/u6_7B_sft/eval_pi_50_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_test" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | prompt_field: "prompt" 10 | label_field: "verification_info" 11 | debug: false 12 | reward_server_host: "0.0.0.0" 13 | reward_server_port: 5000 14 | response_begin_str: null 15 | code_start_str: "```python\n" 16 | code_end_str: "```" 17 | device_groups: 18 | - "cuda:0" 19 | - "cuda:1" 20 | - "cuda:2" 21 | - "cuda:3" 22 | - "cuda:4" 23 | - "cuda:5" 24 | code_format: "pi_verifiable" 25 | success_threshold: 0 26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 27 | vllm_port: 8000 28 | max_new_tokens: 10240 29 | vllm_config: 30 | gpu_memory_utilization: 0.9 31 | max_model_len: 12288 32 | max_num_batched_tokens: 98304 33 | generation_num_copies: 1 34 | temperature: 0.5 35 | repetition_penalty: 0.0 36 | max_tokens: 12288 37 | tensor_parallel_size: 1 38 | 39 | -------------------------------------------------------------------------------- /experiments/u6_7B_sft/eval_pi_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_test" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | prompt_field: "prompt" 10 | label_field: "verification_info" 11 | debug: false 12 | reward_server_host: "0.0.0.0" 13 | reward_server_port: 5000 14 | response_begin_str: null 15 | code_start_str: "```python\n" 16 | code_end_str: "```" 17 | device_groups: 18 | - "cuda:0" 19 | - "cuda:1" 20 | - "cuda:2" 21 | - "cuda:3" 22 | - "cuda:4" 23 | - "cuda:5" 24 | code_format: "pi_verifiable" 25 | success_threshold: 0 26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 27 | vllm_port: 8000 28 | max_new_tokens: 10240 29 | vllm_config: 30 | gpu_memory_utilization: 0.9 31 | max_model_len: 12288 32 | max_num_batched_tokens: 98304 33 | generation_num_copies: 1 34 | temperature: 0.5 35 | repetition_penalty: 0.0 36 | max_tokens: 12288 37 | tensor_parallel_size: 1 38 | 39 | -------------------------------------------------------------------------------- /experiments/u6_colo/eval_pi_50_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_test" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | prompt_field: "prompt" 10 | label_field: "verification_info" 11 | debug: false 12 | reward_server_host: "0.0.0.0" 13 | reward_server_port: 5000 14 | response_begin_str: null 15 | code_start_str: "```python\n" 16 | code_end_str: "```" 17 | device_groups: 18 | - "cuda:0" 19 | - "cuda:1" 20 | - "cuda:2" 21 | - "cuda:3" 22 | - "cuda:4" 23 | - "cuda:5" 24 | code_format: "pi_verifiable" 25 | success_threshold: 0 26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 27 | vllm_port: 8000 28 | max_new_tokens: 10240 29 | vllm_config: 30 | gpu_memory_utilization: 0.9 31 | max_model_len: 12288 32 | max_num_batched_tokens: 98304 33 | generation_num_copies: 1 34 | temperature: 0.5 35 | repetition_penalty: 0.0 36 | max_tokens: 12288 37 | tensor_parallel_size: 1 38 | 39 | -------------------------------------------------------------------------------- /experiments/u2_7B_3e-5/eval_pi_50_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_test" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | prompt_field: "prompt" 10 | label_field: "verification_info" 11 | debug: false 12 | reward_server_host: "0.0.0.0" 13 | reward_server_port: 5000 14 | response_begin_str: null 15 | code_start_str: "```python\n" 16 | code_end_str: "```" 17 | device_groups: 18 | - "cuda:0" 19 | - "cuda:1" 20 | - "cuda:2" 21 | - "cuda:3" 22 | - "cuda:4" 23 | - "cuda:5" 24 | code_format: "pi_verifiable" 25 | success_threshold: 0 26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 27 | vllm_port: 8000 28 | max_new_tokens: 10240 29 | vllm_config: 30 | gpu_memory_utilization: 0.9 31 | max_model_len: 12288 32 | max_num_batched_tokens: 98304 33 | generation_num_copies: 1 34 | temperature: 0.5 35 | repetition_penalty: 0.0 36 | max_tokens: 12288 37 | tensor_parallel_size: 1 38 | 39 | -------------------------------------------------------------------------------- /experiments/u2_7B_base_sft/eval_pi_50_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_test" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | prompt_field: "prompt" 10 | label_field: "verification_info" 11 | debug: false 12 | reward_server_host: "0.0.0.0" 13 | reward_server_port: 5000 14 | response_begin_str: null 15 | code_start_str: "```python\n" 16 | code_end_str: "```" 17 | device_groups: 18 | - "cuda:0" 19 | - "cuda:1" 20 | - "cuda:2" 21 | - "cuda:3" 22 | - "cuda:4" 23 | - "cuda:5" 24 | code_format: "pi_verifiable" 25 | success_threshold: 0 26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 27 | vllm_port: 8000 28 | max_new_tokens: 10240 29 | vllm_config: 30 | gpu_memory_utilization: 0.9 31 | max_model_len: 12288 32 | max_num_batched_tokens: 98304 33 | generation_num_copies: 1 34 | temperature: 0.5 35 | repetition_penalty: 0.0 36 | max_tokens: 12288 37 | tensor_parallel_size: 1 38 | 39 | -------------------------------------------------------------------------------- /experiments/u2_7B_base_sft/eval_pi_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_test" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | prompt_field: "prompt" 10 | label_field: "verification_info" 11 | debug: false 12 | reward_server_host: "0.0.0.0" 13 | reward_server_port: 5000 14 | response_begin_str: null 15 | code_start_str: "```python\n" 16 | code_end_str: "```" 17 | device_groups: 18 | - "cuda:0" 19 | - "cuda:1" 20 | - "cuda:2" 21 | - "cuda:3" 22 | - "cuda:4" 23 | - "cuda:5" 24 | code_format: "pi_verifiable" 25 | success_threshold: 0 26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 27 | vllm_port: 8000 28 | max_new_tokens: 10240 29 | vllm_config: 30 | gpu_memory_utilization: 0.9 31 | max_model_len: 12288 32 | max_num_batched_tokens: 98304 33 | generation_num_copies: 1 34 | temperature: 0.5 35 | repetition_penalty: 0.0 36 | max_tokens: 12288 37 | tensor_parallel_size: 1 38 | 39 | -------------------------------------------------------------------------------- /experiments/u2_7B_few_shot/eval_pi_50_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_test" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | prompt_field: "prompt" 10 | label_field: "verification_info" 11 | debug: false 12 | reward_server_host: "0.0.0.0" 13 | reward_server_port: 5000 14 | response_begin_str: null 15 | code_start_str: "```python\n" 16 | code_end_str: "```" 17 | device_groups: 18 | - "cuda:0" 19 | - "cuda:1" 20 | - "cuda:2" 21 | - "cuda:3" 22 | - "cuda:4" 23 | - "cuda:5" 24 | code_format: "pi_verifiable" 25 | success_threshold: 0 26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 27 | vllm_port: 8000 28 | max_new_tokens: 10240 29 | vllm_config: 30 | gpu_memory_utilization: 0.9 31 | max_model_len: 12288 32 | max_num_batched_tokens: 98304 33 | generation_num_copies: 1 34 | temperature: 0.5 35 | repetition_penalty: 0.0 36 | max_tokens: 12288 37 | tensor_parallel_size: 1 38 | 39 | -------------------------------------------------------------------------------- /experiments/u2_7B_few_shot/eval_pi_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_test" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | prompt_field: "prompt" 10 | label_field: "verification_info" 11 | debug: false 12 | reward_server_host: "0.0.0.0" 13 | reward_server_port: 5000 14 | response_begin_str: null 15 | code_start_str: "```python\n" 16 | code_end_str: "```" 17 | device_groups: 18 | - "cuda:0" 19 | - "cuda:1" 20 | - "cuda:2" 21 | - "cuda:3" 22 | - "cuda:4" 23 | - "cuda:5" 24 | code_format: "pi_verifiable" 25 | success_threshold: 0 26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 27 | vllm_port: 8000 28 | max_new_tokens: 10240 29 | vllm_config: 30 | gpu_memory_utilization: 0.9 31 | max_model_len: 12288 32 | max_num_batched_tokens: 98304 33 | generation_num_copies: 1 34 | temperature: 0.5 35 | repetition_penalty: 0.0 36 | max_tokens: 12288 37 | tensor_parallel_size: 1 38 | 39 | -------------------------------------------------------------------------------- /experiments/u2_7B_gram/eval_pi_50_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_test" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | prompt_field: "prompt" 10 | label_field: "verification_info" 11 | debug: false 12 | reward_server_host: "0.0.0.0" 13 | reward_server_port: 5000 14 | response_begin_str: null 15 | code_start_str: "```python\n" 16 | code_end_str: "```" 17 | device_groups: 18 | - "cuda:0" 19 | - "cuda:1" 20 | - "cuda:2" 21 | - "cuda:3" 22 | - "cuda:4" 23 | - "cuda:5" 24 | code_format: "pi_verifiable" 25 | success_threshold: 0 26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 27 | vllm_port: 8000 28 | max_new_tokens: 10240 29 | vllm_config: 30 | gpu_memory_utilization: 0.9 31 | max_model_len: 12288 32 | max_num_batched_tokens: 98304 33 | generation_num_copies: 1 34 | temperature: 0.5 35 | repetition_penalty: 0.0 36 | max_tokens: 12288 37 | tensor_parallel_size: 1 38 | 39 | -------------------------------------------------------------------------------- /experiments/u4_7B_gram/eval_pi_50_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_test" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | prompt_field: "prompt" 10 | label_field: "verification_info" 11 | debug: false 12 | reward_server_host: "0.0.0.0" 13 | reward_server_port: 5000 14 | response_begin_str: null 15 | code_start_str: "```python\n" 16 | code_end_str: "```" 17 | device_groups: 18 | - "cuda:0" 19 | - "cuda:1" 20 | - "cuda:2" 21 | - "cuda:3" 22 | - "cuda:4" 23 | - "cuda:5" 24 | code_format: "pi_verifiable" 25 | success_threshold: 0 26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 27 | vllm_port: 8000 28 | max_new_tokens: 10240 29 | vllm_config: 30 | gpu_memory_utilization: 0.9 31 | max_model_len: 12288 32 | max_num_batched_tokens: 98304 33 | generation_num_copies: 1 34 | temperature: 0.5 35 | repetition_penalty: 0.0 36 | max_tokens: 12288 37 | tensor_parallel_size: 1 38 | 39 | -------------------------------------------------------------------------------- /experiments/u5_7B_gram/eval_pi_50_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_test" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | prompt_field: "prompt" 10 | label_field: "verification_info" 11 | debug: false 12 | reward_server_host: "0.0.0.0" 13 | reward_server_port: 5000 14 | response_begin_str: null 15 | code_start_str: "```python\n" 16 | code_end_str: "```" 17 | device_groups: 18 | - "cuda:0" 19 | - "cuda:1" 20 | - "cuda:2" 21 | - "cuda:3" 22 | - "cuda:4" 23 | - "cuda:5" 24 | code_format: "pi_verifiable" 25 | success_threshold: 0 26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 27 | vllm_port: 8000 28 | max_new_tokens: 10240 29 | vllm_config: 30 | gpu_memory_utilization: 0.9 31 | max_model_len: 12288 32 | max_num_batched_tokens: 98304 33 | generation_num_copies: 1 34 | temperature: 0.5 35 | repetition_penalty: 0.0 36 | max_tokens: 12288 37 | tensor_parallel_size: 1 38 | 39 | -------------------------------------------------------------------------------- /experiments/u6_7B_form/eval_pi_50_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_test" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | prompt_field: "prompt" 10 | label_field: "verification_info" 11 | debug: false 12 | reward_server_host: "0.0.0.0" 13 | reward_server_port: 5000 14 | response_begin_str: null 15 | code_start_str: "```python\n" 16 | code_end_str: "```" 17 | device_groups: 18 | - "cuda:0" 19 | - "cuda:1" 20 | - "cuda:2" 21 | - "cuda:3" 22 | - "cuda:4" 23 | - "cuda:5" 24 | code_format: "pi_verifiable" 25 | success_threshold: 0 26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 27 | vllm_port: 8000 28 | max_new_tokens: 10240 29 | vllm_config: 30 | gpu_memory_utilization: 0.9 31 | max_model_len: 12288 32 | max_num_batched_tokens: 98304 33 | generation_num_copies: 1 34 | temperature: 0.5 35 | repetition_penalty: 0.0 36 | max_tokens: 12288 37 | tensor_parallel_size: 1 38 | 39 | -------------------------------------------------------------------------------- /experiments/u6_7B_gram/eval_pi_50_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_test" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | prompt_field: "prompt" 10 | label_field: "verification_info" 11 | debug: false 12 | reward_server_host: "0.0.0.0" 13 | reward_server_port: 5000 14 | response_begin_str: null 15 | code_start_str: "```python\n" 16 | code_end_str: "```" 17 | device_groups: 18 | - "cuda:0" 19 | - "cuda:1" 20 | - "cuda:2" 21 | - "cuda:3" 22 | - "cuda:4" 23 | - "cuda:5" 24 | code_format: "pi_verifiable" 25 | success_threshold: 0 26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 27 | vllm_port: 8000 28 | max_new_tokens: 10240 29 | vllm_config: 30 | gpu_memory_utilization: 0.9 31 | max_model_len: 12288 32 | max_num_batched_tokens: 98304 33 | generation_num_copies: 1 34 | temperature: 0.5 35 | repetition_penalty: 0.0 36 | max_tokens: 12288 37 | tensor_parallel_size: 1 38 | 39 | -------------------------------------------------------------------------------- /experiments/u1_14B/eval_pi_train_50_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_train" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | max_num_samples: 252 10 | prompt_field: "prompt" 11 | label_field: "verification_info" 12 | debug: false 13 | reward_server_host: "0.0.0.0" 14 | reward_server_port: 5000 15 | response_begin_str: null 16 | code_start_str: "```python\n" 17 | code_end_str: "```" 18 | device_groups: 19 | - "cuda:0" 20 | - "cuda:1" 21 | - "cuda:2" 22 | - "cuda:3" 23 | - "cuda:4" 24 | - "cuda:5" 25 | code_format: "pi_verifiable" 26 | success_threshold: 0 27 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 28 | vllm_port: 8000 29 | max_new_tokens: 10240 30 | vllm_config: 31 | gpu_memory_utilization: 0.9 32 | max_model_len: 12288 33 | max_num_batched_tokens: 98304 34 | generation_num_copies: 1 35 | temperature: 0.5 36 | repetition_penalty: 0.0 37 | max_tokens: 12288 38 | tensor_parallel_size: 1 39 | 40 | -------------------------------------------------------------------------------- /experiments/u1_14B/eval_pi_train_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_train" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | max_num_samples: 252 10 | prompt_field: "prompt" 11 | label_field: "verification_info" 12 | debug: false 13 | reward_server_host: "0.0.0.0" 14 | reward_server_port: 5000 15 | response_begin_str: null 16 | code_start_str: "```python\n" 17 | code_end_str: "```" 18 | device_groups: 19 | - "cuda:0" 20 | - "cuda:1" 21 | - "cuda:2" 22 | - "cuda:3" 23 | - "cuda:4" 24 | - "cuda:5" 25 | code_format: "pi_verifiable" 26 | success_threshold: 0 27 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 28 | vllm_port: 8000 29 | max_new_tokens: 10240 30 | vllm_config: 31 | gpu_memory_utilization: 0.9 32 | max_model_len: 12288 33 | max_num_batched_tokens: 98304 34 | generation_num_copies: 1 35 | temperature: 0.5 36 | repetition_penalty: 0.0 37 | max_tokens: 12288 38 | tensor_parallel_size: 1 39 | 40 | -------------------------------------------------------------------------------- /experiments/u1_7B/eval_pi_train_50_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_train" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | max_num_samples: 252 10 | prompt_field: "prompt" 11 | label_field: "verification_info" 12 | debug: false 13 | reward_server_host: "0.0.0.0" 14 | reward_server_port: 5000 15 | response_begin_str: null 16 | code_start_str: "```python\n" 17 | code_end_str: "```" 18 | device_groups: 19 | - "cuda:0" 20 | - "cuda:1" 21 | - "cuda:2" 22 | - "cuda:3" 23 | - "cuda:4" 24 | - "cuda:5" 25 | code_format: "pi_verifiable" 26 | success_threshold: 0 27 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 28 | vllm_port: 8000 29 | max_new_tokens: 10240 30 | vllm_config: 31 | gpu_memory_utilization: 0.9 32 | max_model_len: 12288 33 | max_num_batched_tokens: 98304 34 | generation_num_copies: 1 35 | temperature: 0.5 36 | repetition_penalty: 0.0 37 | max_tokens: 12288 38 | tensor_parallel_size: 1 39 | 40 | -------------------------------------------------------------------------------- /experiments/u1_7B/eval_pi_train_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_train" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | max_num_samples: 252 10 | prompt_field: "prompt" 11 | label_field: "verification_info" 12 | debug: false 13 | reward_server_host: "0.0.0.0" 14 | reward_server_port: 5000 15 | response_begin_str: null 16 | code_start_str: "```python\n" 17 | code_end_str: "```" 18 | device_groups: 19 | - "cuda:0" 20 | - "cuda:1" 21 | - "cuda:2" 22 | - "cuda:3" 23 | - "cuda:4" 24 | - "cuda:5" 25 | code_format: "pi_verifiable" 26 | success_threshold: 0 27 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 28 | vllm_port: 8000 29 | max_new_tokens: 10240 30 | vllm_config: 31 | gpu_memory_utilization: 0.9 32 | max_model_len: 12288 33 | max_num_batched_tokens: 98304 34 | generation_num_copies: 1 35 | temperature: 0.5 36 | repetition_penalty: 0.0 37 | max_tokens: 12288 38 | tensor_parallel_size: 1 39 | 40 | -------------------------------------------------------------------------------- /experiments/u2_7B/eval_pi_train_50_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_train" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | max_num_samples: 252 10 | prompt_field: "prompt" 11 | label_field: "verification_info" 12 | debug: false 13 | reward_server_host: "0.0.0.0" 14 | reward_server_port: 5000 15 | response_begin_str: null 16 | code_start_str: "```python\n" 17 | code_end_str: "```" 18 | device_groups: 19 | - "cuda:0" 20 | - "cuda:1" 21 | - "cuda:2" 22 | - "cuda:3" 23 | - "cuda:4" 24 | - "cuda:5" 25 | code_format: "pi_verifiable" 26 | success_threshold: 0 27 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 28 | vllm_port: 8000 29 | max_new_tokens: 10240 30 | vllm_config: 31 | gpu_memory_utilization: 0.9 32 | max_model_len: 12288 33 | max_num_batched_tokens: 98304 34 | generation_num_copies: 1 35 | temperature: 0.5 36 | repetition_penalty: 0.0 37 | max_tokens: 12288 38 | tensor_parallel_size: 1 39 | 40 | -------------------------------------------------------------------------------- /experiments/u2_7B/eval_pi_train_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_train" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | max_num_samples: 252 10 | prompt_field: "prompt" 11 | label_field: "verification_info" 12 | debug: false 13 | reward_server_host: "0.0.0.0" 14 | reward_server_port: 5000 15 | response_begin_str: null 16 | code_start_str: "```python\n" 17 | code_end_str: "```" 18 | device_groups: 19 | - "cuda:0" 20 | - "cuda:1" 21 | - "cuda:2" 22 | - "cuda:3" 23 | - "cuda:4" 24 | - "cuda:5" 25 | code_format: "pi_verifiable" 26 | success_threshold: 0 27 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 28 | vllm_port: 8000 29 | max_new_tokens: 10240 30 | vllm_config: 31 | gpu_memory_utilization: 0.9 32 | max_model_len: 12288 33 | max_num_batched_tokens: 98304 34 | generation_num_copies: 1 35 | temperature: 0.5 36 | repetition_penalty: 0.0 37 | max_tokens: 12288 38 | tensor_parallel_size: 1 39 | 40 | -------------------------------------------------------------------------------- /experiments/u6_7B_fs/eval_pi_train_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_train" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | max_num_samples: 252 10 | prompt_field: "prompt" 11 | label_field: "verification_info" 12 | debug: false 13 | reward_server_host: "0.0.0.0" 14 | reward_server_port: 5000 15 | response_begin_str: null 16 | code_start_str: "```python\n" 17 | code_end_str: "```" 18 | device_groups: 19 | - "cuda:0" 20 | - "cuda:1" 21 | - "cuda:2" 22 | - "cuda:3" 23 | - "cuda:4" 24 | - "cuda:5" 25 | code_format: "pi_verifiable" 26 | success_threshold: 0 27 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 28 | vllm_port: 8000 29 | max_new_tokens: 10240 30 | vllm_config: 31 | gpu_memory_utilization: 0.9 32 | max_model_len: 12288 33 | max_num_batched_tokens: 98304 34 | generation_num_copies: 1 35 | temperature: 0.5 36 | repetition_penalty: 0.0 37 | max_tokens: 12288 38 | tensor_parallel_size: 1 39 | 40 | -------------------------------------------------------------------------------- /experiments/u6_7B_sft/eval_pi_train_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_train" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | max_num_samples: 252 10 | prompt_field: "prompt" 11 | label_field: "verification_info" 12 | debug: false 13 | reward_server_host: "0.0.0.0" 14 | reward_server_port: 5000 15 | response_begin_str: null 16 | code_start_str: "```python\n" 17 | code_end_str: "```" 18 | device_groups: 19 | - "cuda:0" 20 | - "cuda:1" 21 | - "cuda:2" 22 | - "cuda:3" 23 | - "cuda:4" 24 | - "cuda:5" 25 | code_format: "pi_verifiable" 26 | success_threshold: 0 27 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 28 | vllm_port: 8000 29 | max_new_tokens: 10240 30 | vllm_config: 31 | gpu_memory_utilization: 0.9 32 | max_model_len: 12288 33 | max_num_batched_tokens: 98304 34 | generation_num_copies: 1 35 | temperature: 0.5 36 | repetition_penalty: 0.0 37 | max_tokens: 12288 38 | tensor_parallel_size: 1 39 | 40 | -------------------------------------------------------------------------------- /experiments/u6_colo/eval_pi_train_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_train" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | max_num_samples: 252 10 | prompt_field: "prompt" 11 | label_field: "verification_info" 12 | debug: false 13 | reward_server_host: "0.0.0.0" 14 | reward_server_port: 5000 15 | response_begin_str: null 16 | code_start_str: "```python\n" 17 | code_end_str: "```" 18 | device_groups: 19 | - "cuda:0" 20 | - "cuda:1" 21 | - "cuda:2" 22 | - "cuda:3" 23 | - "cuda:4" 24 | - "cuda:5" 25 | code_format: "pi_verifiable" 26 | success_threshold: 0 27 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 28 | vllm_port: 8000 29 | max_new_tokens: 10240 30 | vllm_config: 31 | gpu_memory_utilization: 0.9 32 | max_model_len: 12288 33 | max_num_batched_tokens: 98304 34 | generation_num_copies: 1 35 | temperature: 0.5 36 | repetition_penalty: 0.0 37 | max_tokens: 12288 38 | tensor_parallel_size: 1 39 | 40 | -------------------------------------------------------------------------------- /experiments/u2_7B_3e-5/eval_pi_train_50_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_train" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | max_num_samples: 252 10 | prompt_field: "prompt" 11 | label_field: "verification_info" 12 | debug: false 13 | reward_server_host: "0.0.0.0" 14 | reward_server_port: 5000 15 | response_begin_str: null 16 | code_start_str: "```python\n" 17 | code_end_str: "```" 18 | device_groups: 19 | - "cuda:0" 20 | - "cuda:1" 21 | - "cuda:2" 22 | - "cuda:3" 23 | - "cuda:4" 24 | - "cuda:5" 25 | code_format: "pi_verifiable" 26 | success_threshold: 0 27 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 28 | vllm_port: 8000 29 | max_new_tokens: 10240 30 | vllm_config: 31 | gpu_memory_utilization: 0.9 32 | max_model_len: 12288 33 | max_num_batched_tokens: 98304 34 | generation_num_copies: 1 35 | temperature: 0.5 36 | repetition_penalty: 0.0 37 | max_tokens: 12288 38 | tensor_parallel_size: 1 39 | 40 | -------------------------------------------------------------------------------- /experiments/u2_7B_3e-5/eval_pi_train_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_train" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | max_num_samples: 252 10 | prompt_field: "prompt" 11 | label_field: "verification_info" 12 | debug: false 13 | reward_server_host: "0.0.0.0" 14 | reward_server_port: 5000 15 | response_begin_str: null 16 | code_start_str: "```python\n" 17 | code_end_str: "```" 18 | device_groups: 19 | - "cuda:0" 20 | - "cuda:1" 21 | - "cuda:2" 22 | - "cuda:3" 23 | - "cuda:4" 24 | - "cuda:5" 25 | code_format: "pi_verifiable" 26 | success_threshold: 0 27 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 28 | vllm_port: 8000 29 | max_new_tokens: 10240 30 | vllm_config: 31 | gpu_memory_utilization: 0.9 32 | max_model_len: 12288 33 | max_num_batched_tokens: 98304 34 | generation_num_copies: 1 35 | temperature: 0.5 36 | repetition_penalty: 0.0 37 | max_tokens: 12288 38 | tensor_parallel_size: 1 39 | 40 | -------------------------------------------------------------------------------- /experiments/u2_7B_base_sft/eval_pi_train_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_train" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | max_num_samples: 252 10 | prompt_field: "prompt" 11 | label_field: "verification_info" 12 | debug: false 13 | reward_server_host: "0.0.0.0" 14 | reward_server_port: 5000 15 | response_begin_str: null 16 | code_start_str: "```python\n" 17 | code_end_str: "```" 18 | device_groups: 19 | - "cuda:0" 20 | - "cuda:1" 21 | - "cuda:2" 22 | - "cuda:3" 23 | - "cuda:4" 24 | - "cuda:5" 25 | code_format: "pi_verifiable" 26 | success_threshold: 0 27 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 28 | vllm_port: 8000 29 | max_new_tokens: 10240 30 | vllm_config: 31 | gpu_memory_utilization: 0.9 32 | max_model_len: 12288 33 | max_num_batched_tokens: 98304 34 | generation_num_copies: 1 35 | temperature: 0.5 36 | repetition_penalty: 0.0 37 | max_tokens: 12288 38 | tensor_parallel_size: 1 39 | 40 | -------------------------------------------------------------------------------- /experiments/u2_7B_few_shot/eval_pi_train_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_train" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | max_num_samples: 252 10 | prompt_field: "prompt" 11 | label_field: "verification_info" 12 | debug: false 13 | reward_server_host: "0.0.0.0" 14 | reward_server_port: 5000 15 | response_begin_str: null 16 | code_start_str: "```python\n" 17 | code_end_str: "```" 18 | device_groups: 19 | - "cuda:0" 20 | - "cuda:1" 21 | - "cuda:2" 22 | - "cuda:3" 23 | - "cuda:4" 24 | - "cuda:5" 25 | code_format: "pi_verifiable" 26 | success_threshold: 0 27 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 28 | vllm_port: 8000 29 | max_new_tokens: 10240 30 | vllm_config: 31 | gpu_memory_utilization: 0.9 32 | max_model_len: 12288 33 | max_num_batched_tokens: 98304 34 | generation_num_copies: 1 35 | temperature: 0.5 36 | repetition_penalty: 0.0 37 | max_tokens: 12288 38 | tensor_parallel_size: 1 39 | 40 | -------------------------------------------------------------------------------- /experiments/u2_7B_gram/eval_pi_train_50_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_train" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | max_num_samples: 252 10 | prompt_field: "prompt" 11 | label_field: "verification_info" 12 | debug: false 13 | reward_server_host: "0.0.0.0" 14 | reward_server_port: 5000 15 | response_begin_str: null 16 | code_start_str: "```python\n" 17 | code_end_str: "```" 18 | device_groups: 19 | - "cuda:0" 20 | - "cuda:1" 21 | - "cuda:2" 22 | - "cuda:3" 23 | - "cuda:4" 24 | - "cuda:5" 25 | code_format: "pi_verifiable" 26 | success_threshold: 0 27 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 28 | vllm_port: 8000 29 | max_new_tokens: 10240 30 | vllm_config: 31 | gpu_memory_utilization: 0.9 32 | max_model_len: 12288 33 | max_num_batched_tokens: 98304 34 | generation_num_copies: 1 35 | temperature: 0.5 36 | repetition_penalty: 0.0 37 | max_tokens: 12288 38 | tensor_parallel_size: 1 39 | 40 | -------------------------------------------------------------------------------- /experiments/u2_7B_gram/eval_pi_train_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_train" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | max_num_samples: 252 10 | prompt_field: "prompt" 11 | label_field: "verification_info" 12 | debug: false 13 | reward_server_host: "0.0.0.0" 14 | reward_server_port: 5000 15 | response_begin_str: null 16 | code_start_str: "```python\n" 17 | code_end_str: "```" 18 | device_groups: 19 | - "cuda:0" 20 | - "cuda:1" 21 | - "cuda:2" 22 | - "cuda:3" 23 | - "cuda:4" 24 | - "cuda:5" 25 | code_format: "pi_verifiable" 26 | success_threshold: 0 27 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 28 | vllm_port: 8000 29 | max_new_tokens: 10240 30 | vllm_config: 31 | gpu_memory_utilization: 0.9 32 | max_model_len: 12288 33 | max_num_batched_tokens: 98304 34 | generation_num_copies: 1 35 | temperature: 0.5 36 | repetition_penalty: 0.0 37 | max_tokens: 12288 38 | tensor_parallel_size: 1 39 | 40 | -------------------------------------------------------------------------------- /experiments/u4_7B_gram/eval_pi_train_50_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_train" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | max_num_samples: 252 10 | prompt_field: "prompt" 11 | label_field: "verification_info" 12 | debug: false 13 | reward_server_host: "0.0.0.0" 14 | reward_server_port: 5000 15 | response_begin_str: null 16 | code_start_str: "```python\n" 17 | code_end_str: "```" 18 | device_groups: 19 | - "cuda:0" 20 | - "cuda:1" 21 | - "cuda:2" 22 | - "cuda:3" 23 | - "cuda:4" 24 | - "cuda:5" 25 | code_format: "pi_verifiable" 26 | success_threshold: 0 27 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 28 | vllm_port: 8000 29 | max_new_tokens: 10240 30 | vllm_config: 31 | gpu_memory_utilization: 0.9 32 | max_model_len: 12288 33 | max_num_batched_tokens: 98304 34 | generation_num_copies: 1 35 | temperature: 0.5 36 | repetition_penalty: 0.0 37 | max_tokens: 12288 38 | tensor_parallel_size: 1 39 | 40 | -------------------------------------------------------------------------------- /experiments/u4_7B_gram/eval_pi_train_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_train" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | max_num_samples: 252 10 | prompt_field: "prompt" 11 | label_field: "verification_info" 12 | debug: false 13 | reward_server_host: "0.0.0.0" 14 | reward_server_port: 5000 15 | response_begin_str: null 16 | code_start_str: "```python\n" 17 | code_end_str: "```" 18 | device_groups: 19 | - "cuda:0" 20 | - "cuda:1" 21 | - "cuda:2" 22 | - "cuda:3" 23 | - "cuda:4" 24 | - "cuda:5" 25 | code_format: "pi_verifiable" 26 | success_threshold: 0 27 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 28 | vllm_port: 8000 29 | max_new_tokens: 10240 30 | vllm_config: 31 | gpu_memory_utilization: 0.9 32 | max_model_len: 12288 33 | max_num_batched_tokens: 98304 34 | generation_num_copies: 1 35 | temperature: 0.5 36 | repetition_penalty: 0.0 37 | max_tokens: 12288 38 | tensor_parallel_size: 1 39 | 40 | -------------------------------------------------------------------------------- /experiments/u5_7B_gram/eval_pi_train_50_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_train" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | max_num_samples: 252 10 | prompt_field: "prompt" 11 | label_field: "verification_info" 12 | debug: false 13 | reward_server_host: "0.0.0.0" 14 | reward_server_port: 5000 15 | response_begin_str: null 16 | code_start_str: "```python\n" 17 | code_end_str: "```" 18 | device_groups: 19 | - "cuda:0" 20 | - "cuda:1" 21 | - "cuda:2" 22 | - "cuda:3" 23 | - "cuda:4" 24 | - "cuda:5" 25 | code_format: "pi_verifiable" 26 | success_threshold: 0 27 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 28 | vllm_port: 8000 29 | max_new_tokens: 10240 30 | vllm_config: 31 | gpu_memory_utilization: 0.9 32 | max_model_len: 12288 33 | max_num_batched_tokens: 98304 34 | generation_num_copies: 1 35 | temperature: 0.5 36 | repetition_penalty: 0.0 37 | max_tokens: 12288 38 | tensor_parallel_size: 1 39 | 40 | -------------------------------------------------------------------------------- /experiments/u5_7B_gram/eval_pi_train_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_train" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | max_num_samples: 252 10 | prompt_field: "prompt" 11 | label_field: "verification_info" 12 | debug: false 13 | reward_server_host: "0.0.0.0" 14 | reward_server_port: 5000 15 | response_begin_str: null 16 | code_start_str: "```python\n" 17 | code_end_str: "```" 18 | device_groups: 19 | - "cuda:0" 20 | - "cuda:1" 21 | - "cuda:2" 22 | - "cuda:3" 23 | - "cuda:4" 24 | - "cuda:5" 25 | code_format: "pi_verifiable" 26 | success_threshold: 0 27 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 28 | vllm_port: 8000 29 | max_new_tokens: 10240 30 | vllm_config: 31 | gpu_memory_utilization: 0.9 32 | max_model_len: 12288 33 | max_num_batched_tokens: 98304 34 | generation_num_copies: 1 35 | temperature: 0.5 36 | repetition_penalty: 0.0 37 | max_tokens: 12288 38 | tensor_parallel_size: 1 39 | 40 | -------------------------------------------------------------------------------- /experiments/u6_7B_form/eval_pi_train_50_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_train" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | max_num_samples: 252 10 | prompt_field: "prompt" 11 | label_field: "verification_info" 12 | debug: false 13 | reward_server_host: "0.0.0.0" 14 | reward_server_port: 5000 15 | response_begin_str: null 16 | code_start_str: "```python\n" 17 | code_end_str: "```" 18 | device_groups: 19 | - "cuda:0" 20 | - "cuda:1" 21 | - "cuda:2" 22 | - "cuda:3" 23 | - "cuda:4" 24 | - "cuda:5" 25 | code_format: "pi_verifiable" 26 | success_threshold: 0 27 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 28 | vllm_port: 8000 29 | max_new_tokens: 10240 30 | vllm_config: 31 | gpu_memory_utilization: 0.9 32 | max_model_len: 12288 33 | max_num_batched_tokens: 98304 34 | generation_num_copies: 1 35 | temperature: 0.5 36 | repetition_penalty: 0.0 37 | max_tokens: 12288 38 | tensor_parallel_size: 1 39 | 40 | -------------------------------------------------------------------------------- /experiments/u6_7B_form/eval_pi_train_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_train" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | max_num_samples: 252 10 | prompt_field: "prompt" 11 | label_field: "verification_info" 12 | debug: false 13 | reward_server_host: "0.0.0.0" 14 | reward_server_port: 5000 15 | response_begin_str: null 16 | code_start_str: "```python\n" 17 | code_end_str: "```" 18 | device_groups: 19 | - "cuda:0" 20 | - "cuda:1" 21 | - "cuda:2" 22 | - "cuda:3" 23 | - "cuda:4" 24 | - "cuda:5" 25 | code_format: "pi_verifiable" 26 | success_threshold: 0 27 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 28 | vllm_port: 8000 29 | max_new_tokens: 10240 30 | vllm_config: 31 | gpu_memory_utilization: 0.9 32 | max_model_len: 12288 33 | max_num_batched_tokens: 98304 34 | generation_num_copies: 1 35 | temperature: 0.5 36 | repetition_penalty: 0.0 37 | max_tokens: 12288 38 | tensor_parallel_size: 1 39 | 40 | -------------------------------------------------------------------------------- /experiments/u6_7B_fs/eval_pi_train_50_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_train" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | max_num_samples: 252 10 | prompt_field: "prompt" 11 | label_field: "verification_info" 12 | debug: false 13 | reward_server_host: "0.0.0.0" 14 | reward_server_port: 5000 15 | response_begin_str: null 16 | code_start_str: "```python\n" 17 | code_end_str: "```" 18 | device_groups: 19 | - "cuda:0" 20 | - "cuda:1" 21 | - "cuda:2" 22 | - "cuda:3" 23 | - "cuda:4" 24 | - "cuda:5" 25 | code_format: "pi_verifiable" 26 | success_threshold: 0 27 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 28 | vllm_port: 8000 29 | max_new_tokens: 10240 30 | vllm_config: 31 | gpu_memory_utilization: 0.9 32 | max_model_len: 12288 33 | max_num_batched_tokens: 98304 34 | generation_num_copies: 1 35 | temperature: 0.5 36 | repetition_penalty: 0.0 37 | max_tokens: 12288 38 | tensor_parallel_size: 1 39 | 40 | -------------------------------------------------------------------------------- /experiments/u6_7B_gram/eval_pi_train_50_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_train" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | max_num_samples: 252 10 | prompt_field: "prompt" 11 | label_field: "verification_info" 12 | debug: false 13 | reward_server_host: "0.0.0.0" 14 | reward_server_port: 5000 15 | response_begin_str: null 16 | code_start_str: "```python\n" 17 | code_end_str: "```" 18 | device_groups: 19 | - "cuda:0" 20 | - "cuda:1" 21 | - "cuda:2" 22 | - "cuda:3" 23 | - "cuda:4" 24 | - "cuda:5" 25 | code_format: "pi_verifiable" 26 | success_threshold: 0 27 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 28 | vllm_port: 8000 29 | max_new_tokens: 10240 30 | vllm_config: 31 | gpu_memory_utilization: 0.9 32 | max_model_len: 12288 33 | max_num_batched_tokens: 98304 34 | generation_num_copies: 1 35 | temperature: 0.5 36 | repetition_penalty: 0.0 37 | max_tokens: 12288 38 | tensor_parallel_size: 1 39 | 40 | -------------------------------------------------------------------------------- /experiments/u6_7B_gram/eval_pi_train_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_train" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | max_num_samples: 252 10 | prompt_field: "prompt" 11 | label_field: "verification_info" 12 | debug: false 13 | reward_server_host: "0.0.0.0" 14 | reward_server_port: 5000 15 | response_begin_str: null 16 | code_start_str: "```python\n" 17 | code_end_str: "```" 18 | device_groups: 19 | - "cuda:0" 20 | - "cuda:1" 21 | - "cuda:2" 22 | - "cuda:3" 23 | - "cuda:4" 24 | - "cuda:5" 25 | code_format: "pi_verifiable" 26 | success_threshold: 0 27 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 28 | vllm_port: 8000 29 | max_new_tokens: 10240 30 | vllm_config: 31 | gpu_memory_utilization: 0.9 32 | max_model_len: 12288 33 | max_num_batched_tokens: 98304 34 | generation_num_copies: 1 35 | temperature: 0.5 36 | repetition_penalty: 0.0 37 | max_tokens: 12288 38 | tensor_parallel_size: 1 39 | 40 | -------------------------------------------------------------------------------- /experiments/u6_7B_sft/eval_pi_train_50_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_train" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | max_num_samples: 252 10 | prompt_field: "prompt" 11 | label_field: "verification_info" 12 | debug: false 13 | reward_server_host: "0.0.0.0" 14 | reward_server_port: 5000 15 | response_begin_str: null 16 | code_start_str: "```python\n" 17 | code_end_str: "```" 18 | device_groups: 19 | - "cuda:0" 20 | - "cuda:1" 21 | - "cuda:2" 22 | - "cuda:3" 23 | - "cuda:4" 24 | - "cuda:5" 25 | code_format: "pi_verifiable" 26 | success_threshold: 0 27 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 28 | vllm_port: 8000 29 | max_new_tokens: 10240 30 | vllm_config: 31 | gpu_memory_utilization: 0.9 32 | max_model_len: 12288 33 | max_num_batched_tokens: 98304 34 | generation_num_copies: 1 35 | temperature: 0.5 36 | repetition_penalty: 0.0 37 | max_tokens: 12288 38 | tensor_parallel_size: 1 39 | 40 | -------------------------------------------------------------------------------- /experiments/u6_colo/eval_pi_train_50_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_train" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | max_num_samples: 252 10 | prompt_field: "prompt" 11 | label_field: "verification_info" 12 | debug: false 13 | reward_server_host: "0.0.0.0" 14 | reward_server_port: 5000 15 | response_begin_str: null 16 | code_start_str: "```python\n" 17 | code_end_str: "```" 18 | device_groups: 19 | - "cuda:0" 20 | - "cuda:1" 21 | - "cuda:2" 22 | - "cuda:3" 23 | - "cuda:4" 24 | - "cuda:5" 25 | code_format: "pi_verifiable" 26 | success_threshold: 0 27 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 28 | vllm_port: 8000 29 | max_new_tokens: 10240 30 | vllm_config: 31 | gpu_memory_utilization: 0.9 32 | max_model_len: 12288 33 | max_num_batched_tokens: 98304 34 | generation_num_copies: 1 35 | temperature: 0.5 36 | repetition_penalty: 0.0 37 | max_tokens: 12288 38 | tensor_parallel_size: 1 39 | 40 | -------------------------------------------------------------------------------- /experiments/u2_7B_base_sft/eval_pi_train_50_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_train" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | max_num_samples: 252 10 | prompt_field: "prompt" 11 | label_field: "verification_info" 12 | debug: false 13 | reward_server_host: "0.0.0.0" 14 | reward_server_port: 5000 15 | response_begin_str: null 16 | code_start_str: "```python\n" 17 | code_end_str: "```" 18 | device_groups: 19 | - "cuda:0" 20 | - "cuda:1" 21 | - "cuda:2" 22 | - "cuda:3" 23 | - "cuda:4" 24 | - "cuda:5" 25 | code_format: "pi_verifiable" 26 | success_threshold: 0 27 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 28 | vllm_port: 8000 29 | max_new_tokens: 10240 30 | vllm_config: 31 | gpu_memory_utilization: 0.9 32 | max_model_len: 12288 33 | max_num_batched_tokens: 98304 34 | generation_num_copies: 1 35 | temperature: 0.5 36 | repetition_penalty: 0.0 37 | max_tokens: 12288 38 | tensor_parallel_size: 1 39 | 40 | -------------------------------------------------------------------------------- /experiments/u2_7B_few_shot/eval_pi_train_50_config.yaml: -------------------------------------------------------------------------------- 1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated" 2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward" 3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_train" 4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged" 5 | batch_size: 10 6 | max_tokens: 1024 7 | temperature: 0.7 8 | apply_chat_template: false 9 | max_num_samples: 252 10 | prompt_field: "prompt" 11 | label_field: "verification_info" 12 | debug: false 13 | reward_server_host: "0.0.0.0" 14 | reward_server_port: 5000 15 | response_begin_str: null 16 | code_start_str: "```python\n" 17 | code_end_str: "```" 18 | device_groups: 19 | - "cuda:0" 20 | - "cuda:1" 21 | - "cuda:2" 22 | - "cuda:3" 23 | - "cuda:4" 24 | - "cuda:5" 25 | code_format: "pi_verifiable" 26 | success_threshold: 0 27 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" 28 | vllm_port: 8000 29 | max_new_tokens: 10240 30 | vllm_config: 31 | gpu_memory_utilization: 0.9 32 | max_model_len: 12288 33 | max_num_batched_tokens: 98304 34 | generation_num_copies: 1 35 | temperature: 0.5 36 | repetition_penalty: 0.0 37 | max_tokens: 12288 38 | tensor_parallel_size: 1 39 | 40 | -------------------------------------------------------------------------------- /unit_tests_server/convert_hf_to_verl_dataset.py: -------------------------------------------------------------------------------- 1 | # save as preprocess_mydata.py 2 | from datasets import load_from_disk 3 | import argparse, os, re 4 | 5 | def parse_args(): 6 | parser = argparse.ArgumentParser(description="Convert HuggingFace dataset to veRL format") 7 | parser.add_argument("--dataset_path", required=True, help="Path to the HuggingFace dataset") 8 | parser.add_argument("--out_dir", required=True, help="Output directory for the converted dataset") 9 | return parser.parse_args() 10 | 11 | def extract_dataset_name(dataset_path): 12 | """ 13 | Extract the dataset name from the dataset path. 14 | If the path is a file path, extract the filename without extension. 15 | If the path is a HuggingFace dataset ID, use the last part of the ID. 16 | """ 17 | # Check if it's a file path 18 | if os.path.exists(dataset_path): 19 | return os.path.splitext(os.path.basename(dataset_path))[0] 20 | 21 | # Otherwise, assume it's a HuggingFace dataset ID (e.g., "myorg/mydata") 22 | return dataset_path.split('/')[-1] 23 | 24 | def main(): 25 | args = parse_args() 26 | 27 | # Extract dataset name from the path 28 | dataset_name = extract_dataset_name(args.dataset_path) 29 | 30 | # Load the dataset 31 | ds = load_from_disk(args.dataset_path) 32 | 33 | # Create output directory 34 | os.makedirs(args.out_dir, exist_ok=True) 35 | 36 | # Process and save the dataset splits 37 | 38 | output_path = f"{args.out_dir}.parquet" 39 | 40 | ds.map( 41 | make_map_fn(args), 42 | with_indices=True 43 | ).to_parquet(output_path) 44 | 45 | print(f"Dataset converted and saved to {output_path}") 46 | 47 | 48 | # 1️⃣ optional helper – extract your “ground truth” 49 | def extract_gt(example): 50 | return example["label"] # adapt to your dataset 51 | 52 | # 2️⃣ build the mapping function veRL asks for 53 | def make_map_fn(args): 54 | DATA_SOURCE = extract_dataset_name(args.dataset_path) 55 | def _map(example, idx): 56 | prompt_txt = example["prompt"].strip() 57 | # if your template needs a system msg, add it here 58 | row = { 59 | "data_source": DATA_SOURCE, 60 | "prompt": [{"role": "user", "content": prompt_txt}], 61 | "ability": "open_ended", 62 | "reward_model": { 63 | "style": "rule", 64 | "ground_truth": example['verification_info'] 65 | }, 66 | "extra_info": {"gold_standard_solution": example['gold_standard_solution']} 67 | } 68 | return row 69 | return _map 70 | 71 | if __name__ == "__main__": 72 | main() 73 | -------------------------------------------------------------------------------- /check_grammar.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import xgrammar as xgr 3 | from transformers import AutoTokenizer, AutoConfig 4 | import pdb 5 | 6 | def parse_arguments(): 7 | """ 8 | Parse command line arguments for grammar checking. 9 | 10 | Returns: 11 | argparse.Namespace: The parsed command line arguments. 12 | """ 13 | parser = argparse.ArgumentParser(description="Check if a string conforms to a specified grammar.") 14 | parser.add_argument("--grammar_file", type=str, help="Path to the file containing the grammar definition.") 15 | parser.add_argument("--string_file", type=str, help="Path to the file containing the string to check against the grammar.") 16 | parser.add_argument("--model_id", type=str, default="meta-llama/Llama-3.2-1B-Instruct", help="Model ID to use for tokenization.") 17 | 18 | return parser.parse_args() 19 | 20 | if __name__ == "__main__": 21 | args = parse_arguments() 22 | 23 | with open(args.string_file, 'r') as f: 24 | candidate = f.read().strip() 25 | print(f"candidate: {candidate}") 26 | 27 | with open(args.grammar_file, 'r') as f: 28 | ebnf_source = f.read() 29 | 30 | print(f'Grammar: \n{ebnf_source}') 31 | 32 | tokenizer = AutoTokenizer.from_pretrained(args.model_id) 33 | full_vocab_size = AutoConfig.from_pretrained(args.model_id).vocab_size 34 | tok_info = xgr.TokenizerInfo.from_huggingface(tokenizer, 35 | vocab_size=full_vocab_size) 36 | 37 | compiler = xgr.GrammarCompiler(tok_info) #, allow_isolated_special=True) 38 | # three common options ⬇︎ 39 | # compiled = compiler.compile_builtin_json_grammar() # built-in JSON 40 | # compiled = compiler.compile_json_schema(schema_str) # JSON schema 41 | compiled = compiler.compile_grammar(ebnf_source) # any EBNF text 42 | 43 | matcher = xgr.GrammarMatcher(compiled) 44 | 45 | token_ids = tokenizer.encode(candidate, add_special_tokens=False) 46 | 47 | ok = True 48 | good_up_to = [] 49 | failed_token = None 50 | for tok in token_ids: 51 | if not matcher.accept_token(tok): # returns False = mismatch 52 | ok = False 53 | failed_token = tok 54 | break 55 | else: 56 | good_up_to.append(tok) 57 | 58 | # the string matches the grammar **iff** 59 | # • every token was accepted, and 60 | # • the matcher reached a terminal state. 61 | ok = ok and matcher.is_terminated() # end must be legal 62 | print("valid!" if ok else "invalid!") 63 | if not ok: 64 | print("parsed up to:") 65 | print(tokenizer.decode(good_up_to)) 66 | print(f"failed token: {tokenizer.decode(failed_token)} {failed_token}") 67 | 68 | print(f"ok: {ok}") -------------------------------------------------------------------------------- /unit_tests_server/check_grammar.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import xgrammar as xgr 3 | from transformers import AutoTokenizer, AutoConfig 4 | import pdb 5 | 6 | def parse_arguments(): 7 | """ 8 | Parse command line arguments for grammar checking. 9 | 10 | Returns: 11 | argparse.Namespace: The parsed command line arguments. 12 | """ 13 | parser = argparse.ArgumentParser(description="Check if a string conforms to a specified grammar.") 14 | parser.add_argument("--grammar_file", type=str, help="Path to the file containing the grammar definition.") 15 | parser.add_argument("--string_file", type=str, help="Path to the file containing the string to check against the grammar.") 16 | parser.add_argument("--model_id", type=str, default="meta-llama/Llama-3.2-1B-Instruct", help="Model ID to use for tokenization.") 17 | 18 | return parser.parse_args() 19 | 20 | if __name__ == "__main__": 21 | args = parse_arguments() 22 | 23 | with open(args.string_file, 'r') as f: 24 | candidate = f.read().strip() 25 | print(f"candidate: {candidate}") 26 | 27 | with open(args.grammar_file, 'r') as f: 28 | ebnf_source = f.read() 29 | 30 | print(f'Grammar: \n{ebnf_source}') 31 | 32 | tokenizer = AutoTokenizer.from_pretrained(args.model_id) 33 | full_vocab_size = AutoConfig.from_pretrained(args.model_id).vocab_size 34 | tok_info = xgr.TokenizerInfo.from_huggingface(tokenizer, 35 | vocab_size=full_vocab_size) 36 | 37 | compiler = xgr.GrammarCompiler(tok_info) #, allow_isolated_special=True) 38 | # three common options ⬇︎ 39 | # compiled = compiler.compile_builtin_json_grammar() # built-in JSON 40 | # compiled = compiler.compile_json_schema(schema_str) # JSON schema 41 | compiled = compiler.compile_grammar(ebnf_source) # any EBNF text 42 | 43 | matcher = xgr.GrammarMatcher(compiled) 44 | 45 | token_ids = tokenizer.encode(candidate, add_special_tokens=False) 46 | 47 | ok = True 48 | good_up_to = [] 49 | failed_token = None 50 | for tok in token_ids: 51 | if not matcher.accept_token(tok): # returns False = mismatch 52 | ok = False 53 | failed_token = tok 54 | break 55 | else: 56 | good_up_to.append(tok) 57 | 58 | # the string matches the grammar **iff** 59 | # • every token was accepted, and 60 | # • the matcher reached a terminal state. 61 | ok = ok and matcher.is_terminated() # end must be legal 62 | print("valid!" if ok else "invalid!") 63 | if not ok: 64 | print("parsed up to:") 65 | print(tokenizer.decode(good_up_to)) 66 | print(f"failed token: {tokenizer.decode(failed_token)} {failed_token}") 67 | pdb.set_trace() 68 | 69 | print(f"ok: {ok}") -------------------------------------------------------------------------------- /unit_tests_server/test_unit_test.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | import pdb 4 | from datasets import load_from_disk 5 | from transformers import AutoTokenizer 6 | import concurrent.futures 7 | 8 | def make_api_call(payload, url="http://0.0.0.0:5000/get_reward"): 9 | headers = { 10 | "Content-Type": "application/json" 11 | } 12 | response = requests.post(url, data=json.dumps(payload), headers=headers) 13 | 14 | if response.status_code == 200: 15 | response_json = response.json() 16 | print(f"API call successful. Response: {response_json}") 17 | return True 18 | else: 19 | print(f"API call failed with status code {response.status_code}. Response:") 20 | print(response.text) 21 | return False 22 | 23 | if __name__ == "__main__": 24 | 25 | n_queries = 1 26 | n_calls = 2 27 | n_calls_parallel = 8 28 | 29 | correct_code = r""" 30 | a = int(input()) 31 | b = int(input()) 32 | print(a + b) 33 | """ 34 | 35 | incorrect_code = r""" 36 | a = int(input()) 37 | b = int(input()) 38 | print(a * b) 39 | """ 40 | 41 | unit_test = r"""1 42 | 2 43 | """ 44 | 45 | unit_test_correct_output = r"""3 46 | """ 47 | 48 | unit_test_incorrect_output = r"""2 49 | """ 50 | 51 | pairs = [ 52 | (correct_code, unit_test, unit_test_correct_output), # 1 53 | # (incorrect_code, unit_test, unit_test_correct_output), # 0 54 | # (incorrect_code, unit_test, unit_test_incorrect_output), # -1 55 | ] 56 | 57 | labels = "{'test_cases': [{'type': 'stdin_stdout', 'input': '1\\n2', 'output': '3'}], " + "'reference_implementation': " + '"' + correct_code.replace("\n", "\\n") + '"}' 58 | 59 | for code, unit_test, unit_test_output in pairs: 60 | # code = code.replace("\n", "\\n") 61 | unit_tests = f"{unit_test}{unit_test_output}" 62 | messages = [ 63 | {"role": "system", "content": "You are a helpful assistant."}, 64 | {"role": "user", "content": "solve plz"}, 65 | {"role": "assistant", "content": f"```python\n{code}```\n{unit_tests}"} 66 | ] 67 | 68 | tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Coder-0.5B-Instruct") 69 | inputs = tokenizer.apply_chat_template(messages, tokenize=False) 70 | 71 | payload = { 72 | "query": [inputs], 73 | "prompts": ["asdf"], 74 | "labels": [labels], 75 | "step": 2, 76 | "reward_config": { 77 | "n_steps": 200, 78 | "target_precision": 0.01, 79 | "warmup_steps": 10000, 80 | "max_time": 100000, 81 | "code_format":"pi_verifiable", 82 | "thinking_length_weight": 0.0 83 | } 84 | } 85 | 86 | url = "http://0.0.0.0:5432/get_reward" 87 | make_api_call(payload, url=url) 88 | 89 | -------------------------------------------------------------------------------- /experiments/u2_7B/run.sh: -------------------------------------------------------------------------------- 1 | exp_name=u2_7B 2 | 3 | ray start --head --port=6380 --node-ip-address=127.0.0.1 --temp-dir=/home/rosmine/data2/rl/${exp_name} 4 | 5 | DATA_DIR=/home/rosmine/data2/rl/${exp_name} 6 | 7 | echo "ray start" 8 | 9 | OPENRLHF_DIR=/home/rosmine/projects/OpenRLHF 10 | 11 | ray job submit --address="http://127.0.0.1:8265" \ 12 | --runtime-env-json='{"working_dir": "'${OPENRLHF_DIR}'"}' \ 13 | -- python3 -m openrlhf.cli.train_ppo_ray \ 14 | --ref_num_nodes 1 \ 15 | --ref_num_gpus_per_node 4 \ 16 | --actor_num_nodes 1 \ 17 | --actor_num_gpus_per_node 4 \ 18 | --vllm_num_engines 1 \ 19 | --vllm_tensor_parallel_size 2 \ 20 | --vllm_gpu_memory_utilization 0.95 \ 21 | --save_path ${DATA_DIR}/save_path \ 22 | --ckpt_path ${DATA_DIR}/checkpoint \ 23 | --pretrain Qwen/Qwen2.5-Coder-7B-Instruct \ 24 | --save_steps 1 \ 25 | --logging_steps 1 \ 26 | --eval_steps -1 \ 27 | --micro_train_batch_size 2 \ 28 | --train_batch_size 16 \ 29 | --micro_rollout_batch_size 4 \ 30 | --rollout_batch_size 32 \ 31 | --n_samples_per_prompt 16 \ 32 | --rm_batch_size 1 \ 33 | --max_epochs 2 \ 34 | --prompt_max_len 2048 \ 35 | --generate_max_len 2048 \ 36 | --zero_stage 2 \ 37 | --bf16 \ 38 | --actor_learning_rate 3e-6 \ 39 | --gradient_checkpointing \ 40 | --init_kl_coef 0.0 \ 41 | --apply_chat_template \ 42 | --prompt_data /home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_with_unit_tests_v2_train \ 43 | --input_key prompt \ 44 | --label_key verification_info \ 45 | --max_samples 100000 \ 46 | --normalize_reward \ 47 | --load_checkpoint \ 48 | --advantage_estimator group_norm \ 49 | --remote_rm_url http://localhost:5432/get_reward \ 50 | --use_tensorboard logs/${exp_name} \ 51 | --vllm_sync_backend nccl \ 52 | --enforce_eager \ 53 | --save_hf_ckpt \ 54 | --disable_ds_ckpt \ 55 | --RM_CONFIG_save_threshold 60 \ 56 | --RM_CONFIG_n_steps 100 \ 57 | --RM_CONFIG_save_file /home/rosmine/data2/ray/${exp_name}/optimizer_code_output \ 58 | --RM_CONFIG_aux_decay 1 \ 59 | --RM_CONFIG_n_trials 10 \ 60 | --RM_CONFIG_dfg_complexity_weight 0.5 \ 61 | --RM_CONFIG_code_bleu_weight 0.5 \ 62 | --RM_CONFIG_timeout_seconds 120 \ 63 | --RM_CONFIG_aux_coef 0.0 \ 64 | --RM_CONFIG_aux_coef_warmup 0 \ 65 | --RM_CONFIG_useful_line_ratio_coef 0.0 \ 66 | --RM_CONFIG_batch_size 128 \ 67 | --RM_CONFIG_use_input_format_reward true \ 68 | --RM_CONFIG_code_format "pi_verifiable" \ 69 | --RM_CONFIG_max_time 10.0 \ 70 | --RM_CONFIG_thinking_length_weight 0 \ 71 | --entropy_coef 0.0 \ 72 | --num_episodes 1 \ 73 | --outlier_reward_filter -10.0 \ 74 | --ring_attn_size 4 \ 75 | --lora_alpha 128 \ 76 | --lora_rank 64 \ 77 | --packing_samples \ 78 | --flash_attn \ 79 | --adam_offload \ 80 | --lr_warmup_ratio 0.001 \ 81 | #--colocate_all_models \ 82 | #--vllm_enable_sleep \ 83 | #--deepspeed_enable_sleep \ 84 | #--deepspeed_enable_super_sleep 85 | 86 | ray stop 87 | # --lora_alpha 128 \ 88 | # --lora_rank 64 89 | #--pretrain $(pwd)/checkpoint/7B_sft_v2 \ 90 | #--flash_attn \ 91 | #--pretrain deepseek-ai/DeepSeek-R1-Distill-Qwen-14B \ 92 | #--vllm_sync_with_ray \ 93 | #--pretrain Qwen/Qwen2.5-Coder-0.5B-Instruct \ 94 | -------------------------------------------------------------------------------- /experiments/u2_7B_3e-5/run.sh: -------------------------------------------------------------------------------- 1 | exp_name=u2_7B_3e-5 2 | 3 | ray start --head --port=6380 --node-ip-address=127.0.0.1 --temp-dir=/home/rosmine/data2/rl/${exp_name} 4 | 5 | DATA_DIR=/home/rosmine/data2/rl/${exp_name} 6 | 7 | echo "ray start" 8 | 9 | OPENRLHF_DIR=/home/rosmine/projects/OpenRLHF 10 | 11 | ray job submit --address="http://127.0.0.1:8265" \ 12 | --runtime-env-json='{"working_dir": "'${OPENRLHF_DIR}'"}' \ 13 | -- python3 -m openrlhf.cli.train_ppo_ray \ 14 | --ref_num_nodes 1 \ 15 | --ref_num_gpus_per_node 4 \ 16 | --actor_num_nodes 1 \ 17 | --actor_num_gpus_per_node 4 \ 18 | --vllm_num_engines 1 \ 19 | --vllm_tensor_parallel_size 2 \ 20 | --vllm_gpu_memory_utilization 0.95 \ 21 | --save_path ${DATA_DIR}/save_path \ 22 | --ckpt_path ${DATA_DIR}/checkpoint \ 23 | --pretrain Qwen/Qwen2.5-Coder-7B-Instruct \ 24 | --save_steps 1 \ 25 | --logging_steps 1 \ 26 | --eval_steps -1 \ 27 | --micro_train_batch_size 2 \ 28 | --train_batch_size 16 \ 29 | --micro_rollout_batch_size 4 \ 30 | --rollout_batch_size 32 \ 31 | --n_samples_per_prompt 16 \ 32 | --rm_batch_size 1 \ 33 | --max_epochs 2 \ 34 | --prompt_max_len 2048 \ 35 | --generate_max_len 2048 \ 36 | --zero_stage 2 \ 37 | --bf16 \ 38 | --actor_learning_rate 3e-5 \ 39 | --gradient_checkpointing \ 40 | --init_kl_coef 0.0 \ 41 | --apply_chat_template \ 42 | --prompt_data /home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_with_unit_tests_v2_train \ 43 | --input_key prompt \ 44 | --label_key verification_info \ 45 | --max_samples 100000 \ 46 | --normalize_reward \ 47 | --load_checkpoint \ 48 | --advantage_estimator group_norm \ 49 | --remote_rm_url http://localhost:5432/get_reward \ 50 | --use_tensorboard logs/${exp_name} \ 51 | --vllm_sync_backend nccl \ 52 | --enforce_eager \ 53 | --save_hf_ckpt \ 54 | --disable_ds_ckpt \ 55 | --RM_CONFIG_save_threshold 60 \ 56 | --RM_CONFIG_n_steps 100 \ 57 | --RM_CONFIG_save_file /home/rosmine/data2/ray/${exp_name}/optimizer_code_output \ 58 | --RM_CONFIG_aux_decay 1 \ 59 | --RM_CONFIG_n_trials 10 \ 60 | --RM_CONFIG_dfg_complexity_weight 0.5 \ 61 | --RM_CONFIG_code_bleu_weight 0.5 \ 62 | --RM_CONFIG_timeout_seconds 120 \ 63 | --RM_CONFIG_aux_coef 0.0 \ 64 | --RM_CONFIG_aux_coef_warmup 0 \ 65 | --RM_CONFIG_useful_line_ratio_coef 0.0 \ 66 | --RM_CONFIG_batch_size 128 \ 67 | --RM_CONFIG_use_input_format_reward true \ 68 | --RM_CONFIG_code_format "pi_verifiable" \ 69 | --RM_CONFIG_max_time 10.0 \ 70 | --RM_CONFIG_thinking_length_weight 0 \ 71 | --entropy_coef 0.0 \ 72 | --num_episodes 1 \ 73 | --outlier_reward_filter -10.0 \ 74 | --ring_attn_size 4 \ 75 | --lora_alpha 128 \ 76 | --lora_rank 64 \ 77 | --packing_samples \ 78 | --flash_attn \ 79 | --adam_offload \ 80 | --lr_warmup_ratio 0.001 \ 81 | #--colocate_all_models \ 82 | #--vllm_enable_sleep \ 83 | #--deepspeed_enable_sleep \ 84 | #--deepspeed_enable_super_sleep 85 | 86 | ray stop 87 | # --lora_alpha 128 \ 88 | # --lora_rank 64 89 | #--pretrain $(pwd)/checkpoint/7B_sft_v2 \ 90 | #--flash_attn \ 91 | #--pretrain deepseek-ai/DeepSeek-R1-Distill-Qwen-14B \ 92 | #--vllm_sync_with_ray \ 93 | #--pretrain Qwen/Qwen2.5-Coder-0.5B-Instruct \ 94 | -------------------------------------------------------------------------------- /experiments/u2_7B_base_sft/run.sh: -------------------------------------------------------------------------------- 1 | exp_name=u2_7B_bs 2 | 3 | ray start --head --port=6380 --node-ip-address=127.0.0.1 --temp-dir=/home/rosmine/data2/rl/${exp_name} 4 | 5 | DATA_DIR=/home/rosmine/data2/rl/${exp_name} 6 | 7 | echo "ray start" 8 | 9 | OPENRLHF_DIR=/home/rosmine/projects/OpenRLHF 10 | 11 | ray job submit --address="http://127.0.0.1:8265" \ 12 | --runtime-env-json='{"working_dir": "'${OPENRLHF_DIR}'"}' \ 13 | -- python3 -m openrlhf.cli.train_ppo_ray \ 14 | --ref_num_nodes 1 \ 15 | --ref_num_gpus_per_node 4 \ 16 | --actor_num_nodes 1 \ 17 | --actor_num_gpus_per_node 4 \ 18 | --vllm_num_engines 1 \ 19 | --vllm_tensor_parallel_size 2 \ 20 | --vllm_gpu_memory_utilization 0.95 \ 21 | --save_path ${DATA_DIR}/save_path \ 22 | --ckpt_path ${DATA_DIR}/checkpoint \ 23 | --pretrain /home/rosmine/data2/rl/u3_sft_1e-5_vllm/ \ 24 | --save_steps 1 \ 25 | --logging_steps 1 \ 26 | --eval_steps -1 \ 27 | --micro_train_batch_size 2 \ 28 | --train_batch_size 16 \ 29 | --micro_rollout_batch_size 4 \ 30 | --rollout_batch_size 32 \ 31 | --n_samples_per_prompt 16 \ 32 | --rm_batch_size 1 \ 33 | --max_epochs 2 \ 34 | --prompt_max_len 2048 \ 35 | --generate_max_len 2048 \ 36 | --zero_stage 2 \ 37 | --bf16 \ 38 | --actor_learning_rate 1e-5 \ 39 | --gradient_checkpointing \ 40 | --init_kl_coef 0.0 \ 41 | --apply_chat_template \ 42 | --prompt_data /home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_with_unit_tests_v2_train \ 43 | --input_key prompt \ 44 | --label_key verification_info \ 45 | --max_samples 100000 \ 46 | --normalize_reward \ 47 | --load_checkpoint \ 48 | --advantage_estimator group_norm \ 49 | --remote_rm_url http://localhost:5432/get_reward \ 50 | --use_tensorboard logs/${exp_name} \ 51 | --vllm_sync_backend nccl \ 52 | --enforce_eager \ 53 | --save_hf_ckpt \ 54 | --disable_ds_ckpt \ 55 | --RM_CONFIG_save_threshold 60 \ 56 | --RM_CONFIG_n_steps 100 \ 57 | --RM_CONFIG_save_file /home/rosmine/data2/ray/${exp_name}/optimizer_code_output \ 58 | --RM_CONFIG_aux_decay 1 \ 59 | --RM_CONFIG_n_trials 10 \ 60 | --RM_CONFIG_dfg_complexity_weight 0.5 \ 61 | --RM_CONFIG_code_bleu_weight 0.5 \ 62 | --RM_CONFIG_timeout_seconds 120 \ 63 | --RM_CONFIG_aux_coef 0.0 \ 64 | --RM_CONFIG_aux_coef_warmup 0 \ 65 | --RM_CONFIG_useful_line_ratio_coef 0.0 \ 66 | --RM_CONFIG_batch_size 128 \ 67 | --RM_CONFIG_use_input_format_reward true \ 68 | --RM_CONFIG_code_format "pi_verifiable" \ 69 | --RM_CONFIG_max_time 10.0 \ 70 | --RM_CONFIG_thinking_length_weight 0 \ 71 | --entropy_coef 0.0 \ 72 | --num_episodes 1 \ 73 | --outlier_reward_filter -10.0 \ 74 | --ring_attn_size 4 \ 75 | --lora_alpha 128 \ 76 | --lora_rank 64 \ 77 | --packing_samples \ 78 | --flash_attn \ 79 | --adam_offload \ 80 | --lr_warmup_ratio 0.001 \ 81 | #--colocate_all_models \ 82 | #--vllm_enable_sleep \ 83 | #--deepspeed_enable_sleep \ 84 | #--deepspeed_enable_super_sleep 85 | 86 | ray stop 87 | # --lora_alpha 128 \ 88 | # --lora_rank 64 89 | #--pretrain $(pwd)/checkpoint/7B_sft_v2 \ 90 | #--flash_attn \ 91 | #--pretrain deepseek-ai/DeepSeek-R1-Distill-Qwen-14B \ 92 | #--vllm_sync_with_ray \ 93 | #--pretrain Qwen/Qwen2.5-Coder-0.5B-Instruct \ 94 | -------------------------------------------------------------------------------- /experiments/u2_7B_few_shot/run.sh: -------------------------------------------------------------------------------- 1 | exp_name=u2_7B_fs 2 | 3 | ray start --head --port=6380 --node-ip-address=127.0.0.1 --temp-dir=/home/rosmine/data2/rl/${exp_name} 4 | 5 | DATA_DIR=/home/rosmine/data2/rl/${exp_name} 6 | 7 | echo "ray start" 8 | 9 | OPENRLHF_DIR=/home/rosmine/projects/OpenRLHF 10 | 11 | ray job submit --address="http://127.0.0.1:8265" \ 12 | --runtime-env-json='{"working_dir": "'${OPENRLHF_DIR}'"}' \ 13 | -- python3 -m openrlhf.cli.train_ppo_ray \ 14 | --ref_num_nodes 1 \ 15 | --ref_num_gpus_per_node 4 \ 16 | --actor_num_nodes 1 \ 17 | --actor_num_gpus_per_node 4 \ 18 | --vllm_num_engines 1 \ 19 | --vllm_tensor_parallel_size 2 \ 20 | --vllm_gpu_memory_utilization 0.95 \ 21 | --save_path ${DATA_DIR}/save_path \ 22 | --ckpt_path ${DATA_DIR}/checkpoint \ 23 | --pretrain Qwen/Qwen2.5-Coder-7B-Instruct \ 24 | --save_steps 1 \ 25 | --logging_steps 1 \ 26 | --eval_steps -1 \ 27 | --micro_train_batch_size 2 \ 28 | --train_batch_size 16 \ 29 | --micro_rollout_batch_size 4 \ 30 | --rollout_batch_size 32 \ 31 | --n_samples_per_prompt 16 \ 32 | --rm_batch_size 1 \ 33 | --max_epochs 2 \ 34 | --prompt_max_len 2048 \ 35 | --generate_max_len 2048 \ 36 | --zero_stage 2 \ 37 | --bf16 \ 38 | --actor_learning_rate 3e-5 \ 39 | --gradient_checkpointing \ 40 | --init_kl_coef 0.0 \ 41 | --apply_chat_template \ 42 | --prompt_data /home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_with_unit_tests_v2_train_few_shot \ 43 | --input_key prompt \ 44 | --label_key verification_info \ 45 | --max_samples 100000 \ 46 | --normalize_reward \ 47 | --load_checkpoint \ 48 | --advantage_estimator group_norm \ 49 | --remote_rm_url http://localhost:5432/get_reward \ 50 | --use_tensorboard logs/${exp_name} \ 51 | --vllm_sync_backend nccl \ 52 | --enforce_eager \ 53 | --save_hf_ckpt \ 54 | --disable_ds_ckpt \ 55 | --RM_CONFIG_save_threshold 60 \ 56 | --RM_CONFIG_n_steps 100 \ 57 | --RM_CONFIG_save_file /home/rosmine/data2/ray/${exp_name}/optimizer_code_output \ 58 | --RM_CONFIG_aux_decay 1 \ 59 | --RM_CONFIG_n_trials 10 \ 60 | --RM_CONFIG_dfg_complexity_weight 0.5 \ 61 | --RM_CONFIG_code_bleu_weight 0.5 \ 62 | --RM_CONFIG_timeout_seconds 120 \ 63 | --RM_CONFIG_aux_coef 0.0 \ 64 | --RM_CONFIG_aux_coef_warmup 0 \ 65 | --RM_CONFIG_useful_line_ratio_coef 0.0 \ 66 | --RM_CONFIG_batch_size 128 \ 67 | --RM_CONFIG_use_input_format_reward true \ 68 | --RM_CONFIG_code_format "pi_verifiable" \ 69 | --RM_CONFIG_max_time 10.0 \ 70 | --RM_CONFIG_thinking_length_weight 0 \ 71 | --entropy_coef 0.0 \ 72 | --num_episodes 1 \ 73 | --outlier_reward_filter -10.0 \ 74 | --ring_attn_size 4 \ 75 | --lora_alpha 128 \ 76 | --lora_rank 64 \ 77 | --packing_samples \ 78 | --flash_attn \ 79 | --adam_offload \ 80 | --lr_warmup_ratio 0.001 \ 81 | #--colocate_all_models \ 82 | #--vllm_enable_sleep \ 83 | #--deepspeed_enable_sleep \ 84 | #--deepspeed_enable_super_sleep 85 | 86 | ray stop 87 | # --lora_alpha 128 \ 88 | # --lora_rank 64 89 | #--pretrain $(pwd)/checkpoint/7B_sft_v2 \ 90 | #--flash_attn \ 91 | #--pretrain deepseek-ai/DeepSeek-R1-Distill-Qwen-14B \ 92 | #--vllm_sync_with_ray \ 93 | #--pretrain Qwen/Qwen2.5-Coder-0.5B-Instruct \ 94 | -------------------------------------------------------------------------------- /experiments/u1_7B/run.sh: -------------------------------------------------------------------------------- 1 | exp_name=u1_7B 2 | 3 | ray start --head --port=6380 --node-ip-address=127.0.0.1 --temp-dir=/home/rosmine/data2/rl/${exp_name} 4 | 5 | DATA_DIR=/home/rosmine/data2/rl/${exp_name} 6 | 7 | echo "ray start" 8 | 9 | OPENRLHF_DIR=/home/rosmine/projects/OpenRLHF 10 | 11 | ray job submit --address="http://127.0.0.1:8265" \ 12 | --runtime-env-json='{"working_dir": "'${OPENRLHF_DIR}'"}' \ 13 | -- python3 -m openrlhf.cli.train_ppo_ray \ 14 | --ref_num_nodes 1 \ 15 | --ref_num_gpus_per_node 4 \ 16 | --actor_num_nodes 1 \ 17 | --actor_num_gpus_per_node 4 \ 18 | --vllm_num_engines 1 \ 19 | --vllm_tensor_parallel_size 2 \ 20 | --vllm_gpu_memory_utilization 0.95 \ 21 | --save_path ${DATA_DIR}/save_path \ 22 | --ckpt_path ${DATA_DIR}/checkpoint \ 23 | --pretrain Qwen/Qwen2.5-Coder-7B-Instruct \ 24 | --save_steps 1 \ 25 | --logging_steps 1 \ 26 | --eval_steps -1 \ 27 | --micro_train_batch_size 2 \ 28 | --train_batch_size 16 \ 29 | --micro_rollout_batch_size 4 \ 30 | --rollout_batch_size 4 \ 31 | --n_samples_per_prompt 16 \ 32 | --rm_batch_size 1 \ 33 | --max_epochs 2 \ 34 | --prompt_max_len 2048 \ 35 | --generate_max_len 2048 \ 36 | --zero_stage 2 \ 37 | --bf16 \ 38 | --actor_learning_rate 4e-6 \ 39 | --gradient_checkpointing \ 40 | --init_kl_coef 0.0 \ 41 | --prompt_data /home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_train_with_unit_tests_v1 \ 42 | --input_key prompt \ 43 | --label_key verification_info \ 44 | --max_samples 100000 \ 45 | --normalize_reward \ 46 | --load_checkpoint \ 47 | --advantage_estimator group_norm \ 48 | --remote_rm_url http://localhost:5432/get_reward \ 49 | --use_tensorboard logs/${exp_name} \ 50 | --vllm_sync_backend nccl \ 51 | --enforce_eager \ 52 | --save_hf_ckpt \ 53 | --disable_ds_ckpt \ 54 | --RM_CONFIG_save_threshold 60 \ 55 | --RM_CONFIG_n_steps 100 \ 56 | --RM_CONFIG_save_file /home/rosmine/data2/ray/${exp_name}/optimizer_code_output \ 57 | --RM_CONFIG_aux_decay 1 \ 58 | --RM_CONFIG_n_trials 10 \ 59 | --RM_CONFIG_dfg_complexity_weight 0.5 \ 60 | --RM_CONFIG_code_bleu_weight 0.5 \ 61 | --RM_CONFIG_timeout_seconds 120 \ 62 | --RM_CONFIG_aux_coef 0.0 \ 63 | --RM_CONFIG_aux_coef_warmup 0 \ 64 | --RM_CONFIG_useful_line_ratio_coef 0.0 \ 65 | --RM_CONFIG_batch_size 128 \ 66 | --RM_CONFIG_use_input_format_reward true \ 67 | --RM_CONFIG_code_format "pi_verifiable" \ 68 | --RM_CONFIG_max_time 10.0 \ 69 | --RM_CONFIG_response_begin_str "Please make sure the code is efficient." \ 70 | --RM_CONFIG_thinking_length_weight 0 \ 71 | --entropy_coef 0.0 \ 72 | --num_episodes 1 \ 73 | --outlier_reward_filter -10.0 \ 74 | --ring_attn_size 4 \ 75 | --lora_alpha 128 \ 76 | --lora_rank 64 \ 77 | --packing_samples \ 78 | --flash_attn \ 79 | --adam_offload \ 80 | --lr_warmup_ratio 0.001 \ 81 | #--colocate_all_models \ 82 | #--vllm_enable_sleep \ 83 | #--deepspeed_enable_sleep \ 84 | #--deepspeed_enable_super_sleep 85 | 86 | ray stop 87 | # --lora_alpha 128 \ 88 | # --lora_rank 64 89 | #--pretrain $(pwd)/checkpoint/7B_sft_v2 \ 90 | #--flash_attn \ 91 | #--pretrain deepseek-ai/DeepSeek-R1-Distill-Qwen-14B \ 92 | #--vllm_sync_with_ray \ 93 | #--pretrain Qwen/Qwen2.5-Coder-0.5B-Instruct \ 94 | -------------------------------------------------------------------------------- /experiments/u1_14B/run.sh: -------------------------------------------------------------------------------- 1 | exp_name=u1_14B 2 | 3 | ray start --head --port=6380 --node-ip-address=127.0.0.1 --temp-dir=/home/rosmine/data2/rl/${exp_name} 4 | 5 | DATA_DIR=/home/rosmine/data2/rl/${exp_name} 6 | 7 | echo "ray start" 8 | 9 | OPENRLHF_DIR=/home/rosmine/projects/OpenRLHF 10 | 11 | ray job submit --address="http://127.0.0.1:8265" \ 12 | --runtime-env-json='{"working_dir": "'${OPENRLHF_DIR}'"}' \ 13 | -- python3 -m openrlhf.cli.train_ppo_ray \ 14 | --ref_num_nodes 1 \ 15 | --ref_num_gpus_per_node 4 \ 16 | --actor_num_nodes 1 \ 17 | --actor_num_gpus_per_node 4 \ 18 | --vllm_num_engines 2 \ 19 | --vllm_tensor_parallel_size 1 \ 20 | --vllm_gpu_memory_utilization 0.90 \ 21 | --save_path ${DATA_DIR}/save_path \ 22 | --ckpt_path ${DATA_DIR}/checkpoint \ 23 | --pretrain Qwen/Qwen2.5-Coder-14B-Instruct \ 24 | --save_steps 10 \ 25 | --logging_steps 1 \ 26 | --eval_steps -1 \ 27 | --micro_train_batch_size 1 \ 28 | --train_batch_size 48 \ 29 | --micro_rollout_batch_size 2 \ 30 | --rollout_batch_size 32 \ 31 | --n_samples_per_prompt 16 \ 32 | --rm_batch_size 1 \ 33 | --max_epochs 2 \ 34 | --prompt_max_len 2048 \ 35 | --generate_max_len 2048 \ 36 | --zero_stage 2 \ 37 | --bf16 \ 38 | --actor_learning_rate 3e-6 \ 39 | --gradient_checkpointing \ 40 | --init_kl_coef 0.0 \ 41 | --prompt_data /home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_train_with_unit_tests_v1 \ 42 | --input_key prompt \ 43 | --label_key verification_info \ 44 | --max_samples 100000 \ 45 | --normalize_reward \ 46 | --load_checkpoint \ 47 | --advantage_estimator group_norm \ 48 | --remote_rm_url http://localhost:5432/get_reward \ 49 | --use_tensorboard logs/${exp_name} \ 50 | --vllm_sync_backend nccl \ 51 | --enforce_eager \ 52 | --save_hf_ckpt \ 53 | --disable_ds_ckpt \ 54 | --RM_CONFIG_save_threshold 60 \ 55 | --RM_CONFIG_n_steps 100 \ 56 | --RM_CONFIG_save_file /home/rosmine/data2/ray/${exp_name}/optimizer_code_output \ 57 | --RM_CONFIG_aux_decay 1 \ 58 | --RM_CONFIG_n_trials 10 \ 59 | --RM_CONFIG_dfg_complexity_weight 0.5 \ 60 | --RM_CONFIG_code_bleu_weight 0.5 \ 61 | --RM_CONFIG_timeout_seconds 120 \ 62 | --RM_CONFIG_aux_coef 0.0 \ 63 | --RM_CONFIG_aux_coef_warmup 0 \ 64 | --RM_CONFIG_useful_line_ratio_coef 0.0 \ 65 | --RM_CONFIG_batch_size 128 \ 66 | --RM_CONFIG_use_input_format_reward true \ 67 | --RM_CONFIG_code_format "pi_verifiable" \ 68 | --RM_CONFIG_max_time 10.0 \ 69 | --RM_CONFIG_response_begin_str "Please make sure the code is efficient." \ 70 | --RM_CONFIG_thinking_length_weight 0 \ 71 | --entropy_coef 0.0 \ 72 | --num_episodes 1 \ 73 | --outlier_reward_filter -10.0 \ 74 | --ring_attn_size 4 \ 75 | --lora_alpha 128 \ 76 | --lora_rank 64 \ 77 | --packing_samples \ 78 | --flash_attn \ 79 | --adam_offload \ 80 | --lr_warmup_ratio 0.001 \ 81 | #--colocate_all_models \ 82 | #--vllm_enable_sleep \ 83 | #--deepspeed_enable_sleep \ 84 | #--deepspeed_enable_super_sleep 85 | 86 | ray stop 87 | # --lora_alpha 128 \ 88 | # --lora_rank 64 89 | #--pretrain $(pwd)/checkpoint/7B_sft_v2 \ 90 | #--flash_attn \ 91 | #--pretrain deepseek-ai/DeepSeek-R1-Distill-Qwen-14B \ 92 | #--vllm_sync_with_ray \ 93 | #--pretrain Qwen/Qwen2.5-Coder-0.5B-Instruct \ 94 | -------------------------------------------------------------------------------- /experiments/u6_7B_sft/run.sh: -------------------------------------------------------------------------------- 1 | exp_name=u6_7B_sft 2 | 3 | ray start --head --port=6380 --node-ip-address=127.0.0.1 --temp-dir=/home/rosmine/data2/rl/${exp_name} 4 | 5 | DATA_DIR=/home/rosmine/data2/rl/${exp_name} 6 | 7 | echo "ray start" 8 | 9 | OPENRLHF_DIR=/home/rosmine/projects/OpenRLHF 10 | 11 | ray job submit --address="http://127.0.0.1:8265" \ 12 | --runtime-env-json='{"working_dir": "'${OPENRLHF_DIR}'"}' \ 13 | -- python3 -m openrlhf.cli.train_ppo_ray \ 14 | --ref_num_nodes 1 \ 15 | --ref_num_gpus_per_node 4 \ 16 | --actor_num_nodes 1 \ 17 | --actor_num_gpus_per_node 4 \ 18 | --vllm_num_engines 1 \ 19 | --vllm_tensor_parallel_size 2 \ 20 | --vllm_gpu_memory_utilization 0.95 \ 21 | --save_path ${DATA_DIR}/save_path \ 22 | --ckpt_path ${DATA_DIR}/checkpoint \ 23 | --pretrain /home/rosmine/data2/rl/u3_sft_1e-5_vllm/ \ 24 | --save_steps 1 \ 25 | --logging_steps 1 \ 26 | --eval_steps -1 \ 27 | --micro_train_batch_size 2 \ 28 | --train_batch_size 16 \ 29 | --micro_rollout_batch_size 4 \ 30 | --rollout_batch_size 32 \ 31 | --n_samples_per_prompt 16 \ 32 | --rm_batch_size 1 \ 33 | --max_epochs 2 \ 34 | --prompt_max_len 2048 \ 35 | --generate_max_len 2048 \ 36 | --zero_stage 2 \ 37 | --bf16 \ 38 | --actor_learning_rate 1e-5 \ 39 | --gradient_checkpointing \ 40 | --init_kl_coef 0.0 \ 41 | --apply_chat_template \ 42 | --prompt_data /home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_with_unit_tests_v2_train_with_reference_no_examples \ 43 | --input_key prompt \ 44 | --label_key verification_info \ 45 | --max_samples 100000 \ 46 | --normalize_reward \ 47 | --load_checkpoint \ 48 | --advantage_estimator group_norm \ 49 | --remote_rm_url http://localhost:5432/get_reward \ 50 | --use_tensorboard logs/${exp_name} \ 51 | --vllm_sync_backend nccl \ 52 | --enforce_eager \ 53 | --save_hf_ckpt \ 54 | --disable_ds_ckpt \ 55 | --early_stopping_steps 62 \ 56 | --RM_CONFIG_save_threshold 60 \ 57 | --RM_CONFIG_n_steps 100 \ 58 | --RM_CONFIG_save_file /home/rosmine/data2/ray/${exp_name}/optimizer_code_output \ 59 | --RM_CONFIG_aux_decay 1 \ 60 | --RM_CONFIG_n_trials 10 \ 61 | --RM_CONFIG_dfg_complexity_weight 0.5 \ 62 | --RM_CONFIG_code_bleu_weight 0.5 \ 63 | --RM_CONFIG_timeout_seconds 120 \ 64 | --RM_CONFIG_aux_coef 0.0 \ 65 | --RM_CONFIG_aux_coef_warmup 0 \ 66 | --RM_CONFIG_useful_line_ratio_coef 0.0 \ 67 | --RM_CONFIG_batch_size 128 \ 68 | --RM_CONFIG_use_input_format_reward true \ 69 | --RM_CONFIG_code_format "pi_verifiable" \ 70 | --RM_CONFIG_max_time 10.0 \ 71 | --RM_CONFIG_thinking_length_weight 0 \ 72 | --entropy_coef 0.0 \ 73 | --num_episodes 1 \ 74 | --outlier_reward_filter -10.0 \ 75 | --ring_attn_size 4 \ 76 | --lora_alpha 128 \ 77 | --lora_rank 64 \ 78 | --packing_samples \ 79 | --flash_attn \ 80 | --adam_offload \ 81 | --lr_warmup_ratio 0.001 \ 82 | #--colocate_all_models \ 83 | #--vllm_enable_sleep \ 84 | #--deepspeed_enable_sleep \ 85 | #--deepspeed_enable_super_sleep 86 | 87 | ray stop 88 | # --lora_alpha 128 \ 89 | # --lora_rank 64 90 | #--pretrain $(pwd)/checkpoint/7B_sft_v2 \ 91 | #--flash_attn \ 92 | #--pretrain deepseek-ai/DeepSeek-R1-Distill-Qwen-14B \ 93 | #--vllm_sync_with_ray \ 94 | #--pretrain Qwen/Qwen2.5-Coder-0.5B-Instruct \ 95 | -------------------------------------------------------------------------------- /experiments/u6_7B_fs/run.sh: -------------------------------------------------------------------------------- 1 | exp_name=u6_7B_fs 2 | 3 | ray start --head --port=6380 --node-ip-address=127.0.0.1 --temp-dir=/home/rosmine/data2/rl/${exp_name} 4 | 5 | DATA_DIR=/home/rosmine/data2/rl/${exp_name} 6 | 7 | echo "ray start" 8 | 9 | OPENRLHF_DIR=/home/rosmine/projects/OpenRLHF 10 | 11 | ray job submit --address="http://127.0.0.1:8265" \ 12 | --runtime-env-json='{"working_dir": "'${OPENRLHF_DIR}'"}' \ 13 | -- python3 -m openrlhf.cli.train_ppo_ray \ 14 | --ref_num_nodes 1 \ 15 | --ref_num_gpus_per_node 4 \ 16 | --actor_num_nodes 1 \ 17 | --actor_num_gpus_per_node 4 \ 18 | --vllm_num_engines 1 \ 19 | --vllm_tensor_parallel_size 2 \ 20 | --vllm_gpu_memory_utilization 0.95 \ 21 | --save_path ${DATA_DIR}/save_path \ 22 | --ckpt_path ${DATA_DIR}/checkpoint \ 23 | --pretrain Qwen/Qwen2.5-Coder-7B-Instruct \ 24 | --save_steps 1 \ 25 | --logging_steps 1 \ 26 | --eval_steps -1 \ 27 | --micro_train_batch_size 2 \ 28 | --train_batch_size 16 \ 29 | --micro_rollout_batch_size 4 \ 30 | --rollout_batch_size 32 \ 31 | --n_samples_per_prompt 16 \ 32 | --rm_batch_size 1 \ 33 | --max_epochs 2 \ 34 | --prompt_max_len 2048 \ 35 | --generate_max_len 2048 \ 36 | --zero_stage 2 \ 37 | --bf16 \ 38 | --actor_learning_rate 1e-5 \ 39 | --gradient_checkpointing \ 40 | --init_kl_coef 0.0 \ 41 | --apply_chat_template \ 42 | --prompt_data /home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_with_unit_tests_v2_train_with_reference_no_examples_few_shot_with_reference \ 43 | --input_key prompt \ 44 | --label_key verification_info \ 45 | --max_samples 100000 \ 46 | --normalize_reward \ 47 | --load_checkpoint \ 48 | --advantage_estimator group_norm \ 49 | --remote_rm_url http://localhost:5432/get_reward \ 50 | --use_tensorboard logs/${exp_name} \ 51 | --vllm_sync_backend nccl \ 52 | --enforce_eager \ 53 | --save_hf_ckpt \ 54 | --disable_ds_ckpt \ 55 | --early_stopping_steps 62 \ 56 | --RM_CONFIG_save_threshold 60 \ 57 | --RM_CONFIG_n_steps 100 \ 58 | --RM_CONFIG_save_file /home/rosmine/data2/ray/${exp_name}/optimizer_code_output \ 59 | --RM_CONFIG_aux_decay 1 \ 60 | --RM_CONFIG_n_trials 10 \ 61 | --RM_CONFIG_dfg_complexity_weight 0.5 \ 62 | --RM_CONFIG_code_bleu_weight 0.5 \ 63 | --RM_CONFIG_timeout_seconds 120 \ 64 | --RM_CONFIG_aux_coef 0.0 \ 65 | --RM_CONFIG_aux_coef_warmup 0 \ 66 | --RM_CONFIG_useful_line_ratio_coef 0.0 \ 67 | --RM_CONFIG_batch_size 128 \ 68 | --RM_CONFIG_use_input_format_reward true \ 69 | --RM_CONFIG_code_format "pi_verifiable" \ 70 | --RM_CONFIG_max_time 10.0 \ 71 | --RM_CONFIG_thinking_length_weight 0 \ 72 | --entropy_coef 0.0 \ 73 | --num_episodes 1 \ 74 | --outlier_reward_filter -10.0 \ 75 | --ring_attn_size 4 \ 76 | --lora_alpha 128 \ 77 | --lora_rank 64 \ 78 | --packing_samples \ 79 | --flash_attn \ 80 | --adam_offload \ 81 | --lr_warmup_ratio 0.001 \ 82 | #--colocate_all_models \ 83 | #--vllm_enable_sleep \ 84 | #--deepspeed_enable_sleep \ 85 | #--deepspeed_enable_super_sleep 86 | 87 | ray stop 88 | # --lora_alpha 128 \ 89 | # --lora_rank 64 90 | #--pretrain $(pwd)/checkpoint/7B_sft_v2 \ 91 | #--flash_attn \ 92 | #--pretrain deepseek-ai/DeepSeek-R1-Distill-Qwen-14B \ 93 | #--vllm_sync_with_ray \ 94 | #--pretrain Qwen/Qwen2.5-Coder-0.5B-Instruct \ 95 | -------------------------------------------------------------------------------- /experiments/u2_7B_gram/run.sh: -------------------------------------------------------------------------------- 1 | exp_name=u2_7B_gram 2 | 3 | ray start --head --port=6380 --node-ip-address=127.0.0.1 --temp-dir=/home/rosmine/data2/rl/${exp_name} 4 | 5 | DATA_DIR=/home/rosmine/data2/rl/${exp_name} 6 | 7 | echo "ray start" 8 | 9 | OPENRLHF_DIR=/home/rosmine/projects/OpenRLHF 10 | 11 | ray job submit --address="http://127.0.0.1:8265" \ 12 | --runtime-env-json='{"working_dir": "'${OPENRLHF_DIR}'"}' \ 13 | -- python3 -m openrlhf.cli.train_ppo_ray \ 14 | --ref_num_nodes 1 \ 15 | --ref_num_gpus_per_node 4 \ 16 | --actor_num_nodes 1 \ 17 | --actor_num_gpus_per_node 4 \ 18 | --vllm_num_engines 1 \ 19 | --vllm_tensor_parallel_size 2 \ 20 | --vllm_gpu_memory_utilization 0.95 \ 21 | --save_path ${DATA_DIR}/save_path \ 22 | --ckpt_path ${DATA_DIR}/checkpoint \ 23 | --pretrain Qwen/Qwen2.5-Coder-7B-Instruct \ 24 | --save_steps 1 \ 25 | --logging_steps 1 \ 26 | --eval_steps -1 \ 27 | --micro_train_batch_size 2 \ 28 | --train_batch_size 16 \ 29 | --micro_rollout_batch_size 4 \ 30 | --rollout_batch_size 32 \ 31 | --n_samples_per_prompt 16 \ 32 | --rm_batch_size 1 \ 33 | --max_epochs 2 \ 34 | --prompt_max_len 2048 \ 35 | --generate_max_len 2048 \ 36 | --zero_stage 2 \ 37 | --bf16 \ 38 | --actor_learning_rate 1e-5 \ 39 | --gradient_checkpointing \ 40 | --init_kl_coef 0.0 \ 41 | --apply_chat_template \ 42 | --prompt_data /home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_with_unit_tests_v2_train \ 43 | --input_key prompt \ 44 | --label_key verification_info \ 45 | --max_samples 100000 \ 46 | --normalize_reward \ 47 | --load_checkpoint \ 48 | --advantage_estimator group_norm \ 49 | --remote_rm_url http://localhost:5432/get_reward \ 50 | --use_tensorboard logs/${exp_name} \ 51 | --vllm_sync_backend nccl \ 52 | --enforce_eager \ 53 | --save_hf_ckpt \ 54 | --disable_ds_ckpt \ 55 | --RM_CONFIG_save_threshold 60 \ 56 | --RM_CONFIG_n_steps 100 \ 57 | --RM_CONFIG_save_file /home/rosmine/data2/ray/${exp_name}/optimizer_code_output \ 58 | --RM_CONFIG_aux_decay 1 \ 59 | --RM_CONFIG_n_trials 10 \ 60 | --RM_CONFIG_dfg_complexity_weight 0.5 \ 61 | --RM_CONFIG_code_bleu_weight 0.5 \ 62 | --RM_CONFIG_timeout_seconds 120 \ 63 | --RM_CONFIG_aux_coef 0.0 \ 64 | --RM_CONFIG_aux_coef_warmup 0 \ 65 | --RM_CONFIG_useful_line_ratio_coef 0.0 \ 66 | --RM_CONFIG_batch_size 128 \ 67 | --RM_CONFIG_use_input_format_reward true \ 68 | --RM_CONFIG_code_format "pi_verifiable" \ 69 | --RM_CONFIG_max_time 10.0 \ 70 | --RM_CONFIG_thinking_length_weight 0 \ 71 | --entropy_coef 0.0 \ 72 | --num_episodes 1 \ 73 | --outlier_reward_filter -10.0 \ 74 | --ring_attn_size 4 \ 75 | --lora_alpha 128 \ 76 | --lora_rank 64 \ 77 | --packing_samples \ 78 | --flash_attn \ 79 | --adam_offload \ 80 | --lr_warmup_ratio 0.001 \ 81 | --grammar_file /home/rosmine/projects/rl_codegen/experiments/unit_test/grammar/unit_test_grammar.txt 82 | #--colocate_all_models \ 83 | #--vllm_enable_sleep \ 84 | #--deepspeed_enable_sleep \ 85 | #--deepspeed_enable_super_sleep 86 | 87 | ray stop 88 | # --lora_alpha 128 \ 89 | # --lora_rank 64 90 | #--pretrain $(pwd)/checkpoint/7B_sft_v2 \ 91 | #--flash_attn \ 92 | #--pretrain deepseek-ai/DeepSeek-R1-Distill-Qwen-14B \ 93 | #--vllm_sync_with_ray \ 94 | #--pretrain Qwen/Qwen2.5-Coder-0.5B-Instruct \ 95 | -------------------------------------------------------------------------------- /experiments/u4_7B_gram/run.sh: -------------------------------------------------------------------------------- 1 | exp_name=u4_7B_gram 2 | 3 | ray start --head --port=6380 --node-ip-address=127.0.0.1 --temp-dir=/home/rosmine/data2/rl/${exp_name} 4 | 5 | DATA_DIR=/home/rosmine/data2/rl/${exp_name} 6 | 7 | echo "ray start" 8 | 9 | OPENRLHF_DIR=/home/rosmine/projects/OpenRLHF 10 | 11 | ray job submit --address="http://127.0.0.1:8265" \ 12 | --runtime-env-json='{"working_dir": "'${OPENRLHF_DIR}'"}' \ 13 | -- python3 -m openrlhf.cli.train_ppo_ray \ 14 | --ref_num_nodes 1 \ 15 | --ref_num_gpus_per_node 4 \ 16 | --actor_num_nodes 1 \ 17 | --actor_num_gpus_per_node 4 \ 18 | --vllm_num_engines 1 \ 19 | --vllm_tensor_parallel_size 2 \ 20 | --vllm_gpu_memory_utilization 0.95 \ 21 | --save_path ${DATA_DIR}/save_path \ 22 | --ckpt_path ${DATA_DIR}/checkpoint \ 23 | --pretrain Qwen/Qwen2.5-Coder-7B-Instruct \ 24 | --save_steps 1 \ 25 | --logging_steps 1 \ 26 | --eval_steps -1 \ 27 | --micro_train_batch_size 2 \ 28 | --train_batch_size 16 \ 29 | --micro_rollout_batch_size 4 \ 30 | --rollout_batch_size 32 \ 31 | --n_samples_per_prompt 16 \ 32 | --rm_batch_size 1 \ 33 | --max_epochs 2 \ 34 | --prompt_max_len 2048 \ 35 | --generate_max_len 2048 \ 36 | --zero_stage 2 \ 37 | --bf16 \ 38 | --actor_learning_rate 1e-5 \ 39 | --gradient_checkpointing \ 40 | --init_kl_coef 0.0 \ 41 | --apply_chat_template \ 42 | --prompt_data /home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_with_unit_tests_v2_train_with_reference \ 43 | --input_key prompt \ 44 | --label_key verification_info \ 45 | --max_samples 100000 \ 46 | --normalize_reward \ 47 | --load_checkpoint \ 48 | --advantage_estimator group_norm \ 49 | --remote_rm_url http://localhost:5432/get_reward \ 50 | --use_tensorboard logs/${exp_name} \ 51 | --vllm_sync_backend nccl \ 52 | --enforce_eager \ 53 | --save_hf_ckpt \ 54 | --disable_ds_ckpt \ 55 | --RM_CONFIG_save_threshold 60 \ 56 | --RM_CONFIG_n_steps 100 \ 57 | --RM_CONFIG_save_file /home/rosmine/data2/ray/${exp_name}/optimizer_code_output \ 58 | --RM_CONFIG_aux_decay 1 \ 59 | --RM_CONFIG_n_trials 10 \ 60 | --RM_CONFIG_dfg_complexity_weight 0.5 \ 61 | --RM_CONFIG_code_bleu_weight 0.5 \ 62 | --RM_CONFIG_timeout_seconds 120 \ 63 | --RM_CONFIG_aux_coef 0.0 \ 64 | --RM_CONFIG_aux_coef_warmup 0 \ 65 | --RM_CONFIG_useful_line_ratio_coef 0.0 \ 66 | --RM_CONFIG_batch_size 128 \ 67 | --RM_CONFIG_use_input_format_reward true \ 68 | --RM_CONFIG_code_format "pi_verifiable" \ 69 | --RM_CONFIG_max_time 10.0 \ 70 | --RM_CONFIG_thinking_length_weight 0 \ 71 | --entropy_coef 0.0 \ 72 | --num_episodes 1 \ 73 | --outlier_reward_filter -10.0 \ 74 | --ring_attn_size 4 \ 75 | --lora_alpha 128 \ 76 | --lora_rank 64 \ 77 | --packing_samples \ 78 | --flash_attn \ 79 | --adam_offload \ 80 | --lr_warmup_ratio 0.001 \ 81 | --grammar_file /home/rosmine/projects/rl_codegen/experiments/unit_test/grammar/unit_test_grammar.txt 82 | #--colocate_all_models \ 83 | #--vllm_enable_sleep \ 84 | #--deepspeed_enable_sleep \ 85 | #--deepspeed_enable_super_sleep 86 | 87 | ray stop 88 | # --lora_alpha 128 \ 89 | # --lora_rank 64 90 | #--pretrain $(pwd)/checkpoint/7B_sft_v2 \ 91 | #--flash_attn \ 92 | #--pretrain deepseek-ai/DeepSeek-R1-Distill-Qwen-14B \ 93 | #--vllm_sync_with_ray \ 94 | #--pretrain Qwen/Qwen2.5-Coder-0.5B-Instruct \ 95 | -------------------------------------------------------------------------------- /experiments/u5_7B_gram/run.sh: -------------------------------------------------------------------------------- 1 | exp_name=u5_7B_gram 2 | 3 | ray start --head --port=6380 --node-ip-address=127.0.0.1 --temp-dir=/home/rosmine/data2/rl/${exp_name} 4 | 5 | DATA_DIR=/home/rosmine/data2/rl/${exp_name} 6 | 7 | echo "ray start" 8 | 9 | OPENRLHF_DIR=/home/rosmine/projects/OpenRLHF 10 | 11 | ray job submit --address="http://127.0.0.1:8265" \ 12 | --runtime-env-json='{"working_dir": "'${OPENRLHF_DIR}'"}' \ 13 | -- python3 -m openrlhf.cli.train_ppo_ray \ 14 | --ref_num_nodes 1 \ 15 | --ref_num_gpus_per_node 4 \ 16 | --actor_num_nodes 1 \ 17 | --actor_num_gpus_per_node 4 \ 18 | --vllm_num_engines 1 \ 19 | --vllm_tensor_parallel_size 2 \ 20 | --vllm_gpu_memory_utilization 0.95 \ 21 | --save_path ${DATA_DIR}/save_path \ 22 | --ckpt_path ${DATA_DIR}/checkpoint \ 23 | --pretrain Qwen/Qwen2.5-Coder-7B-Instruct \ 24 | --save_steps 1 \ 25 | --logging_steps 1 \ 26 | --eval_steps -1 \ 27 | --micro_train_batch_size 2 \ 28 | --train_batch_size 16 \ 29 | --micro_rollout_batch_size 4 \ 30 | --rollout_batch_size 32 \ 31 | --n_samples_per_prompt 16 \ 32 | --rm_batch_size 1 \ 33 | --max_epochs 2 \ 34 | --prompt_max_len 2048 \ 35 | --generate_max_len 2048 \ 36 | --zero_stage 2 \ 37 | --bf16 \ 38 | --actor_learning_rate 1e-5 \ 39 | --gradient_checkpointing \ 40 | --init_kl_coef 0.0 \ 41 | --apply_chat_template \ 42 | --prompt_data /home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_with_unit_tests_v2_train_with_reference \ 43 | --input_key prompt \ 44 | --label_key verification_info \ 45 | --max_samples 100000 \ 46 | --normalize_reward \ 47 | --load_checkpoint \ 48 | --advantage_estimator group_norm \ 49 | --remote_rm_url http://localhost:5432/get_reward \ 50 | --use_tensorboard logs/${exp_name} \ 51 | --vllm_sync_backend nccl \ 52 | --enforce_eager \ 53 | --save_hf_ckpt \ 54 | --disable_ds_ckpt \ 55 | --RM_CONFIG_save_threshold 60 \ 56 | --RM_CONFIG_n_steps 100 \ 57 | --RM_CONFIG_save_file /home/rosmine/data2/ray/${exp_name}/optimizer_code_output \ 58 | --RM_CONFIG_aux_decay 1 \ 59 | --RM_CONFIG_n_trials 10 \ 60 | --RM_CONFIG_dfg_complexity_weight 0.5 \ 61 | --RM_CONFIG_code_bleu_weight 0.5 \ 62 | --RM_CONFIG_timeout_seconds 120 \ 63 | --RM_CONFIG_aux_coef 0.0 \ 64 | --RM_CONFIG_aux_coef_warmup 0 \ 65 | --RM_CONFIG_useful_line_ratio_coef 0.0 \ 66 | --RM_CONFIG_batch_size 128 \ 67 | --RM_CONFIG_use_input_format_reward true \ 68 | --RM_CONFIG_code_format "pi_verifiable" \ 69 | --RM_CONFIG_max_time 10.0 \ 70 | --RM_CONFIG_thinking_length_weight 0 \ 71 | --entropy_coef 0.0 \ 72 | --num_episodes 1 \ 73 | --outlier_reward_filter -10.0 \ 74 | --ring_attn_size 4 \ 75 | --lora_alpha 128 \ 76 | --lora_rank 64 \ 77 | --packing_samples \ 78 | --flash_attn \ 79 | --adam_offload \ 80 | --lr_warmup_ratio 0.001 \ 81 | --grammar_file /home/rosmine/projects/rl_codegen/experiments/unit_test/grammar/unit_test_grammar.txt 82 | #--colocate_all_models \ 83 | #--vllm_enable_sleep \ 84 | #--deepspeed_enable_sleep \ 85 | #--deepspeed_enable_super_sleep 86 | 87 | ray stop 88 | # --lora_alpha 128 \ 89 | # --lora_rank 64 90 | #--pretrain $(pwd)/checkpoint/7B_sft_v2 \ 91 | #--flash_attn \ 92 | #--pretrain deepseek-ai/DeepSeek-R1-Distill-Qwen-14B \ 93 | #--vllm_sync_with_ray \ 94 | #--pretrain Qwen/Qwen2.5-Coder-0.5B-Instruct \ 95 | -------------------------------------------------------------------------------- /experiments/u6_colo/run.sh: -------------------------------------------------------------------------------- 1 | exp_name=u6_colo 2 | 3 | ray start --head --port=6380 --node-ip-address=127.0.0.1 --temp-dir=/home/rosmine/data2/rl/${exp_name} 4 | 5 | DATA_DIR=/home/rosmine/data2/rl/${exp_name} 6 | 7 | echo "ray start" 8 | 9 | OPENRLHF_DIR=/home/rosmine/projects/OpenRLHF 10 | 11 | ray job submit --address="http://127.0.0.1:8265" \ 12 | --runtime-env-json='{"working_dir": "'${OPENRLHF_DIR}'"}' \ 13 | -- python3 -m openrlhf.cli.train_ppo_ray \ 14 | --ref_num_nodes 1 \ 15 | --ref_num_gpus_per_node 6 \ 16 | --actor_num_nodes 1 \ 17 | --actor_num_gpus_per_node 6 \ 18 | --vllm_num_engines 6 \ 19 | --vllm_tensor_parallel_size 1 \ 20 | --vllm_gpu_memory_utilization 0.7 \ 21 | --save_path ${DATA_DIR}/save_path \ 22 | --ckpt_path ${DATA_DIR}/checkpoint \ 23 | --pretrain Qwen/Qwen2.5-Coder-7B-Instruct \ 24 | --save_steps 1 \ 25 | --logging_steps 1 \ 26 | --eval_steps -1 \ 27 | --micro_train_batch_size 2 \ 28 | --train_batch_size 12 \ 29 | --micro_rollout_batch_size 6 \ 30 | --rollout_batch_size 6 \ 31 | --n_samples_per_prompt 2 \ 32 | --rm_batch_size 1 \ 33 | --max_epochs 2 \ 34 | --prompt_max_len 2048 \ 35 | --generate_max_len 1024 \ 36 | --zero_stage 2 \ 37 | --bf16 \ 38 | --actor_learning_rate 1e-5 \ 39 | --gradient_checkpointing \ 40 | --init_kl_coef 0.0 \ 41 | --apply_chat_template \ 42 | --prompt_data /home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_with_unit_tests_v2_train_with_reference_no_examples \ 43 | --input_key prompt \ 44 | --label_key verification_info \ 45 | --max_samples 100000 \ 46 | --normalize_reward \ 47 | --load_checkpoint \ 48 | --advantage_estimator group_norm \ 49 | --remote_rm_url http://localhost:5432/get_reward \ 50 | --use_tensorboard logs/${exp_name} \ 51 | --vllm_sync_backend nccl \ 52 | --enforce_eager \ 53 | --save_hf_ckpt \ 54 | --disable_ds_ckpt \ 55 | --RM_CONFIG_save_threshold 60 \ 56 | --RM_CONFIG_n_steps 100 \ 57 | --RM_CONFIG_save_file /home/rosmine/data2/ray/${exp_name}/optimizer_code_output \ 58 | --RM_CONFIG_aux_decay 1 \ 59 | --RM_CONFIG_n_trials 10 \ 60 | --RM_CONFIG_dfg_complexity_weight 0.5 \ 61 | --RM_CONFIG_code_bleu_weight 0.5 \ 62 | --RM_CONFIG_timeout_seconds 120 \ 63 | --RM_CONFIG_aux_coef 0.0 \ 64 | --RM_CONFIG_aux_coef_warmup 0 \ 65 | --RM_CONFIG_useful_line_ratio_coef 0.0 \ 66 | --RM_CONFIG_batch_size 128 \ 67 | --RM_CONFIG_use_input_format_reward true \ 68 | --RM_CONFIG_code_format "pi_verifiable" \ 69 | --RM_CONFIG_max_time 10.0 \ 70 | --RM_CONFIG_thinking_length_weight 0 \ 71 | --entropy_coef 0.0 \ 72 | --num_episodes 1 \ 73 | --outlier_reward_filter -10.0 \ 74 | --ring_attn_size 4 \ 75 | --lora_alpha 128 \ 76 | --lora_rank 64 \ 77 | --packing_samples \ 78 | --flash_attn \ 79 | --adam_offload \ 80 | --lr_warmup_ratio 0.001 \ 81 | --colocate_all_models \ 82 | --vllm_enable_sleep \ 83 | --deepspeed_enable_sleep \ 84 | #--grammar_file /home/rosmine/projects/rl_codegen/experiments/unit_test/grammar/unit_test_grammar.txt 85 | #--deepspeed_enable_super_sleep 86 | 87 | ray stop 88 | # --lora_alpha 128 \ 89 | # --lora_rank 64 90 | #--pretrain $(pwd)/checkpoint/7B_sft_v2 \ 91 | #--flash_attn \ 92 | #--pretrain deepseek-ai/DeepSeek-R1-Distill-Qwen-14B \ 93 | #--vllm_sync_with_ray \ 94 | #--pretrain Qwen/Qwen2.5-Coder-0.5B-Instruct \ 95 | -------------------------------------------------------------------------------- /experiments/u6_7B_form/run.sh: -------------------------------------------------------------------------------- 1 | exp_name=u6_7B_form 2 | 3 | ray start --head --port=6380 --node-ip-address=127.0.0.1 --temp-dir=/home/rosmine/data2/rl/${exp_name} 4 | 5 | DATA_DIR=/home/rosmine/data2/rl/${exp_name} 6 | 7 | echo "ray start" 8 | 9 | OPENRLHF_DIR=/home/rosmine/projects/OpenRLHF 10 | 11 | ray job submit --address="http://127.0.0.1:8265" \ 12 | --runtime-env-json='{"working_dir": "'${OPENRLHF_DIR}'"}' \ 13 | -- python3 -m openrlhf.cli.train_ppo_ray \ 14 | --ref_num_nodes 1 \ 15 | --ref_num_gpus_per_node 4 \ 16 | --actor_num_nodes 1 \ 17 | --actor_num_gpus_per_node 4 \ 18 | --vllm_num_engines 1 \ 19 | --vllm_tensor_parallel_size 2 \ 20 | --vllm_gpu_memory_utilization 0.95 \ 21 | --save_path ${DATA_DIR}/save_path \ 22 | --ckpt_path ${DATA_DIR}/checkpoint \ 23 | --pretrain Qwen/Qwen2.5-Coder-7B-Instruct \ 24 | --save_steps 1 \ 25 | --logging_steps 1 \ 26 | --eval_steps -1 \ 27 | --micro_train_batch_size 2 \ 28 | --train_batch_size 16 \ 29 | --micro_rollout_batch_size 4 \ 30 | --rollout_batch_size 32 \ 31 | --n_samples_per_prompt 16 \ 32 | --rm_batch_size 1 \ 33 | --max_epochs 2 \ 34 | --prompt_max_len 2048 \ 35 | --generate_max_len 2048 \ 36 | --zero_stage 2 \ 37 | --bf16 \ 38 | --actor_learning_rate 1e-5 \ 39 | --gradient_checkpointing \ 40 | --init_kl_coef 0.0 \ 41 | --apply_chat_template \ 42 | --prompt_data /home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_with_unit_tests_v2_train_with_reference_no_examples \ 43 | --input_key prompt \ 44 | --label_key verification_info \ 45 | --max_samples 100000 \ 46 | --normalize_reward \ 47 | --load_checkpoint \ 48 | --advantage_estimator group_norm \ 49 | --remote_rm_url http://localhost:5432/get_reward \ 50 | --use_tensorboard logs/${exp_name} \ 51 | --vllm_sync_backend nccl \ 52 | --enforce_eager \ 53 | --save_hf_ckpt \ 54 | --disable_ds_ckpt \ 55 | --RM_CONFIG_save_threshold 60 \ 56 | --RM_CONFIG_n_steps 100 \ 57 | --RM_CONFIG_save_file /home/rosmine/data2/ray/${exp_name}/optimizer_code_output \ 58 | --RM_CONFIG_aux_decay 1 \ 59 | --RM_CONFIG_n_trials 10 \ 60 | --RM_CONFIG_dfg_complexity_weight 0.5 \ 61 | --RM_CONFIG_code_bleu_weight 0.5 \ 62 | --RM_CONFIG_timeout_seconds 120 \ 63 | --RM_CONFIG_aux_coef 0.0 \ 64 | --RM_CONFIG_aux_coef_warmup 0 \ 65 | --RM_CONFIG_useful_line_ratio_coef 0.0 \ 66 | --RM_CONFIG_batch_size 128 \ 67 | --RM_CONFIG_use_input_format_reward true \ 68 | --RM_CONFIG_code_format "pi_verifiable" \ 69 | --RM_CONFIG_max_time 10.0 \ 70 | --RM_CONFIG_thinking_length_weight 0 \ 71 | --entropy_coef 0.0 \ 72 | --num_episodes 1 \ 73 | --outlier_reward_filter -10.0 \ 74 | --ring_attn_size 4 \ 75 | --lora_alpha 128 \ 76 | --lora_rank 64 \ 77 | --packing_samples \ 78 | --flash_attn \ 79 | --adam_offload \ 80 | --lr_warmup_ratio 0.001 \ 81 | #--grammar_file /home/rosmine/projects/rl_codegen/experiments/unit_test/grammar/unit_test_grammar.txt 82 | #--colocate_all_models \ 83 | #--vllm_enable_sleep \ 84 | #--deepspeed_enable_sleep \ 85 | #--deepspeed_enable_super_sleep 86 | 87 | ray stop 88 | # --lora_alpha 128 \ 89 | # --lora_rank 64 90 | #--pretrain $(pwd)/checkpoint/7B_sft_v2 \ 91 | #--flash_attn \ 92 | #--pretrain deepseek-ai/DeepSeek-R1-Distill-Qwen-14B \ 93 | #--vllm_sync_with_ray \ 94 | #--pretrain Qwen/Qwen2.5-Coder-0.5B-Instruct \ 95 | -------------------------------------------------------------------------------- /experiments/u6_7B_gram/run.sh: -------------------------------------------------------------------------------- 1 | exp_name=u6_7B_gram 2 | 3 | ray start --head --port=6380 --node-ip-address=127.0.0.1 --temp-dir=/home/rosmine/data2/rl/${exp_name} 4 | 5 | DATA_DIR=/home/rosmine/data2/rl/${exp_name} 6 | 7 | echo "ray start" 8 | 9 | OPENRLHF_DIR=/home/rosmine/projects/OpenRLHF 10 | 11 | ray job submit --address="http://127.0.0.1:8265" \ 12 | --runtime-env-json='{"working_dir": "'${OPENRLHF_DIR}'"}' \ 13 | -- python3 -m openrlhf.cli.train_ppo_ray \ 14 | --ref_num_nodes 1 \ 15 | --ref_num_gpus_per_node 4 \ 16 | --actor_num_nodes 1 \ 17 | --actor_num_gpus_per_node 4 \ 18 | --vllm_num_engines 1 \ 19 | --vllm_tensor_parallel_size 2 \ 20 | --vllm_gpu_memory_utilization 0.95 \ 21 | --save_path ${DATA_DIR}/save_path \ 22 | --ckpt_path ${DATA_DIR}/checkpoint \ 23 | --pretrain Qwen/Qwen2.5-Coder-7B-Instruct \ 24 | --save_steps 1 \ 25 | --logging_steps 1 \ 26 | --eval_steps -1 \ 27 | --micro_train_batch_size 2 \ 28 | --train_batch_size 16 \ 29 | --micro_rollout_batch_size 4 \ 30 | --rollout_batch_size 32 \ 31 | --n_samples_per_prompt 16 \ 32 | --rm_batch_size 1 \ 33 | --max_epochs 2 \ 34 | --prompt_max_len 2048 \ 35 | --generate_max_len 2048 \ 36 | --zero_stage 2 \ 37 | --bf16 \ 38 | --actor_learning_rate 1e-5 \ 39 | --gradient_checkpointing \ 40 | --init_kl_coef 0.0 \ 41 | --apply_chat_template \ 42 | --prompt_data /home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_with_unit_tests_v2_train_with_reference_no_examples \ 43 | --input_key prompt \ 44 | --label_key verification_info \ 45 | --max_samples 100000 \ 46 | --normalize_reward \ 47 | --load_checkpoint \ 48 | --advantage_estimator group_norm \ 49 | --remote_rm_url http://localhost:5432/get_reward \ 50 | --use_tensorboard logs/${exp_name} \ 51 | --vllm_sync_backend nccl \ 52 | --enforce_eager \ 53 | --save_hf_ckpt \ 54 | --disable_ds_ckpt \ 55 | --RM_CONFIG_save_threshold 60 \ 56 | --RM_CONFIG_n_steps 100 \ 57 | --RM_CONFIG_save_file /home/rosmine/data2/ray/${exp_name}/optimizer_code_output \ 58 | --RM_CONFIG_aux_decay 1 \ 59 | --RM_CONFIG_n_trials 10 \ 60 | --RM_CONFIG_dfg_complexity_weight 0.5 \ 61 | --RM_CONFIG_code_bleu_weight 0.5 \ 62 | --RM_CONFIG_timeout_seconds 120 \ 63 | --RM_CONFIG_aux_coef 0.0 \ 64 | --RM_CONFIG_aux_coef_warmup 0 \ 65 | --RM_CONFIG_useful_line_ratio_coef 0.0 \ 66 | --RM_CONFIG_batch_size 128 \ 67 | --RM_CONFIG_use_input_format_reward true \ 68 | --RM_CONFIG_code_format "pi_verifiable" \ 69 | --RM_CONFIG_max_time 10.0 \ 70 | --RM_CONFIG_thinking_length_weight 0 \ 71 | --entropy_coef 0.0 \ 72 | --num_episodes 1 \ 73 | --outlier_reward_filter -10.0 \ 74 | --ring_attn_size 4 \ 75 | --lora_alpha 128 \ 76 | --lora_rank 64 \ 77 | --packing_samples \ 78 | --flash_attn \ 79 | --adam_offload \ 80 | --lr_warmup_ratio 0.001 \ 81 | --grammar_file /home/rosmine/projects/rl_codegen/experiments/unit_test/grammar/unit_test_grammar.txt 82 | #--colocate_all_models \ 83 | #--vllm_enable_sleep \ 84 | #--deepspeed_enable_sleep \ 85 | #--deepspeed_enable_super_sleep 86 | 87 | ray stop 88 | # --lora_alpha 128 \ 89 | # --lora_rank 64 90 | #--pretrain $(pwd)/checkpoint/7B_sft_v2 \ 91 | #--flash_attn \ 92 | #--pretrain deepseek-ai/DeepSeek-R1-Distill-Qwen-14B \ 93 | #--vllm_sync_with_ray \ 94 | #--pretrain Qwen/Qwen2.5-Coder-0.5B-Instruct \ 95 | -------------------------------------------------------------------------------- /unit_tests_server/verl_unit.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | import pdb 4 | from datasets import load_from_disk 5 | from transformers import AutoTokenizer 6 | import concurrent.futures 7 | import pdb 8 | 9 | def hf_reward_fn(prompts, completions, **kwargs): 10 | # print(prompts, completions, kwargs) 11 | verification_info = kwargs["verification_info"] 12 | rewards = [] 13 | # Create payloads for all completions 14 | payloads = [] 15 | for i in range(len(completions)): 16 | payload = { 17 | "query": [completions[i]], 18 | "prompts": [prompts[i]], 19 | "labels": [verification_info[i]], 20 | "step": 2, 21 | "extra_info": kwargs, 22 | "reward_config": {"n_steps": 200, 23 | "target_precision": 0.01, 24 | "warmup_steps": 10000, 25 | "max_time": 5, 26 | "code_format":"pi_verifiable", 27 | "thinking_length_weight": 0.0, 28 | "use_input_format_reward": True} 29 | } 30 | payloads.append(payload) 31 | 32 | url = "http://0.0.0.0:5431/get_reward" 33 | 34 | # Process requests in parallel 35 | def process_request(payload): 36 | result = make_api_call(payload, url=url) 37 | return result['rewards'][0] 38 | 39 | with concurrent.futures.ThreadPoolExecutor() as executor: 40 | results = list(executor.map(process_request, payloads)) 41 | 42 | rewards.extend(results) 43 | print(f"returning rewards: {rewards}") 44 | return rewards 45 | 46 | def reward_fn(data_source, solution_str, ground_truth, extra_info=None): 47 | """ 48 | data_source: str 49 | solution_str: str 50 | ground_truth: str 51 | extra_info: dict 52 | """ 53 | payload = { 54 | "query": [solution_str], 55 | "prompts": [data_source], 56 | "labels": [ground_truth], 57 | "step": 2, 58 | "extra_info": extra_info, 59 | "reward_config": {"n_steps": 200, 60 | "target_precision": 0.01, 61 | "warmup_steps": 10000, 62 | "max_time": 5, 63 | "code_format":"pi_verifiable", 64 | "thinking_length_weight": 0.0, 65 | "use_input_format_reward": True} 66 | } 67 | 68 | url = "http://0.0.0.0:5431/get_reward" 69 | result = make_api_call(payload, url=url) 70 | return result 71 | 72 | 73 | def make_api_call(payload, url="http://0.0.0.0:5000/get_reward"): 74 | headers = { 75 | "Content-Type": "application/json" 76 | } 77 | response = requests.post(url, data=json.dumps(payload), headers=headers) 78 | 79 | if response.status_code == 200: 80 | response_json = response.json() 81 | # print(f"API call successful. Response: {response_json}") 82 | return response_json 83 | else: 84 | print(f"API call failed with status code {response.status_code}. Response:") 85 | print(response.text) 86 | return False 87 | 88 | if __name__ == "__main__": 89 | 90 | n_queries = 1 91 | n_calls = 2 92 | n_calls_parallel = 8 93 | 94 | correct_code = r""" 95 | a = int(input()) 96 | b = int(input()) 97 | print(a + b) 98 | """ 99 | 100 | incorrect_code = r""" 101 | a = int(input()) 102 | b = int(input()) 103 | print(a * b) 104 | """ 105 | 106 | unit_test = r"""1 107 | 2 108 | """ 109 | 110 | unit_test_correct_output = r"""3 111 | """ 112 | 113 | unit_test_incorrect_output = r"""2 114 | """ 115 | 116 | pairs = [ 117 | # (correct_code, unit_test, unit_test_correct_output), # 1 118 | # (incorrect_code, unit_test, unit_test_correct_output), # 0 119 | (incorrect_code, unit_test, unit_test_incorrect_output), # -1 120 | ] 121 | 122 | labels = "{'test_cases': [{'type': 'stdin_stdout', 'input': '1\n2', 'output': '3'}]}" 123 | 124 | for code, unit_test, unit_test_output in pairs: 125 | unit_tests = f"{unit_test}{unit_test_output}" 126 | messages = [ 127 | {"role": "system", "content": "You are a helpful assistant."}, 128 | {"role": "user", "content": "solve plz"}, 129 | {"role": "assistant", "content": f"{code}\n{unit_tests}"} 130 | ] 131 | 132 | tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Coder-0.5B-Instruct") 133 | inputs = tokenizer.apply_chat_template(messages, tokenize=False) 134 | 135 | payload = { 136 | "query": [inputs], 137 | "prompts": ["asdf"], 138 | "labels": [labels], 139 | "step": 2, 140 | "reward_config": { 141 | "n_steps": 200, 142 | "target_precision": 0.01, 143 | "warmup_steps": 10000, 144 | "max_time": 100000, 145 | "code_format":"pi_verifiable", 146 | "thinking_length_weight": 0.0 147 | } 148 | } 149 | 150 | url = "http://0.0.0.0:5432/get_reward" 151 | make_api_call(payload, url=url) 152 | 153 | -------------------------------------------------------------------------------- /unit_tests_server/train_unsloth.py: -------------------------------------------------------------------------------- 1 | from unsloth import FastLanguageModel 2 | import torch 3 | import yaml 4 | from datasets import load_dataset, load_from_disk 5 | from trl import GRPOConfig, GRPOTrainer 6 | 7 | import argparse 8 | 9 | import os 10 | import importlib.util 11 | import sys 12 | import pdb 13 | 14 | from unsloth_train_patch import train 15 | 16 | def load_and_call_function(file_path, function_name, *args, **kwargs): 17 | """ 18 | Loads a Python file from the given path and calls the specified function. 19 | 20 | Args: 21 | file_path (str): Path to the Python file 22 | function_name (str): Name of the function to call 23 | *args: Positional arguments to pass to the function 24 | **kwargs: Keyword arguments to pass to the function 25 | 26 | Returns: 27 | The result of the function call 28 | """ 29 | # Get the absolute path 30 | abs_path = os.path.abspath(file_path) 31 | 32 | # Check if file exists 33 | if not os.path.exists(abs_path): 34 | raise FileNotFoundError(f"File not found: {abs_path}") 35 | 36 | # Get the module name from the file path 37 | module_name = os.path.splitext(os.path.basename(abs_path))[0] 38 | 39 | # Load the module specification 40 | spec = importlib.util.spec_from_file_location(module_name, abs_path) 41 | if spec is None: 42 | raise ImportError(f"Could not load spec for module at {abs_path}") 43 | 44 | # Create the module 45 | module = importlib.util.module_from_spec(spec) 46 | 47 | # Add the module to sys.modules 48 | sys.modules[module_name] = module 49 | 50 | # Execute the module 51 | spec.loader.exec_module(module) 52 | 53 | # Check if the function exists in the module 54 | if not hasattr(module, function_name): 55 | raise AttributeError(f"Function '{function_name}' not found in {abs_path}") 56 | 57 | # Get the function 58 | function = getattr(module, function_name) 59 | 60 | # Call the function with the provided arguments 61 | return function 62 | 63 | def parse_args(): 64 | parser = argparse.ArgumentParser(description="Training script for Unsloth model") 65 | parser.add_argument("--config", '-c', type=str, help="Path to configuration file") 66 | return parser.parse_args() 67 | 68 | def main(args: argparse.Namespace): 69 | 70 | 71 | model, tokenizer = FastLanguageModel.from_pretrained( 72 | model_name=args.model_name, 73 | max_seq_length=args.max_seq_length, 74 | load_in_4bit=args.load_in_4bit, # False for LoRA 16bit 75 | load_in_8bit=args.load_in_8bit, # False for LoRA 16bit 76 | fast_inference=True, # Enable vLLM fast inference 77 | max_lora_rank=args.lora_rank, 78 | gpu_memory_utilization=0.6, # Reduce if out of memory 79 | ) 80 | 81 | model = FastLanguageModel.get_peft_model( 82 | model, 83 | r=args.lora_rank, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128 84 | target_modules=[ 85 | "q_proj", 86 | "k_proj", 87 | "v_proj", 88 | "o_proj", 89 | "gate_proj", 90 | "up_proj", 91 | "down_proj", 92 | ], # Remove QKVO if out of memory 93 | lora_alpha=args.lora_alpha, 94 | use_gradient_checkpointing="unsloth", # Enable long context finetuning 95 | random_state=3407, 96 | ) 97 | 98 | dataset = load_from_disk(args.dataset_path) 99 | 100 | training_args = GRPOConfig( 101 | learning_rate=args.learning_rate, 102 | optim="paged_adamw_8bit", 103 | logging_steps=1, 104 | per_device_train_batch_size=args.per_device_train_batch_size, 105 | gradient_accumulation_steps=args.gradient_accumulation_steps, # Increase to 4 for smoother training 106 | num_generations=args.num_generations, # Decrease if out of memory 107 | max_prompt_length=args.max_prompt_length, 108 | max_completion_length=args.max_seq_length - args.max_prompt_length, 109 | # num_train_epochs = 1, # Set to 1 for a full training run 110 | max_steps=args.max_steps, 111 | save_steps=args.save_steps, 112 | report_to="tensorboard", # Can use Weights & Biases 113 | output_dir=args.output_dir, 114 | use_vllm=True, 115 | beta=0.001, 116 | # generation_batch_size=args.generation_batch_size, 117 | # steps_per_generation=args.steps_per_generation, 118 | ) 119 | 120 | 121 | # training_args.steps_per_generation = None 122 | 123 | reward_function = load_and_call_function(args.reward_function_path, args.reward_function_name) 124 | 125 | trainer = GRPOTrainer( 126 | model=model, 127 | processing_class=tokenizer, 128 | reward_funcs=[ 129 | reward_function, 130 | ], 131 | args=training_args, 132 | train_dataset=dataset, 133 | ) 134 | 135 | # train(trainer) 136 | trainer.train() 137 | 138 | model.save_pretrained(args.output_dir) 139 | 140 | if __name__ == "__main__": 141 | args = parse_args() 142 | with open(args.config, "r") as f: 143 | config = yaml.safe_load(f) 144 | for key, value in config.items(): 145 | setattr(args, key, value) 146 | main(args) -------------------------------------------------------------------------------- /unit_tests_server/convert_dataset_to_chat.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 5, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "Map: 0%| | 0/6065 [00:00\\n\\nLets enumerate rows and columns with integers from 1 to 8. Rows are numbered from top to bottom, while columns are numbered from left to right. Now we denote as (r, c) the cell located at the row r and at the column c.\\n\\nThere are always two players A and B playing the game. Player A plays with white pawns, while player B plays with black ones. The goal of player A is to put any of his pawns to the row 1, while player B tries to put any of his pawns to the row 8. As soon as any of the players completes his goal the game finishes immediately and the succeeded player is declared a winner.\\n\\nPlayer A moves first and then they alternate turns. On his move player A must choose exactly one white pawn and move it one step upward and player B (at his turn) must choose exactly one black pawn and move it one step down. Any move is possible only if the targeted cell is empty. It's guaranteed that for any scenario of the game there will always be at least one move available for any of the players.\\n\\nMoving upward means that the pawn located in (r, c) will go to the cell (r - 1, c), while moving down means the pawn located in (r, c) will go to the cell (r + 1, c). Again, the corresponding cell must be empty, i.e. not occupied by any other pawn of any color.\\n\\nGiven the initial disposition of the board, determine who wins the game if both players play optimally. Note that there will always be a winner due to the restriction that for any game scenario both players will have some moves available.\\n\\nInput\\n\\nThe input consists of the board description given in eight lines, each line contains eight characters. Character 'B' is used to denote a black pawn, and character 'W' represents a white pawn. Empty cell is marked with '.'. \\n\\nIt's guaranteed that there will not be white pawns on the first row neither black pawns on the last row.\\n\\nOutput\\n\\nPrint 'A' if player A wins the game on the given board, and 'B' if player B will claim the victory. Again, it's guaranteed that there will always be a winner on the given board.\\nNote\\n\\nIn the first sample player A is able to complete his goal in 3 steps by always moving a pawn initially located at (4, 5). Player B needs at least 5 steps for any of his pawns to reach the row 8. Hence, player A will be the winner.\\n\\nThe input will be given via stdin and the output should be printed to stdout by your code.\\n\\nNow solve the problem by providing the code.\\nReturn the code within ```python ... ``` markup. Write unit tests for the code with the standard input/output like this: (stdin)(stdout) tags. If you make multiple unit tests, use the ... ... tags to write multiple unit tests one after another.\",\n", 56 | " 'role': 'user'}]}" 57 | ] 58 | }, 59 | "execution_count": 4, 60 | "metadata": {}, 61 | "output_type": "execute_result" 62 | } 63 | ], 64 | "source": [ 65 | "dataset[0]['prompt']" 66 | ] 67 | } 68 | ], 69 | "metadata": { 70 | "kernelspec": { 71 | "display_name": "transformers", 72 | "language": "python", 73 | "name": "python3" 74 | }, 75 | "language_info": { 76 | "codemirror_mode": { 77 | "name": "ipython", 78 | "version": 3 79 | }, 80 | "file_extension": ".py", 81 | "mimetype": "text/x-python", 82 | "name": "python", 83 | "nbconvert_exporter": "python", 84 | "pygments_lexer": "ipython3", 85 | "version": "3.11.0" 86 | } 87 | }, 88 | "nbformat": 4, 89 | "nbformat_minor": 2 90 | } 91 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==2.1.0 2 | accelerate==1.3.0 3 | aiohappyeyeballs==2.6.1 4 | aiohttp==3.11.18 5 | aiohttp-cors==0.7.0 6 | aiosignal==1.3.2 7 | airportsdata==20241001 8 | annotated-types==0.7.0 9 | anyio==4.9.0 10 | argon2-cffi==23.1.0 11 | argon2-cffi-bindings==21.2.0 12 | arrow==1.3.0 13 | astor==0.8.1 14 | asttokens==3.0.0 15 | async-lru==2.0.4 16 | attrs==25.3.0 17 | babel==2.17.0 18 | beautifulsoup4==4.13.3 19 | bitsandbytes==0.45.2 20 | blake3==1.0.4 21 | bleach==6.2.0 22 | cachetools==5.5.1 23 | cairocffi==1.7.1 24 | CairoSVG==2.7.1 25 | certifi==2025.1.31 26 | cffi==1.17.1 27 | charset-normalizer==3.4.1 28 | click==8.1.8 29 | -e git+https://github.com/openai/CLIP.git@dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1#egg=clip 30 | cloudpickle==3.1.1 31 | cmake==4.0.0 32 | colorful==0.5.6 33 | comm==0.2.2 34 | compressed-tensors==0.9.1 35 | contourpy==1.3.1 36 | cssselect2==0.8.0 37 | cupy-cuda12x==13.3.0 38 | cycler==0.12.1 39 | datasets==3.5.0 40 | debugpy==1.8.12 41 | decorator==5.1.1 42 | deepspeed==0.16.7 43 | deepspeed-kernels==0.0.1.dev1698255861 44 | defusedxml==0.7.1 45 | Deprecated==1.2.18 46 | depyf==0.18.0 47 | dill==0.3.8 48 | diskcache==5.6.3 49 | distlib==0.3.9 50 | distro==1.9.0 51 | dnspython==2.7.0 52 | docker-pycreds==0.4.0 53 | einops==0.8.1 54 | email_validator==2.2.0 55 | executing==2.2.0 56 | fastapi==0.115.12 57 | fastapi-cli==0.0.7 58 | fastjsonschema==2.21.1 59 | fastrlock==0.8.3 60 | filelock==3.18.0 61 | flash-attn==2.7.0.post2 62 | fonttools==4.56.0 63 | fqdn==1.5.1 64 | frozenlist==1.6.0 65 | fsspec==2024.12.0 66 | ftfy==6.3.1 67 | gguf==0.10.0 68 | gitdb==4.0.12 69 | GitPython==3.1.44 70 | google-api-core==2.24.1 71 | google-auth==2.38.0 72 | googleapis-common-protos==1.67.0 73 | grpcio==1.70.0 74 | h11==0.14.0 75 | hf-xet==1.0.3 76 | hjson==3.1.0 77 | httpcore==1.0.8 78 | httptools==0.6.4 79 | httpx==0.28.1 80 | huggingface-hub==0.30.2 81 | idna==3.10 82 | importlib_metadata==8.0.0 83 | iniconfig==2.0.0 84 | interegular==0.3.3 85 | ipykernel==6.29.5 86 | ipython==8.32.0 87 | ipywidgets==8.1.5 88 | isoduration==20.11.0 89 | isort==6.0.0 90 | jedi==0.19.2 91 | Jinja2==3.1.6 92 | jiter==0.9.0 93 | joblib==1.4.2 94 | json5==0.10.0 95 | jsonlines==4.0.0 96 | jsonpointer==3.0.0 97 | jsonschema==4.23.0 98 | jsonschema-specifications==2025.4.1 99 | jupyter==1.1.1 100 | jupyter-console==6.6.3 101 | jupyter-events==0.12.0 102 | jupyter-lsp==2.2.5 103 | jupyter_client==8.6.3 104 | jupyter_core==5.7.2 105 | jupyter_server==2.15.0 106 | jupyter_server_terminals==0.5.3 107 | jupyterlab==4.3.5 108 | jupyterlab_pygments==0.3.0 109 | jupyterlab_server==2.27.3 110 | jupyterlab_widgets==3.0.13 111 | kiwisolver==1.4.8 112 | lark==1.2.2 113 | liger_kernel==0.5.6 114 | lightning-utilities==0.12.0 115 | llguidance==0.7.13 116 | llvmlite==0.43.0 117 | lm-format-enforcer==0.10.11 118 | loralib==0.1.2 119 | Markdown==3.7 120 | markdown-it-py==3.0.0 121 | MarkupSafe==3.0.2 122 | matplotlib==3.10.0 123 | matplotlib-inline==0.1.7 124 | mdurl==0.1.2 125 | mistral_common==1.5.4 126 | mistune==3.1.2 127 | mnist1d==0.0.2.post1 128 | mpi4py==4.0.2 129 | mpmath==1.3.0 130 | msgpack==1.1.0 131 | msgspec==0.19.0 132 | multidict==6.4.3 133 | multiprocess==0.70.16 134 | munkres==1.1.4 135 | nanobind==2.6.1 136 | nbclient==0.10.2 137 | nbconvert==7.16.6 138 | nbformat==5.10.4 139 | nest-asyncio==1.6.0 140 | networkx==3.4.2 141 | ninja==1.11.1.3 142 | nltk==3.9.1 143 | notebook==7.3.2 144 | notebook_shim==0.2.4 145 | numba==0.60.0 146 | numpy==1.26.4 147 | nvidia-cublas-cu11==11.11.3.6 148 | nvidia-cublas-cu12==12.4.5.8 149 | nvidia-cuda-cupti-cu11==11.8.87 150 | nvidia-cuda-cupti-cu12==12.4.127 151 | nvidia-cuda-nvrtc-cu11==11.8.89 152 | nvidia-cuda-nvrtc-cu12==12.4.127 153 | nvidia-cuda-runtime-cu11==11.8.89 154 | nvidia-cuda-runtime-cu12==12.4.127 155 | nvidia-cudnn-cu11==9.1.0.70 156 | nvidia-cudnn-cu12==9.1.0.70 157 | nvidia-cufft-cu11==10.9.0.58 158 | nvidia-cufft-cu12==11.2.1.3 159 | nvidia-cufile-cu12==1.11.1.6 160 | nvidia-curand-cu11==10.3.0.86 161 | nvidia-curand-cu12==10.3.5.147 162 | nvidia-cusolver-cu11==11.4.1.48 163 | nvidia-cusolver-cu12==11.6.1.9 164 | nvidia-cusparse-cu11==11.7.5.86 165 | nvidia-cusparse-cu12==12.3.1.170 166 | nvidia-cusparselt-cu12==0.6.2 167 | nvidia-ml-py==12.570.86 168 | nvidia-nccl-cu11==2.21.5 169 | nvidia-nccl-cu12==2.21.5 170 | nvidia-nvjitlink-cu12==12.4.127 171 | nvidia-nvtx-cu11==11.8.86 172 | nvidia-nvtx-cu12==12.4.127 173 | openai==1.76.0 174 | opencensus==0.11.4 175 | opencensus-context==0.1.3 176 | opencv-python-headless==4.11.0.86 177 | -e git+https://github.com/rosmineb/OpenRLHF.git@047bbfaa0d29063801720b961b64d068ae2ea7cd#egg=openrlhf 178 | opentelemetry-api==1.26.0 179 | opentelemetry-exporter-otlp==1.26.0 180 | opentelemetry-exporter-otlp-proto-common==1.26.0 181 | opentelemetry-exporter-otlp-proto-grpc==1.26.0 182 | opentelemetry-exporter-otlp-proto-http==1.26.0 183 | opentelemetry-proto==1.26.0 184 | opentelemetry-sdk==1.26.0 185 | opentelemetry-semantic-conventions==0.47b0 186 | opentelemetry-semantic-conventions-ai==0.4.3 187 | optimum==1.24.0 188 | outlines==0.1.11 189 | outlines_core==0.1.26 190 | overrides==7.7.0 191 | packaging==25.0 192 | pandas==2.2.3 193 | pandocfilters==1.5.1 194 | parso==0.8.4 195 | partial-json-parser==0.2.1.1.post5 196 | peft==0.14.0 197 | pexpect==4.9.0 198 | pillow==11.2.1 199 | platformdirs==4.3.6 200 | pluggy==1.5.0 201 | prometheus-fastapi-instrumentator==7.1.0 202 | prometheus_client==0.21.1 203 | prompt_toolkit==3.0.50 204 | propcache==0.3.1 205 | proto-plus==1.26.0 206 | protobuf==4.25.7 207 | psutil==7.0.0 208 | ptyprocess==0.7.0 209 | pure_eval==0.2.3 210 | py-cpuinfo==9.0.0 211 | py-spy==0.4.0 212 | pyairports==2.1.1 213 | pyarrow==19.0.1 214 | pyasn1==0.6.1 215 | pyasn1_modules==0.4.1 216 | pybind11==2.13.6 217 | pycountry==24.6.1 218 | pycparser==2.22 219 | pydantic==2.11.3 220 | pydantic_core==2.33.1 221 | Pygments==2.19.1 222 | pynvml==12.0.0 223 | pyparsing==3.2.1 224 | pytest==8.3.4 225 | python-dateutil==2.9.0.post0 226 | python-dotenv==1.1.0 227 | python-json-logger==3.2.1 228 | python-multipart==0.0.20 229 | pytz==2025.2 230 | PyYAML==6.0.2 231 | pyzmq==26.4.0 232 | RapidFuzz==3.12.1 233 | ray==2.40.0 234 | referencing==0.36.2 235 | regex==2024.11.6 236 | requests==2.32.3 237 | rfc3339-validator==0.1.4 238 | rfc3986-validator==0.1.1 239 | rich==13.9.4 240 | rich-toolkit==0.13.2 241 | rpds-py==0.24.0 242 | rsa==4.9 243 | safetensors==0.5.3 244 | scipy==1.15.2 245 | Send2Trash==1.8.3 246 | sentencepiece==0.2.0 247 | sentry-sdk==2.21.0 248 | setproctitle==1.3.4 249 | shellingham==1.5.4 250 | six==1.17.0 251 | smart-open==7.1.0 252 | smmap==5.0.2 253 | sniffio==1.3.1 254 | soupsieve==2.6 255 | stack-data==0.6.3 256 | starlette==0.46.2 257 | sympy==1.13.1 258 | tensorboard==2.19.0 259 | tensorboard-data-server==0.7.2 260 | terminado==0.18.1 261 | thefuzz==0.22.1 262 | tiktoken==0.9.0 263 | tinycss2==1.4.0 264 | tokenizers==0.21.1 265 | torch==2.5.1 266 | torchaudio==2.5.1 267 | torchmetrics==1.6.1 268 | torchvision==0.20.1 269 | tornado==6.4.2 270 | tqdm==4.67.1 271 | traitlets==5.14.3 272 | transformers==4.51.3 273 | transformers-stream-generator==0.0.5 274 | tree-sitter==0.24.0 275 | tree-sitter-python==0.23.6 276 | triton==3.1.0 277 | typer==0.15.1 278 | types-python-dateutil==2.9.0.20241206 279 | typing-inspection==0.4.0 280 | typing_extensions==4.13.2 281 | tzdata==2025.2 282 | uri-template==1.3.0 283 | urllib3==2.4.0 284 | uvicorn==0.34.2 285 | uvloop==0.21.0 286 | virtualenv==20.29.2 287 | vllm==0.7.3 288 | wandb==0.19.6 289 | watchfiles==1.0.5 290 | wcwidth==0.2.13 291 | webcolors==24.11.1 292 | webencodings==0.5.1 293 | websocket-client==1.8.0 294 | websockets==15.0.1 295 | Werkzeug==3.1.3 296 | widgetsnbextension==4.0.13 297 | wrapt==1.17.2 298 | xformers==0.0.28.post3 299 | xgrammar==0.1.11 300 | xxhash==3.5.0 301 | yarl==1.20.0 302 | zipp==3.21.0 303 | -------------------------------------------------------------------------------- /unit_tests_server/unsloth_train_patch.py: -------------------------------------------------------------------------------- 1 | # stolen from ART https://github.com/OpenPipe/ART/blob/5f3dea20069ee8e4afbd482e529df5ee80d81b81/src/art/local/train.py 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.optim as optim 6 | from typing import Callable, Dict, List, Optional, Tuple, Union 7 | 8 | from trl import GRPOTrainer 9 | from peft import PeftModel 10 | import numpy as np 11 | import os 12 | 13 | import pdb 14 | 15 | torch_compile_options = { 16 | "epilogue_fusion" : True, 17 | "max_autotune" : False, 18 | "shape_padding" : True, 19 | "trace.enabled" : False, 20 | "triton.cudagraphs" : False, 21 | } 22 | 23 | def train( 24 | trainer: "GRPOTrainer", 25 | ) -> None: 26 | _compute_loss = trainer.compute_loss 27 | trainer.compute_loss = lambda *args, **kwargs: compute_loss(trainer, *args, **kwargs) 28 | # trainer.log = get_log_fn(trainer, results_queue) 29 | try: 30 | trainer.train() 31 | finally: 32 | trainer.compute_loss = _compute_loss 33 | # trainer.log = _log 34 | 35 | @torch.compile(dynamic = True, fullgraph = True, options = torch_compile_options) 36 | def grpo_compute_loss_slow(old_logits, new_logits, input_ids, mask, beta, advantages): 37 | # All Unsloth Zoo code licensed under LGPLv3 38 | input_ids = input_ids.unsqueeze(-1) 39 | new_logits = new_logits.to(torch.float32) 40 | new_x = torch.gather(new_logits, dim = -1, index = input_ids).squeeze(-1) 41 | new = new_x - torch.logsumexp(new_logits, dim = -1) 42 | 43 | if old_logits is None: 44 | kl_i = torch.zeros_like(mask) 45 | else: 46 | old_logits = old_logits.to(torch.float32) 47 | 48 | # x_i - logsumexp(x_i) 49 | old_x = torch.gather(old_logits, dim = -1, index = input_ids).squeeze(-1) 50 | old = old_x - torch.logsumexp(old_logits, dim = -1) 51 | 52 | # Reverse KL 53 | kl_i = torch.exp(old - new) - (old - new) - 1.0 54 | # Full correct reverse KL divergence?? Missing term maybe? 55 | # kl_i = torch.exp(new) * kl_i 56 | 57 | # Below is forward KL (normal KL) 58 | # kl_i = torch.exp(old) * (old - new) 59 | 60 | # Must detach - otherwise gradients are not propagated correctly! 61 | # exp(x - x) == 1 62 | loss_i = torch.exp(new - new.detach()) * advantages.unsqueeze(1) 63 | loss_i = -(loss_i - beta * kl_i) 64 | 65 | mask = mask.to(torch.float32) 66 | n_mask_per_reward = mask.sum(1) 67 | 68 | # See https://github.com/huggingface/trl/pull/2881 69 | loss_per_reward = (loss_i * mask).sum(1) / n_mask_per_reward 70 | loss = loss_per_reward.mean() 71 | # loss = (loss_i * mask).sum() / mask.sum() 72 | 73 | # Get metrics as well which are folded 74 | with torch.inference_mode(): 75 | completion_length = n_mask_per_reward.mean() 76 | mean_kl_per_reward = (kl_i * mask).sum(1) / n_mask_per_reward 77 | mean_kl = mean_kl_per_reward.mean() 78 | 79 | return loss, completion_length, 80 | 81 | def grpo_accumulated_loss( 82 | trainer, 83 | input_ids, 84 | logits_to_keep, 85 | completion_mask, 86 | advantages, 87 | n_chunks = -1, 88 | ): 89 | # All Unsloth Zoo code licensed under LGPLv3 90 | bsz, qlen = input_ids.shape 91 | # Find closest multiple 92 | factors = [i for i in range(1, bsz + 1) if bsz % i == 0] 93 | if n_chunks == -1: n_chunks = bsz 94 | n_chunks = factors[min(np.searchsorted(factors, n_chunks), len(factors)-1)] 95 | 96 | mixed_dtype = torch.float16 if os.environ.get('ACCELERATE_MIXED_PRECISION', 'fp16') == 'fp16' else torch.bfloat16 97 | os.environ["UNSLOTH_RETURN_HIDDEN_STATES"] = "1" 98 | 99 | completion_input_ids = input_ids[:, -logits_to_keep:] 100 | lm_head = trainer.model.get_output_embeddings().weight 101 | 102 | with torch.amp.autocast(device_type = "cuda", dtype = mixed_dtype): 103 | with torch.inference_mode(), trainer.accelerator.unwrap_model(trainer.model, keep_fp32_wrapper = False).disable_adapter(): 104 | old_hidden_states = trainer.model(input_ids = input_ids, logits_to_keep = logits_to_keep + 1).logits 105 | pass 106 | 107 | new_hidden_states = trainer.model(input_ids = input_ids, logits_to_keep = logits_to_keep + 1).logits 108 | 109 | loss, completion_length, mean_kl = UnslothEfficientGRPO.apply( 110 | new_hidden_states, old_hidden_states, lm_head, 111 | completion_input_ids, completion_mask, advantages, trainer.beta, 112 | trainer.accelerator.scaler, 113 | n_chunks, 114 | ) 115 | return loss, completion_length, mean_kl 116 | 117 | # Old non efficient code path 118 | new_logits = torch.matmul(new_hidden_states, lm_head.t()) 119 | new_logits = new_logits[:, :-1, :] # exclude the last logit: it corresponds to the next token pred 120 | old_logits = torch.matmul(old_hidden_states, lm_head.t()) 121 | old_logits = old_logits[:, :-1, :] # exclude the last logit: it corresponds to the next token pred 122 | loss, completion_length, mean_kl = grpo_compute_loss( 123 | old_logits, new_logits, completion_input_ids, completion_mask, trainer.beta, advantages, 124 | ) 125 | return loss, completion_length, mean_kl 126 | pass 127 | 128 | def compute_loss(self, model, inputs, return_outputs = False, num_items_in_batch = None): 129 | if return_outputs: 130 | raise ValueError("The GRPOTrainer does not support returning outputs") 131 | # Compute the per-token log probabilities for the model 132 | 133 | prompt_ids, prompt_mask = inputs["prompt_ids"], inputs["prompt_mask"] 134 | completion_ids, completion_mask = inputs["completion_ids"], inputs["completion_mask"] 135 | input_ids = torch.cat([prompt_ids, completion_ids], dim=1) 136 | bsz, qlen = input_ids.shape 137 | attention_mask = torch.cat([prompt_mask, completion_mask], dim=1) 138 | # attention_mask = None 139 | logits_to_keep = completion_ids.size(1) # we only need to compute the logits for the completion tokens 140 | _input_ids = input_ids 141 | _logits_to_keep = logits_to_keep 142 | 143 | per_token_logps = self._get_per_token_logps(model, input_ids, attention_mask, logits_to_keep) 144 | 145 | # Compute the KL divergence between the model and the reference model 146 | ref_per_token_logps = inputs["ref_per_token_logps"] if "ref_per_token_logps" in inputs else None 147 | # per_token_kl = torch.exp(ref_per_token_logps - per_token_logps) - (ref_per_token_logps - per_token_logps) - 1 148 | 149 | # x - x.detach() allows for preserving gradients from x 150 | advantages = inputs["advantages"] 151 | # per_token_loss = torch.exp(per_token_logps - per_token_logps.detach()) * advantages.unsqueeze(1) 152 | # per_token_loss = -(per_token_loss - self.beta * per_token_kl) 153 | # loss = ((per_token_loss * completion_mask).sum(dim=1) / completion_mask.sum(dim=1)).mean() 154 | input_ids = input_ids[:, -logits_to_keep:] 155 | if per_token_logps is not None: 156 | loss, completion_length, mean_kl = grpo_compute_loss_slow( 157 | ref_per_token_logps, per_token_logps, input_ids, completion_mask, self.beta, advantages, 158 | ) 159 | else: 160 | loss, completion_length, mean_kl = grpo_accumulated_loss( 161 | self, _input_ids, logits_to_keep, completion_mask, advantages, 162 | n_chunks = self.args.unsloth_num_chunks, 163 | ) 164 | 165 | # Log the metrics 166 | # completion_length = self.accelerator.gather_for_metrics(completion_mask.sum(1)).float().mean().item() 167 | 168 | # mean_kl = ((per_token_kl * completion_mask).sum(dim=1) / completion_mask.sum(dim=1)).mean() 169 | # self._metrics["kl"].append(self.accelerator.gather_for_metrics(mean_kl).mean().item()) 170 | 171 | if "train" in self._metrics: 172 | mode = "eval" if self.control.should_evaluate else "train" 173 | self._metrics[mode]["completion_length"].append(completion_length.item()) 174 | self._metrics[mode]["kl"].append(mean_kl.item()) 175 | else: 176 | self._metrics["completion_length"].append(completion_length.item()) 177 | self._metrics["kl"].append(mean_kl.item()) 178 | return loss 179 | --------------------------------------------------------------------------------