├── .gitignore
├── experiments
├── unsloth
│ ├── v1_7B_dev
│ │ ├── .config.yaml.swp
│ │ └── config.yaml
│ └── v1_dev
│ │ └── config.yaml
├── grammar
│ ├── sample_output.txt
│ ├── unit_test_grammar.txt
│ ├── unit_test_grammar.txt.bak
│ └── unit_test_grammar.txt_old
├── vllm_config.yaml
├── u3_sft_7B
│ └── run.sh
├── u3_sft_1e-4
│ └── run.sh
├── u3_sft_1e-5
│ └── run.sh
├── u3_sft_1e-6
│ └── run.sh
├── u3_sft_3e-4
│ └── run.sh
├── u3_sft_3e-5
│ └── run.sh
├── u1_7B
│ ├── eval_config.yaml
│ ├── eval_50_config.yaml
│ ├── eval_pi_config.yaml
│ ├── eval_pi_50_config.yaml
│ ├── eval_pi_train_50_config.yaml
│ ├── eval_pi_train_config.yaml
│ └── run.sh
├── u2_7B
│ ├── eval_config.yaml
│ ├── eval_50_config.yaml
│ ├── eval_pi_config.yaml
│ ├── eval_pi_50_config.yaml
│ ├── eval_pi_train_50_config.yaml
│ ├── eval_pi_train_config.yaml
│ └── run.sh
├── u1_14B
│ ├── eval_50_config.yaml
│ ├── eval_config.yaml
│ ├── eval_pi_config.yaml
│ ├── eval_pi_50_config.yaml
│ ├── eval_pi_train_50_config.yaml
│ ├── eval_pi_train_config.yaml
│ └── run.sh
├── u2_7B_3e-5
│ ├── eval_config.yaml
│ ├── eval_50_config.yaml
│ ├── eval_pi_config.yaml
│ ├── eval_pi_50_config.yaml
│ ├── eval_pi_train_50_config.yaml
│ ├── eval_pi_train_config.yaml
│ └── run.sh
├── u2_7B_gram
│ ├── eval_config.yaml
│ ├── eval_50_config.yaml
│ ├── eval_pi_config.yaml
│ ├── eval_pi_50_config.yaml
│ ├── eval_pi_train_50_config.yaml
│ ├── eval_pi_train_config.yaml
│ └── run.sh
├── u4_7B_gram
│ ├── eval_config.yaml
│ ├── eval_50_config.yaml
│ ├── eval_pi_config.yaml
│ ├── eval_pi_50_config.yaml
│ ├── eval_pi_train_50_config.yaml
│ ├── eval_pi_train_config.yaml
│ └── run.sh
├── u5_7B_gram
│ ├── eval_config.yaml
│ ├── eval_50_config.yaml
│ ├── eval_pi_config.yaml
│ ├── eval_pi_50_config.yaml
│ ├── eval_pi_train_50_config.yaml
│ ├── eval_pi_train_config.yaml
│ └── run.sh
├── u6_7B_form
│ ├── eval_config.yaml
│ ├── eval_50_config.yaml
│ ├── eval_pi_config.yaml
│ ├── eval_pi_50_config.yaml
│ ├── eval_pi_train_50_config.yaml
│ ├── eval_pi_train_config.yaml
│ └── run.sh
├── u6_7B_fs
│ ├── eval_config.yaml
│ ├── eval_50_config.yaml
│ ├── eval_pi_50_config.yaml
│ ├── eval_pi_config.yaml
│ ├── eval_pi_train_config.yaml
│ ├── eval_pi_train_50_config.yaml
│ └── run.sh
├── u6_7B_gram
│ ├── eval_config.yaml
│ ├── eval_50_config.yaml
│ ├── eval_pi_config.yaml
│ ├── eval_pi_50_config.yaml
│ ├── eval_pi_train_50_config.yaml
│ ├── eval_pi_train_config.yaml
│ └── run.sh
├── u6_7B_sft
│ ├── eval_config.yaml
│ ├── eval_50_config.yaml
│ ├── eval_pi_50_config.yaml
│ ├── eval_pi_config.yaml
│ ├── eval_pi_train_config.yaml
│ ├── eval_pi_train_50_config.yaml
│ └── run.sh
├── u6_colo
│ ├── eval_50_config.yaml
│ ├── eval_config.yaml
│ ├── eval_pi_config.yaml
│ ├── eval_pi_50_config.yaml
│ ├── eval_pi_train_config.yaml
│ ├── eval_pi_train_50_config.yaml
│ └── run.sh
├── u2_7B_base_sft
│ ├── eval_config.yaml
│ ├── eval_50_config.yaml
│ ├── eval_pi_50_config.yaml
│ ├── eval_pi_config.yaml
│ ├── eval_pi_train_config.yaml
│ ├── eval_pi_train_50_config.yaml
│ └── run.sh
└── u2_7B_few_shot
│ ├── eval_config.yaml
│ ├── eval_50_config.yaml
│ ├── eval_pi_50_config.yaml
│ ├── eval_pi_config.yaml
│ ├── eval_pi_train_config.yaml
│ ├── eval_pi_train_50_config.yaml
│ └── run.sh
├── unit_tests_server
├── add_reference_code.ipynb
├── convert_hf_to_verl_dataset.py
├── check_grammar.py
├── test_unit_test.py
├── verl_unit.py
├── train_unsloth.py
├── convert_dataset_to_chat.ipynb
└── unsloth_train_patch.py
├── README.md
├── check_grammar.py
└── requirements.txt
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__/
2 | unit_tests_server/__pycache__/
3 |
--------------------------------------------------------------------------------
/experiments/unsloth/v1_7B_dev/.config.yaml.swp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rosmineb/unit_test_rl/HEAD/experiments/unsloth/v1_7B_dev/.config.yaml.swp
--------------------------------------------------------------------------------
/experiments/grammar/sample_output.txt:
--------------------------------------------------------------------------------
1 | Here is the Python code to solve the problem:
2 |
3 | ```python
4 | def sum():
5 | a, b = map(int, input().split())
6 | return a + b
7 | ```
8 | ### Unit Tests
9 |
10 | 1 13
11 | -1 10<|im_end|>
12 |
--------------------------------------------------------------------------------
/experiments/grammar/unit_test_grammar.txt:
--------------------------------------------------------------------------------
1 | root ::= intro codeBlock testBlock (end)*
2 | intro ::= [^`#]*
3 | newline ::= "\n"
4 | codeBlock ::= "```python" newline [^`]* "```" newline
5 | testBlock ::= testIntro (testLine)*
6 | testIntro ::= "### Unit Tests" newline newline
7 | notBracket ::= [^<]*
8 | testLine ::= "" notBracket "" notBracket "" (newline)*
9 | end ::= "<|im_end|>"
10 |
11 |
--------------------------------------------------------------------------------
/experiments/grammar/unit_test_grammar.txt.bak:
--------------------------------------------------------------------------------
1 | root ::= intro codeBlock testBlock (end)*
2 | intro ::= [^`#]*
3 | newline ::= "\n"
4 | codeBlock ::= "```python" newline [^`#]* "```" newline
5 | testBlock ::= testIntro (testLine)*
6 | testIntro ::= "### Unit Tests" newline newline
7 | notBracket ::= [^<]*
8 | testLine ::= "" notBracket "" notBracket "" (newline)*
9 | end ::= "<|im_end|>"
10 |
11 |
--------------------------------------------------------------------------------
/experiments/vllm_config.yaml:
--------------------------------------------------------------------------------
1 | label_field: "verification_info"
2 | base_model_name: "Qwen/Qwen2.5-Coder-7B-Instruct"
3 | device_groups:
4 | - 1
5 | - 2
6 | - 3
7 | - 4
8 | - 5
9 | - 6
10 | vllm_config:
11 | gpu_memory_utilization: 0.9
12 | max_model_len: 12288
13 | max_num_batched_tokens: 98304
14 | generation_num_copies: 1
15 | temperature: 1.0
16 | repetition_penalty: 0.0
17 | max_tokens: 12288
18 | tensor_parallel_size: 1
19 | batch_size: 200
20 |
--------------------------------------------------------------------------------
/unit_tests_server/add_reference_code.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": []
9 | }
10 | ],
11 | "metadata": {
12 | "kernelspec": {
13 | "display_name": "transformers",
14 | "language": "python",
15 | "name": "python3"
16 | },
17 | "language_info": {
18 | "name": "python",
19 | "version": "3.11.0"
20 | }
21 | },
22 | "nbformat": 4,
23 | "nbformat_minor": 2
24 | }
25 |
--------------------------------------------------------------------------------
/experiments/grammar/unit_test_grammar.txt_old:
--------------------------------------------------------------------------------
1 | root ::= intro codeBlock testBlock (end)*
2 | intro ::= "Here is the Python code to solve the problem:" newline newline
3 | newline ::= "\n"
4 | codeBlock ::= "```python\n" pythonBlock "```" newline newline
5 | pythonBlock ::= (pythonLine)*
6 | pythonLine ::= [^\n]* newline
7 | testBlock ::= testIntro (testLine)*
8 | testIntro ::= "### Unit Tests" newline newline
9 | testLine ::= "[^<]*[^<]*" (newline)*
10 | end ::= "<|im_end|>"
11 |
12 |
--------------------------------------------------------------------------------
/experiments/unsloth/v1_dev/config.yaml:
--------------------------------------------------------------------------------
1 | model_name: "Qwen/Qwen2.5-Coder-1.5B-Instruct"
2 | load_in_4bit: false
3 | load_in_8bit: false
4 | max_seq_length: 4096
5 | lora_rank: 64
6 | lora_alpha: 128
7 | learning_rate: 1.0e-5
8 | num_generations: 2
9 | gradient_accumulation_steps: 4
10 | max_prompt_length: 2048
11 | max_steps: 1000
12 | save_steps: 5
13 | output_dir: /home/rosmine/data2/rl/v1_dev
14 | vllm_mode: "colocate"
15 | dataset_path: /home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_with_unit_tests_v2_train_with_reference_no_examples_chat
16 | per_device_train_batch_size: 2
17 | generation_batch_size: 24
18 | reward_function_path: "./unit_tests_server/verl_unit.py"
19 | reward_function_name: "hf_reward_fn"
20 |
--------------------------------------------------------------------------------
/experiments/unsloth/v1_7B_dev/config.yaml:
--------------------------------------------------------------------------------
1 | model_name: "Qwen/Qwen2.5-Coder-7B-Instruct"
2 | load_in_4bit: true
3 | load_in_8bit: false
4 | max_seq_length: 2560
5 | lora_rank: 64
6 | lora_alpha: 128
7 | learning_rate: 1.0e-5
8 | num_generations: 16
9 | #generation_batch_size: 512
10 | steps_per_generation: 4
11 | gradient_accumulation_steps: 4
12 | max_prompt_length: 2048
13 | max_steps: 1000
14 | save_steps: 5
15 | output_dir: /home/rosmine/data2/rl/v1_dev
16 | vllm_mode: "colocate"
17 | dataset_path: /home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_with_unit_tests_v2_train_with_reference_no_examples_chat
18 | per_device_train_batch_size: 16
19 | reward_function_path: "./unit_tests_server/verl_unit.py"
20 | reward_function_name: "hf_reward_fn"
21 |
--------------------------------------------------------------------------------
/experiments/u3_sft_7B/run.sh:
--------------------------------------------------------------------------------
1 | exp_name=u3_sft_7B
2 |
3 | DATA_DIR=/home/rosmine/data2/rl/${exp_name}
4 |
5 | deepspeed --module openrlhf.cli.train_sft \
6 | --max_len 2048 \
7 | --dataset /home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_with_unit_tests_v2_sft_formatted \
8 | --input_key prompt \
9 | --output_key response \
10 | --train_batch_size 48 \
11 | --apply_chat_template \
12 | --micro_train_batch_size 2 \
13 | --max_samples 500000 \
14 | --pretrain Qwen/Qwen2.5-Coder-7B-Instruct \
15 | --save_path /home/rosmine/data2/rl/${exp_name}/checkpoint \
16 | --save_steps 5 \
17 | --logging_steps 1 \
18 | --eval_steps -1 \
19 | --zero_stage 2 \
20 | --max_epochs 3 \
21 | --bf16 \
22 | --gradient_checkpointing \
23 | --flash_attn \
24 | --learning_rate 3e-6 \
25 | --lora_rank 64 \
26 | --lora_alpha 128 \
27 | --use_tensorboard ${DATA_DIR}/logs/${exp_name} \
28 |
29 |
--------------------------------------------------------------------------------
/experiments/u3_sft_1e-4/run.sh:
--------------------------------------------------------------------------------
1 | exp_name=u3_sft_1e-4
2 |
3 | DATA_DIR=/home/rosmine/data2/rl/${exp_name}
4 |
5 | deepspeed --module openrlhf.cli.train_sft \
6 | --max_len 2048 \
7 | --dataset /home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_with_unit_tests_v2_sft_formatted \
8 | --input_key prompt \
9 | --output_key response \
10 | --train_batch_size 48 \
11 | --apply_chat_template \
12 | --micro_train_batch_size 2 \
13 | --max_samples 500000 \
14 | --pretrain Qwen/Qwen2.5-Coder-7B-Instruct \
15 | --save_path /home/rosmine/data2/rl/${exp_name}/checkpoint \
16 | --save_steps 5 \
17 | --logging_steps 1 \
18 | --eval_steps -1 \
19 | --zero_stage 2 \
20 | --max_epochs 3 \
21 | --bf16 \
22 | --gradient_checkpointing \
23 | --flash_attn \
24 | --learning_rate 1e-4 \
25 | --lora_rank 64 \
26 | --lora_alpha 128 \
27 | --use_tensorboard ${DATA_DIR}/logs/${exp_name} \
28 |
29 |
--------------------------------------------------------------------------------
/experiments/u3_sft_1e-5/run.sh:
--------------------------------------------------------------------------------
1 | exp_name=u3_sft_1e-5
2 |
3 | DATA_DIR=/home/rosmine/data2/rl/${exp_name}
4 |
5 | deepspeed --module openrlhf.cli.train_sft \
6 | --max_len 2048 \
7 | --dataset /home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_with_unit_tests_v2_sft_formatted \
8 | --input_key prompt \
9 | --output_key response \
10 | --train_batch_size 48 \
11 | --apply_chat_template \
12 | --micro_train_batch_size 2 \
13 | --max_samples 500000 \
14 | --pretrain Qwen/Qwen2.5-Coder-7B-Instruct \
15 | --save_path /home/rosmine/data2/rl/${exp_name}/checkpoint \
16 | --save_steps 5 \
17 | --logging_steps 1 \
18 | --eval_steps -1 \
19 | --zero_stage 2 \
20 | --max_epochs 3 \
21 | --bf16 \
22 | --gradient_checkpointing \
23 | --flash_attn \
24 | --learning_rate 1e-5 \
25 | --lora_rank 64 \
26 | --lora_alpha 128 \
27 | --use_tensorboard ${DATA_DIR}/logs/${exp_name} \
28 |
29 |
--------------------------------------------------------------------------------
/experiments/u3_sft_1e-6/run.sh:
--------------------------------------------------------------------------------
1 | exp_name=u3_sft_1e-6
2 |
3 | DATA_DIR=/home/rosmine/data2/rl/${exp_name}
4 |
5 | deepspeed --module openrlhf.cli.train_sft \
6 | --max_len 2048 \
7 | --dataset /home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_with_unit_tests_v2_sft_formatted \
8 | --input_key prompt \
9 | --output_key response \
10 | --train_batch_size 48 \
11 | --apply_chat_template \
12 | --micro_train_batch_size 2 \
13 | --max_samples 500000 \
14 | --pretrain Qwen/Qwen2.5-Coder-7B-Instruct \
15 | --save_path /home/rosmine/data2/rl/${exp_name}/checkpoint \
16 | --save_steps 5 \
17 | --logging_steps 1 \
18 | --eval_steps -1 \
19 | --zero_stage 2 \
20 | --max_epochs 3 \
21 | --bf16 \
22 | --gradient_checkpointing \
23 | --flash_attn \
24 | --learning_rate 1e-6 \
25 | --lora_rank 64 \
26 | --lora_alpha 128 \
27 | --use_tensorboard ${DATA_DIR}/logs/${exp_name} \
28 |
29 |
--------------------------------------------------------------------------------
/experiments/u3_sft_3e-4/run.sh:
--------------------------------------------------------------------------------
1 | exp_name=u3_sft_3e-4
2 |
3 | DATA_DIR=/home/rosmine/data2/rl/${exp_name}
4 |
5 | deepspeed --module openrlhf.cli.train_sft \
6 | --max_len 2048 \
7 | --dataset /home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_with_unit_tests_v2_sft_formatted \
8 | --input_key prompt \
9 | --output_key response \
10 | --train_batch_size 48 \
11 | --apply_chat_template \
12 | --micro_train_batch_size 2 \
13 | --max_samples 500000 \
14 | --pretrain Qwen/Qwen2.5-Coder-7B-Instruct \
15 | --save_path /home/rosmine/data2/rl/${exp_name}/checkpoint \
16 | --save_steps 5 \
17 | --logging_steps 1 \
18 | --eval_steps -1 \
19 | --zero_stage 2 \
20 | --max_epochs 3 \
21 | --bf16 \
22 | --gradient_checkpointing \
23 | --flash_attn \
24 | --learning_rate 3e-4 \
25 | --lora_rank 64 \
26 | --lora_alpha 128 \
27 | --use_tensorboard ${DATA_DIR}/logs/${exp_name} \
28 |
29 |
--------------------------------------------------------------------------------
/experiments/u3_sft_3e-5/run.sh:
--------------------------------------------------------------------------------
1 | exp_name=u3_sft_3e-5
2 |
3 | DATA_DIR=/home/rosmine/data2/rl/${exp_name}
4 |
5 | deepspeed --module openrlhf.cli.train_sft \
6 | --max_len 2048 \
7 | --dataset /home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_with_unit_tests_v2_sft_formatted \
8 | --input_key prompt \
9 | --output_key response \
10 | --train_batch_size 48 \
11 | --apply_chat_template \
12 | --micro_train_batch_size 2 \
13 | --max_samples 500000 \
14 | --pretrain Qwen/Qwen2.5-Coder-7B-Instruct \
15 | --save_path /home/rosmine/data2/rl/${exp_name}/checkpoint \
16 | --save_steps 5 \
17 | --logging_steps 1 \
18 | --eval_steps -1 \
19 | --zero_stage 2 \
20 | --max_epochs 3 \
21 | --bf16 \
22 | --gradient_checkpointing \
23 | --flash_attn \
24 | --learning_rate 3e-5 \
25 | --lora_rank 64 \
26 | --lora_alpha 128 \
27 | --use_tensorboard ${DATA_DIR}/logs/${exp_name} \
28 |
29 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # unit_test_rl
2 | Project code for training LLMs to write better unit tests + code
3 |
4 | Warning: This is experiment code and isn't necessarily production-ready or easy to use (e.g., there may be hardcoded paths to local data files). I'm just sharing some experiments that I thought were fun.
5 |
6 | You may need this fork of OpenRLHF, which I modified to support guided decoding and to pass arguments to the reward model: https://github.com/rosmineb/OpenRLHF (branch custom_changes_main)
7 |
8 | To run:
9 |
10 | 1. Start the reward server: `python unit_tests_server/unit_test_reward_server.py --workers 8 --port 5432`
11 | 2. Run an experiment's run.sh script, e.g. `source experiments/u6_7B_form/run.sh`
12 |
13 | These experiments support using a grammar for guided decoding; see the u6_7B_gram experiment. You can check that a grammar parses an output with the following command. (A warning: using an SFT'd model is much better than using a grammar, and potentially less work.)
14 |
15 | `python unit_tests_server/check_grammar.py --grammar_file experiments/grammar/unit_test_grammar.txt --string_file experiments/grammar/sample_output.txt --model_id Qwen/Qwen2.5-Coder-7B-Instruct`
16 |
--------------------------------------------------------------------------------
/experiments/u1_7B/eval_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/humaneval"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | prompt_field: "prompt"
10 | label_field: "labels"
11 | debug: false
12 | reward_server_host: "0.0.0.0"
13 | reward_server_port: 5000
14 | response_begin_str: null
15 | code_start_str: "```python\n"
16 | code_end_str: "```"
17 | device_groups:
18 | - "cuda:0"
19 | - "cuda:1"
20 | - "cuda:2"
21 | - "cuda:3"
22 | - "cuda:4"
23 | - "cuda:5"
24 | code_format: "humaneval"
25 | success_threshold: 0
26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
27 | vllm_port: 8000
28 | max_new_tokens: 10240
29 | vllm_config:
30 | gpu_memory_utilization: 0.9
31 | max_model_len: 12288
32 | max_num_batched_tokens: 98304
33 | generation_num_copies: 1
34 | temperature: 0.5
35 | repetition_penalty: 0.0
36 | max_tokens: 12288
37 | tensor_parallel_size: 1
38 |
39 |
--------------------------------------------------------------------------------
/experiments/u2_7B/eval_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/humaneval"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | prompt_field: "prompt"
10 | label_field: "labels"
11 | debug: false
12 | reward_server_host: "0.0.0.0"
13 | reward_server_port: 5000
14 | response_begin_str: null
15 | code_start_str: "```python\n"
16 | code_end_str: "```"
17 | device_groups:
18 | - "cuda:0"
19 | - "cuda:1"
20 | - "cuda:2"
21 | - "cuda:3"
22 | - "cuda:4"
23 | - "cuda:5"
24 | code_format: "humaneval"
25 | success_threshold: 0
26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
27 | vllm_port: 8000
28 | max_new_tokens: 10240
29 | vllm_config:
30 | gpu_memory_utilization: 0.9
31 | max_model_len: 12288
32 | max_num_batched_tokens: 98304
33 | generation_num_copies: 1
34 | temperature: 0.5
35 | repetition_penalty: 0.0
36 | max_tokens: 12288
37 | tensor_parallel_size: 1
38 |
39 |
--------------------------------------------------------------------------------
/experiments/u1_14B/eval_50_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/humaneval"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | prompt_field: "prompt"
10 | label_field: "labels"
11 | debug: false
12 | reward_server_host: "0.0.0.0"
13 | reward_server_port: 5000
14 | response_begin_str: null
15 | code_start_str: "```python\n"
16 | code_end_str: "```"
17 | device_groups:
18 | - "cuda:0"
19 | - "cuda:1"
20 | - "cuda:2"
21 | - "cuda:3"
22 | - "cuda:4"
23 | - "cuda:5"
24 | code_format: "humaneval"
25 | success_threshold: 0
26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
27 | vllm_port: 8000
28 | max_new_tokens: 10240
29 | vllm_config:
30 | gpu_memory_utilization: 0.9
31 | max_model_len: 12288
32 | max_num_batched_tokens: 98304
33 | generation_num_copies: 1
34 | temperature: 0.5
35 | repetition_penalty: 0.0
36 | max_tokens: 12288
37 | tensor_parallel_size: 1
38 |
39 |
--------------------------------------------------------------------------------
/experiments/u1_14B/eval_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/humaneval"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | prompt_field: "prompt"
10 | label_field: "labels"
11 | debug: false
12 | reward_server_host: "0.0.0.0"
13 | reward_server_port: 5000
14 | response_begin_str: null
15 | code_start_str: "```python\n"
16 | code_end_str: "```"
17 | device_groups:
18 | - "cuda:0"
19 | - "cuda:1"
20 | - "cuda:2"
21 | - "cuda:3"
22 | - "cuda:4"
23 | - "cuda:5"
24 | code_format: "humaneval"
25 | success_threshold: 0
26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
27 | vllm_port: 8000
28 | max_new_tokens: 10240
29 | vllm_config:
30 | gpu_memory_utilization: 0.9
31 | max_model_len: 12288
32 | max_num_batched_tokens: 98304
33 | generation_num_copies: 1
34 | temperature: 0.5
35 | repetition_penalty: 0.0
36 | max_tokens: 12288
37 | tensor_parallel_size: 1
38 |
39 |
--------------------------------------------------------------------------------
/experiments/u1_7B/eval_50_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/humaneval"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | prompt_field: "prompt"
10 | label_field: "labels"
11 | debug: false
12 | reward_server_host: "0.0.0.0"
13 | reward_server_port: 5000
14 | response_begin_str: null
15 | code_start_str: "```python\n"
16 | code_end_str: "```"
17 | device_groups:
18 | - "cuda:0"
19 | - "cuda:1"
20 | - "cuda:2"
21 | - "cuda:3"
22 | - "cuda:4"
23 | - "cuda:5"
24 | code_format: "humaneval"
25 | success_threshold: 0
26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
27 | vllm_port: 8000
28 | max_new_tokens: 10240
29 | vllm_config:
30 | gpu_memory_utilization: 0.9
31 | max_model_len: 12288
32 | max_num_batched_tokens: 98304
33 | generation_num_copies: 1
34 | temperature: 0.5
35 | repetition_penalty: 0.0
36 | max_tokens: 12288
37 | tensor_parallel_size: 1
38 |
39 |
--------------------------------------------------------------------------------
/experiments/u2_7B/eval_50_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/humaneval"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | prompt_field: "prompt"
10 | label_field: "labels"
11 | debug: false
12 | reward_server_host: "0.0.0.0"
13 | reward_server_port: 5000
14 | response_begin_str: null
15 | code_start_str: "```python\n"
16 | code_end_str: "```"
17 | device_groups:
18 | - "cuda:0"
19 | - "cuda:1"
20 | - "cuda:2"
21 | - "cuda:3"
22 | - "cuda:4"
23 | - "cuda:5"
24 | code_format: "humaneval"
25 | success_threshold: 0
26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
27 | vllm_port: 8000
28 | max_new_tokens: 10240
29 | vllm_config:
30 | gpu_memory_utilization: 0.9
31 | max_model_len: 12288
32 | max_num_batched_tokens: 98304
33 | generation_num_copies: 1
34 | temperature: 0.5
35 | repetition_penalty: 0.0
36 | max_tokens: 12288
37 | tensor_parallel_size: 1
38 |
39 |
--------------------------------------------------------------------------------
/experiments/u2_7B_3e-5/eval_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/humaneval"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | prompt_field: "prompt"
10 | label_field: "labels"
11 | debug: false
12 | reward_server_host: "0.0.0.0"
13 | reward_server_port: 5000
14 | response_begin_str: null
15 | code_start_str: "```python\n"
16 | code_end_str: "```"
17 | device_groups:
18 | - "cuda:0"
19 | - "cuda:1"
20 | - "cuda:2"
21 | - "cuda:3"
22 | - "cuda:4"
23 | - "cuda:5"
24 | code_format: "humaneval"
25 | success_threshold: 0
26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
27 | vllm_port: 8000
28 | max_new_tokens: 10240
29 | vllm_config:
30 | gpu_memory_utilization: 0.9
31 | max_model_len: 12288
32 | max_num_batched_tokens: 98304
33 | generation_num_copies: 1
34 | temperature: 0.5
35 | repetition_penalty: 0.0
36 | max_tokens: 12288
37 | tensor_parallel_size: 1
38 |
39 |
--------------------------------------------------------------------------------
/experiments/u2_7B_gram/eval_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/humaneval"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | prompt_field: "prompt"
10 | label_field: "labels"
11 | debug: false
12 | reward_server_host: "0.0.0.0"
13 | reward_server_port: 5000
14 | response_begin_str: null
15 | code_start_str: "```python\n"
16 | code_end_str: "```"
17 | device_groups:
18 | - "cuda:0"
19 | - "cuda:1"
20 | - "cuda:2"
21 | - "cuda:3"
22 | - "cuda:4"
23 | - "cuda:5"
24 | code_format: "humaneval"
25 | success_threshold: 0
26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
27 | vllm_port: 8000
28 | max_new_tokens: 10240
29 | vllm_config:
30 | gpu_memory_utilization: 0.9
31 | max_model_len: 12288
32 | max_num_batched_tokens: 98304
33 | generation_num_copies: 1
34 | temperature: 0.5
35 | repetition_penalty: 0.0
36 | max_tokens: 12288
37 | tensor_parallel_size: 1
38 |
39 |
--------------------------------------------------------------------------------
/experiments/u4_7B_gram/eval_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/humaneval"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | prompt_field: "prompt"
10 | label_field: "labels"
11 | debug: false
12 | reward_server_host: "0.0.0.0"
13 | reward_server_port: 5000
14 | response_begin_str: null
15 | code_start_str: "```python\n"
16 | code_end_str: "```"
17 | device_groups:
18 | - "cuda:0"
19 | - "cuda:1"
20 | - "cuda:2"
21 | - "cuda:3"
22 | - "cuda:4"
23 | - "cuda:5"
24 | code_format: "humaneval"
25 | success_threshold: 0
26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
27 | vllm_port: 8000
28 | max_new_tokens: 10240
29 | vllm_config:
30 | gpu_memory_utilization: 0.9
31 | max_model_len: 12288
32 | max_num_batched_tokens: 98304
33 | generation_num_copies: 1
34 | temperature: 0.5
35 | repetition_penalty: 0.0
36 | max_tokens: 12288
37 | tensor_parallel_size: 1
38 |
39 |
--------------------------------------------------------------------------------
/experiments/u5_7B_gram/eval_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/humaneval"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | prompt_field: "prompt"
10 | label_field: "labels"
11 | debug: false
12 | reward_server_host: "0.0.0.0"
13 | reward_server_port: 5000
14 | response_begin_str: null
15 | code_start_str: "```python\n"
16 | code_end_str: "```"
17 | device_groups:
18 | - "cuda:0"
19 | - "cuda:1"
20 | - "cuda:2"
21 | - "cuda:3"
22 | - "cuda:4"
23 | - "cuda:5"
24 | code_format: "humaneval"
25 | success_threshold: 0
26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
27 | vllm_port: 8000
28 | max_new_tokens: 10240
29 | vllm_config:
30 | gpu_memory_utilization: 0.9
31 | max_model_len: 12288
32 | max_num_batched_tokens: 98304
33 | generation_num_copies: 1
34 | temperature: 0.5
35 | repetition_penalty: 0.0
36 | max_tokens: 12288
37 | tensor_parallel_size: 1
38 |
39 |
--------------------------------------------------------------------------------
/experiments/u6_7B_form/eval_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/humaneval"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | prompt_field: "prompt"
10 | label_field: "labels"
11 | debug: false
12 | reward_server_host: "0.0.0.0"
13 | reward_server_port: 5000
14 | response_begin_str: null
15 | code_start_str: "```python\n"
16 | code_end_str: "```"
17 | device_groups:
18 | - "cuda:0"
19 | - "cuda:1"
20 | - "cuda:2"
21 | - "cuda:3"
22 | - "cuda:4"
23 | - "cuda:5"
24 | code_format: "humaneval"
25 | success_threshold: 0
26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
27 | vllm_port: 8000
28 | max_new_tokens: 10240
29 | vllm_config:
30 | gpu_memory_utilization: 0.9
31 | max_model_len: 12288
32 | max_num_batched_tokens: 98304
33 | generation_num_copies: 1
34 | temperature: 0.5
35 | repetition_penalty: 0.0
36 | max_tokens: 12288
37 | tensor_parallel_size: 1
38 |
39 |
--------------------------------------------------------------------------------
/experiments/u6_7B_fs/eval_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/humaneval"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | prompt_field: "prompt"
10 | label_field: "labels"
11 | debug: false
12 | reward_server_host: "0.0.0.0"
13 | reward_server_port: 5000
14 | response_begin_str: null
15 | code_start_str: "```python\n"
16 | code_end_str: "```"
17 | device_groups:
18 | - "cuda:0"
19 | - "cuda:1"
20 | - "cuda:2"
21 | - "cuda:3"
22 | - "cuda:4"
23 | - "cuda:5"
24 | code_format: "humaneval"
25 | success_threshold: 0
26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
27 | vllm_port: 8000
28 | max_new_tokens: 10240
29 | vllm_config:
30 | gpu_memory_utilization: 0.9
31 | max_model_len: 12288
32 | max_num_batched_tokens: 98304
33 | generation_num_copies: 1
34 | temperature: 0.5
35 | repetition_penalty: 0.0
36 | max_tokens: 12288
37 | tensor_parallel_size: 1
38 |
39 |
--------------------------------------------------------------------------------
/experiments/u6_7B_gram/eval_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/humaneval"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | prompt_field: "prompt"
10 | label_field: "labels"
11 | debug: false
12 | reward_server_host: "0.0.0.0"
13 | reward_server_port: 5000
14 | response_begin_str: null
15 | code_start_str: "```python\n"
16 | code_end_str: "```"
17 | device_groups:
18 | - "cuda:0"
19 | - "cuda:1"
20 | - "cuda:2"
21 | - "cuda:3"
22 | - "cuda:4"
23 | - "cuda:5"
24 | code_format: "humaneval"
25 | success_threshold: 0
26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
27 | vllm_port: 8000
28 | max_new_tokens: 10240
29 | vllm_config:
30 | gpu_memory_utilization: 0.9
31 | max_model_len: 12288
32 | max_num_batched_tokens: 98304
33 | generation_num_copies: 1
34 | temperature: 0.5
35 | repetition_penalty: 0.0
36 | max_tokens: 12288
37 | tensor_parallel_size: 1
38 |
39 |
--------------------------------------------------------------------------------
/experiments/u6_7B_sft/eval_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/humaneval"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | prompt_field: "prompt"
10 | label_field: "labels"
11 | debug: false
12 | reward_server_host: "0.0.0.0"
13 | reward_server_port: 5000
14 | response_begin_str: null
15 | code_start_str: "```python\n"
16 | code_end_str: "```"
17 | device_groups:
18 | - "cuda:0"
19 | - "cuda:1"
20 | - "cuda:2"
21 | - "cuda:3"
22 | - "cuda:4"
23 | - "cuda:5"
24 | code_format: "humaneval"
25 | success_threshold: 0
26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
27 | vllm_port: 8000
28 | max_new_tokens: 10240
29 | vllm_config:
30 | gpu_memory_utilization: 0.9
31 | max_model_len: 12288
32 | max_num_batched_tokens: 98304
33 | generation_num_copies: 1
34 | temperature: 0.5
35 | repetition_penalty: 0.0
36 | max_tokens: 12288
37 | tensor_parallel_size: 1
38 |
39 |
--------------------------------------------------------------------------------
/experiments/u6_colo/eval_50_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/humaneval"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | prompt_field: "prompt"
10 | label_field: "labels"
11 | debug: false
12 | reward_server_host: "0.0.0.0"
13 | reward_server_port: 5000
14 | response_begin_str: null
15 | code_start_str: "```python\n"
16 | code_end_str: "```"
17 | device_groups:
18 | - "cuda:0"
19 | - "cuda:1"
20 | - "cuda:2"
21 | - "cuda:3"
22 | - "cuda:4"
23 | - "cuda:5"
24 | code_format: "humaneval"
25 | success_threshold: 0
26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
27 | vllm_port: 8000
28 | max_new_tokens: 10240
29 | vllm_config:
30 | gpu_memory_utilization: 0.9
31 | max_model_len: 12288
32 | max_num_batched_tokens: 98304
33 | generation_num_copies: 1
34 | temperature: 0.5
35 | repetition_penalty: 0.0
36 | max_tokens: 12288
37 | tensor_parallel_size: 1
38 |
39 |
--------------------------------------------------------------------------------
/experiments/u6_colo/eval_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/humaneval"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | prompt_field: "prompt"
10 | label_field: "labels"
11 | debug: false
12 | reward_server_host: "0.0.0.0"
13 | reward_server_port: 5000
14 | response_begin_str: null
15 | code_start_str: "```python\n"
16 | code_end_str: "```"
17 | device_groups:
18 | - "cuda:0"
19 | - "cuda:1"
20 | - "cuda:2"
21 | - "cuda:3"
22 | - "cuda:4"
23 | - "cuda:5"
24 | code_format: "humaneval"
25 | success_threshold: 0
26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
27 | vllm_port: 8000
28 | max_new_tokens: 10240
29 | vllm_config:
30 | gpu_memory_utilization: 0.9
31 | max_model_len: 12288
32 | max_num_batched_tokens: 98304
33 | generation_num_copies: 1
34 | temperature: 0.5
35 | repetition_penalty: 0.0
36 | max_tokens: 12288
37 | tensor_parallel_size: 1
38 |
39 |
--------------------------------------------------------------------------------
/experiments/u2_7B_3e-5/eval_50_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/humaneval"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | prompt_field: "prompt"
10 | label_field: "labels"
11 | debug: false
12 | reward_server_host: "0.0.0.0"
13 | reward_server_port: 5000
14 | response_begin_str: null
15 | code_start_str: "```python\n"
16 | code_end_str: "```"
17 | device_groups:
18 | - "cuda:0"
19 | - "cuda:1"
20 | - "cuda:2"
21 | - "cuda:3"
22 | - "cuda:4"
23 | - "cuda:5"
24 | code_format: "humaneval"
25 | success_threshold: 0
26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
27 | vllm_port: 8000
28 | max_new_tokens: 10240
29 | vllm_config:
30 | gpu_memory_utilization: 0.9
31 | max_model_len: 12288
32 | max_num_batched_tokens: 98304
33 | generation_num_copies: 1
34 | temperature: 0.5
35 | repetition_penalty: 0.0
36 | max_tokens: 12288
37 | tensor_parallel_size: 1
38 |
39 |
--------------------------------------------------------------------------------
/experiments/u2_7B_base_sft/eval_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/humaneval"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | prompt_field: "prompt"
10 | label_field: "labels"
11 | debug: false
12 | reward_server_host: "0.0.0.0"
13 | reward_server_port: 5000
14 | response_begin_str: null
15 | code_start_str: "```python\n"
16 | code_end_str: "```"
17 | device_groups:
18 | - "cuda:0"
19 | - "cuda:1"
20 | - "cuda:2"
21 | - "cuda:3"
22 | - "cuda:4"
23 | - "cuda:5"
24 | code_format: "humaneval"
25 | success_threshold: 0
26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
27 | vllm_port: 8000
28 | max_new_tokens: 10240
29 | vllm_config:
30 | gpu_memory_utilization: 0.9
31 | max_model_len: 12288
32 | max_num_batched_tokens: 98304
33 | generation_num_copies: 1
34 | temperature: 0.5
35 | repetition_penalty: 0.0
36 | max_tokens: 12288
37 | tensor_parallel_size: 1
38 |
39 |
--------------------------------------------------------------------------------
/experiments/u2_7B_few_shot/eval_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/humaneval"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | prompt_field: "prompt"
10 | label_field: "labels"
11 | debug: false
12 | reward_server_host: "0.0.0.0"
13 | reward_server_port: 5000
14 | response_begin_str: null
15 | code_start_str: "```python\n"
16 | code_end_str: "```"
17 | device_groups:
18 | - "cuda:0"
19 | - "cuda:1"
20 | - "cuda:2"
21 | - "cuda:3"
22 | - "cuda:4"
23 | - "cuda:5"
24 | code_format: "humaneval"
25 | success_threshold: 0
26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
27 | vllm_port: 8000
28 | max_new_tokens: 10240
29 | vllm_config:
30 | gpu_memory_utilization: 0.9
31 | max_model_len: 12288
32 | max_num_batched_tokens: 98304
33 | generation_num_copies: 1
34 | temperature: 0.5
35 | repetition_penalty: 0.0
36 | max_tokens: 12288
37 | tensor_parallel_size: 1
38 |
39 |
--------------------------------------------------------------------------------
/experiments/u2_7B_gram/eval_50_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/humaneval"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | prompt_field: "prompt"
10 | label_field: "labels"
11 | debug: false
12 | reward_server_host: "0.0.0.0"
13 | reward_server_port: 5000
14 | response_begin_str: null
15 | code_start_str: "```python\n"
16 | code_end_str: "```"
17 | device_groups:
18 | - "cuda:0"
19 | - "cuda:1"
20 | - "cuda:2"
21 | - "cuda:3"
22 | - "cuda:4"
23 | - "cuda:5"
24 | code_format: "humaneval"
25 | success_threshold: 0
26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
27 | vllm_port: 8000
28 | max_new_tokens: 10240
29 | vllm_config:
30 | gpu_memory_utilization: 0.9
31 | max_model_len: 12288
32 | max_num_batched_tokens: 98304
33 | generation_num_copies: 1
34 | temperature: 0.5
35 | repetition_penalty: 0.0
36 | max_tokens: 12288
37 | tensor_parallel_size: 1
38 |
39 |
--------------------------------------------------------------------------------
/experiments/u4_7B_gram/eval_50_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/humaneval"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | prompt_field: "prompt"
10 | label_field: "labels"
11 | debug: false
12 | reward_server_host: "0.0.0.0"
13 | reward_server_port: 5000
14 | response_begin_str: null
15 | code_start_str: "```python\n"
16 | code_end_str: "```"
17 | device_groups:
18 | - "cuda:0"
19 | - "cuda:1"
20 | - "cuda:2"
21 | - "cuda:3"
22 | - "cuda:4"
23 | - "cuda:5"
24 | code_format: "humaneval"
25 | success_threshold: 0
26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
27 | vllm_port: 8000
28 | max_new_tokens: 10240
29 | vllm_config:
30 | gpu_memory_utilization: 0.9
31 | max_model_len: 12288
32 | max_num_batched_tokens: 98304
33 | generation_num_copies: 1
34 | temperature: 0.5
35 | repetition_penalty: 0.0
36 | max_tokens: 12288
37 | tensor_parallel_size: 1
38 |
39 |
--------------------------------------------------------------------------------
/experiments/u5_7B_gram/eval_50_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/humaneval"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | prompt_field: "prompt"
10 | label_field: "labels"
11 | debug: false
12 | reward_server_host: "0.0.0.0"
13 | reward_server_port: 5000
14 | response_begin_str: null
15 | code_start_str: "```python\n"
16 | code_end_str: "```"
17 | device_groups:
18 | - "cuda:0"
19 | - "cuda:1"
20 | - "cuda:2"
21 | - "cuda:3"
22 | - "cuda:4"
23 | - "cuda:5"
24 | code_format: "humaneval"
25 | success_threshold: 0
26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
27 | vllm_port: 8000
28 | max_new_tokens: 10240
29 | vllm_config:
30 | gpu_memory_utilization: 0.9
31 | max_model_len: 12288
32 | max_num_batched_tokens: 98304
33 | generation_num_copies: 1
34 | temperature: 0.5
35 | repetition_penalty: 0.0
36 | max_tokens: 12288
37 | tensor_parallel_size: 1
38 |
39 |
--------------------------------------------------------------------------------
/experiments/u6_7B_form/eval_50_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/humaneval"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | prompt_field: "prompt"
10 | label_field: "labels"
11 | debug: false
12 | reward_server_host: "0.0.0.0"
13 | reward_server_port: 5000
14 | response_begin_str: null
15 | code_start_str: "```python\n"
16 | code_end_str: "```"
17 | device_groups:
18 | - "cuda:0"
19 | - "cuda:1"
20 | - "cuda:2"
21 | - "cuda:3"
22 | - "cuda:4"
23 | - "cuda:5"
24 | code_format: "humaneval"
25 | success_threshold: 0
26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
27 | vllm_port: 8000
28 | max_new_tokens: 10240
29 | vllm_config:
30 | gpu_memory_utilization: 0.9
31 | max_model_len: 12288
32 | max_num_batched_tokens: 98304
33 | generation_num_copies: 1
34 | temperature: 0.5
35 | repetition_penalty: 0.0
36 | max_tokens: 12288
37 | tensor_parallel_size: 1
38 |
39 |
--------------------------------------------------------------------------------
/experiments/u6_7B_fs/eval_50_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/humaneval"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | prompt_field: "prompt"
10 | label_field: "labels"
11 | debug: false
12 | reward_server_host: "0.0.0.0"
13 | reward_server_port: 5000
14 | response_begin_str: null
15 | code_start_str: "```python\n"
16 | code_end_str: "```"
17 | device_groups:
18 | - "cuda:0"
19 | - "cuda:1"
20 | - "cuda:2"
21 | - "cuda:3"
22 | - "cuda:4"
23 | - "cuda:5"
24 | code_format: "humaneval"
25 | success_threshold: 0
26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
27 | vllm_port: 8000
28 | max_new_tokens: 10240
29 | vllm_config:
30 | gpu_memory_utilization: 0.9
31 | max_model_len: 12288
32 | max_num_batched_tokens: 98304
33 | generation_num_copies: 1
34 | temperature: 0.5
35 | repetition_penalty: 0.0
36 | max_tokens: 12288
37 | tensor_parallel_size: 1
38 |
39 |
--------------------------------------------------------------------------------
/experiments/u6_7B_gram/eval_50_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/humaneval"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | prompt_field: "prompt"
10 | label_field: "labels"
11 | debug: false
12 | reward_server_host: "0.0.0.0"
13 | reward_server_port: 5000
14 | response_begin_str: null
15 | code_start_str: "```python\n"
16 | code_end_str: "```"
17 | device_groups:
18 | - "cuda:0"
19 | - "cuda:1"
20 | - "cuda:2"
21 | - "cuda:3"
22 | - "cuda:4"
23 | - "cuda:5"
24 | code_format: "humaneval"
25 | success_threshold: 0
26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
27 | vllm_port: 8000
28 | max_new_tokens: 10240
29 | vllm_config:
30 | gpu_memory_utilization: 0.9
31 | max_model_len: 12288
32 | max_num_batched_tokens: 98304
33 | generation_num_copies: 1
34 | temperature: 0.5
35 | repetition_penalty: 0.0
36 | max_tokens: 12288
37 | tensor_parallel_size: 1
38 |
39 |
--------------------------------------------------------------------------------
/experiments/u6_7B_sft/eval_50_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/humaneval"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | prompt_field: "prompt"
10 | label_field: "labels"
11 | debug: false
12 | reward_server_host: "0.0.0.0"
13 | reward_server_port: 5000
14 | response_begin_str: null
15 | code_start_str: "```python\n"
16 | code_end_str: "```"
17 | device_groups:
18 | - "cuda:0"
19 | - "cuda:1"
20 | - "cuda:2"
21 | - "cuda:3"
22 | - "cuda:4"
23 | - "cuda:5"
24 | code_format: "humaneval"
25 | success_threshold: 0
26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
27 | vllm_port: 8000
28 | max_new_tokens: 10240
29 | vllm_config:
30 | gpu_memory_utilization: 0.9
31 | max_model_len: 12288
32 | max_num_batched_tokens: 98304
33 | generation_num_copies: 1
34 | temperature: 0.5
35 | repetition_penalty: 0.0
36 | max_tokens: 12288
37 | tensor_parallel_size: 1
38 |
39 |
--------------------------------------------------------------------------------
/experiments/u2_7B_base_sft/eval_50_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/humaneval"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | prompt_field: "prompt"
10 | label_field: "labels"
11 | debug: false
12 | reward_server_host: "0.0.0.0"
13 | reward_server_port: 5000
14 | response_begin_str: null
15 | code_start_str: "```python\n"
16 | code_end_str: "```"
17 | device_groups:
18 | - "cuda:0"
19 | - "cuda:1"
20 | - "cuda:2"
21 | - "cuda:3"
22 | - "cuda:4"
23 | - "cuda:5"
24 | code_format: "humaneval"
25 | success_threshold: 0
26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
27 | vllm_port: 8000
28 | max_new_tokens: 10240
29 | vllm_config:
30 | gpu_memory_utilization: 0.9
31 | max_model_len: 12288
32 | max_num_batched_tokens: 98304
33 | generation_num_copies: 1
34 | temperature: 0.5
35 | repetition_penalty: 0.0
36 | max_tokens: 12288
37 | tensor_parallel_size: 1
38 |
39 |
--------------------------------------------------------------------------------
/experiments/u2_7B_few_shot/eval_50_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/humaneval"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | prompt_field: "prompt"
10 | label_field: "labels"
11 | debug: false
12 | reward_server_host: "0.0.0.0"
13 | reward_server_port: 5000
14 | response_begin_str: null
15 | code_start_str: "```python\n"
16 | code_end_str: "```"
17 | device_groups:
18 | - "cuda:0"
19 | - "cuda:1"
20 | - "cuda:2"
21 | - "cuda:3"
22 | - "cuda:4"
23 | - "cuda:5"
24 | code_format: "humaneval"
25 | success_threshold: 0
26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
27 | vllm_port: 8000
28 | max_new_tokens: 10240
29 | vllm_config:
30 | gpu_memory_utilization: 0.9
31 | max_model_len: 12288
32 | max_num_batched_tokens: 98304
33 | generation_num_copies: 1
34 | temperature: 0.5
35 | repetition_penalty: 0.0
36 | max_tokens: 12288
37 | tensor_parallel_size: 1
38 |
39 |
--------------------------------------------------------------------------------
/experiments/u1_14B/eval_pi_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_test"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | prompt_field: "prompt"
10 | label_field: "verification_info"
11 | debug: false
12 | reward_server_host: "0.0.0.0"
13 | reward_server_port: 5000
14 | response_begin_str: null
15 | code_start_str: "```python\n"
16 | code_end_str: "```"
17 | device_groups:
18 | - "cuda:0"
19 | - "cuda:1"
20 | - "cuda:2"
21 | - "cuda:3"
22 | - "cuda:4"
23 | - "cuda:5"
24 | code_format: "pi_verifiable"
25 | success_threshold: 0
26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
27 | vllm_port: 8000
28 | max_new_tokens: 10240
29 | vllm_config:
30 | gpu_memory_utilization: 0.9
31 | max_model_len: 12288
32 | max_num_batched_tokens: 98304
33 | generation_num_copies: 1
34 | temperature: 0.5
35 | repetition_penalty: 0.0
36 | max_tokens: 12288
37 | tensor_parallel_size: 1
38 |
39 |
--------------------------------------------------------------------------------
/experiments/u1_7B/eval_pi_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_test"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | prompt_field: "prompt"
10 | label_field: "verification_info"
11 | debug: false
12 | reward_server_host: "0.0.0.0"
13 | reward_server_port: 5000
14 | response_begin_str: null
15 | code_start_str: "```python\n"
16 | code_end_str: "```"
17 | device_groups:
18 | - "cuda:0"
19 | - "cuda:1"
20 | - "cuda:2"
21 | - "cuda:3"
22 | - "cuda:4"
23 | - "cuda:5"
24 | code_format: "pi_verifiable"
25 | success_threshold: 0
26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
27 | vllm_port: 8000
28 | max_new_tokens: 10240
29 | vllm_config:
30 | gpu_memory_utilization: 0.9
31 | max_model_len: 12288
32 | max_num_batched_tokens: 98304
33 | generation_num_copies: 1
34 | temperature: 0.5
35 | repetition_penalty: 0.0
36 | max_tokens: 12288
37 | tensor_parallel_size: 1
38 |
39 |
--------------------------------------------------------------------------------
/experiments/u2_7B/eval_pi_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_test"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | prompt_field: "prompt"
10 | label_field: "verification_info"
11 | debug: false
12 | reward_server_host: "0.0.0.0"
13 | reward_server_port: 5000
14 | response_begin_str: null
15 | code_start_str: "```python\n"
16 | code_end_str: "```"
17 | device_groups:
18 | - "cuda:0"
19 | - "cuda:1"
20 | - "cuda:2"
21 | - "cuda:3"
22 | - "cuda:4"
23 | - "cuda:5"
24 | code_format: "pi_verifiable"
25 | success_threshold: 0
26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
27 | vllm_port: 8000
28 | max_new_tokens: 10240
29 | vllm_config:
30 | gpu_memory_utilization: 0.9
31 | max_model_len: 12288
32 | max_num_batched_tokens: 98304
33 | generation_num_copies: 1
34 | temperature: 0.5
35 | repetition_penalty: 0.0
36 | max_tokens: 12288
37 | tensor_parallel_size: 1
38 |
39 |
--------------------------------------------------------------------------------
/experiments/u6_colo/eval_pi_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_test"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | prompt_field: "prompt"
10 | label_field: "verification_info"
11 | debug: false
12 | reward_server_host: "0.0.0.0"
13 | reward_server_port: 5000
14 | response_begin_str: null
15 | code_start_str: "```python\n"
16 | code_end_str: "```"
17 | device_groups:
18 | - "cuda:0"
19 | - "cuda:1"
20 | - "cuda:2"
21 | - "cuda:3"
22 | - "cuda:4"
23 | - "cuda:5"
24 | code_format: "pi_verifiable"
25 | success_threshold: 0
26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
27 | vllm_port: 8000
28 | max_new_tokens: 10240
29 | vllm_config:
30 | gpu_memory_utilization: 0.9
31 | max_model_len: 12288
32 | max_num_batched_tokens: 98304
33 | generation_num_copies: 1
34 | temperature: 0.5
35 | repetition_penalty: 0.0
36 | max_tokens: 12288
37 | tensor_parallel_size: 1
38 |
39 |
--------------------------------------------------------------------------------
/experiments/u1_14B/eval_pi_50_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_test"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | prompt_field: "prompt"
10 | label_field: "verification_info"
11 | debug: false
12 | reward_server_host: "0.0.0.0"
13 | reward_server_port: 5000
14 | response_begin_str: null
15 | code_start_str: "```python\n"
16 | code_end_str: "```"
17 | device_groups:
18 | - "cuda:0"
19 | - "cuda:1"
20 | - "cuda:2"
21 | - "cuda:3"
22 | - "cuda:4"
23 | - "cuda:5"
24 | code_format: "pi_verifiable"
25 | success_threshold: 0
26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
27 | vllm_port: 8000
28 | max_new_tokens: 10240
29 | vllm_config:
30 | gpu_memory_utilization: 0.9
31 | max_model_len: 12288
32 | max_num_batched_tokens: 98304
33 | generation_num_copies: 1
34 | temperature: 0.5
35 | repetition_penalty: 0.0
36 | max_tokens: 12288
37 | tensor_parallel_size: 1
38 |
39 |
--------------------------------------------------------------------------------
/experiments/u1_7B/eval_pi_50_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_test"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | prompt_field: "prompt"
10 | label_field: "verification_info"
11 | debug: false
12 | reward_server_host: "0.0.0.0"
13 | reward_server_port: 5000
14 | response_begin_str: null
15 | code_start_str: "```python\n"
16 | code_end_str: "```"
17 | device_groups:
18 | - "cuda:0"
19 | - "cuda:1"
20 | - "cuda:2"
21 | - "cuda:3"
22 | - "cuda:4"
23 | - "cuda:5"
24 | code_format: "pi_verifiable"
25 | success_threshold: 0
26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
27 | vllm_port: 8000
28 | max_new_tokens: 10240
29 | vllm_config:
30 | gpu_memory_utilization: 0.9
31 | max_model_len: 12288
32 | max_num_batched_tokens: 98304
33 | generation_num_copies: 1
34 | temperature: 0.5
35 | repetition_penalty: 0.0
36 | max_tokens: 12288
37 | tensor_parallel_size: 1
38 |
39 |
--------------------------------------------------------------------------------
/experiments/u2_7B/eval_pi_50_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_test"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | prompt_field: "prompt"
10 | label_field: "verification_info"
11 | debug: false
12 | reward_server_host: "0.0.0.0"
13 | reward_server_port: 5000
14 | response_begin_str: null
15 | code_start_str: "```python\n"
16 | code_end_str: "```"
17 | device_groups:
18 | - "cuda:0"
19 | - "cuda:1"
20 | - "cuda:2"
21 | - "cuda:3"
22 | - "cuda:4"
23 | - "cuda:5"
24 | code_format: "pi_verifiable"
25 | success_threshold: 0
26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
27 | vllm_port: 8000
28 | max_new_tokens: 10240
29 | vllm_config:
30 | gpu_memory_utilization: 0.9
31 | max_model_len: 12288
32 | max_num_batched_tokens: 98304
33 | generation_num_copies: 1
34 | temperature: 0.5
35 | repetition_penalty: 0.0
36 | max_tokens: 12288
37 | tensor_parallel_size: 1
38 |
39 |
--------------------------------------------------------------------------------
/experiments/u2_7B_3e-5/eval_pi_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_test"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | prompt_field: "prompt"
10 | label_field: "verification_info"
11 | debug: false
12 | reward_server_host: "0.0.0.0"
13 | reward_server_port: 5000
14 | response_begin_str: null
15 | code_start_str: "```python\n"
16 | code_end_str: "```"
17 | device_groups:
18 | - "cuda:0"
19 | - "cuda:1"
20 | - "cuda:2"
21 | - "cuda:3"
22 | - "cuda:4"
23 | - "cuda:5"
24 | code_format: "pi_verifiable"
25 | success_threshold: 0
26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
27 | vllm_port: 8000
28 | max_new_tokens: 10240
29 | vllm_config:
30 | gpu_memory_utilization: 0.9
31 | max_model_len: 12288
32 | max_num_batched_tokens: 98304
33 | generation_num_copies: 1
34 | temperature: 0.5
35 | repetition_penalty: 0.0
36 | max_tokens: 12288
37 | tensor_parallel_size: 1
38 |
39 |
--------------------------------------------------------------------------------
/experiments/u2_7B_gram/eval_pi_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_test"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | prompt_field: "prompt"
10 | label_field: "verification_info"
11 | debug: false
12 | reward_server_host: "0.0.0.0"
13 | reward_server_port: 5000
14 | response_begin_str: null
15 | code_start_str: "```python\n"
16 | code_end_str: "```"
17 | device_groups:
18 | - "cuda:0"
19 | - "cuda:1"
20 | - "cuda:2"
21 | - "cuda:3"
22 | - "cuda:4"
23 | - "cuda:5"
24 | code_format: "pi_verifiable"
25 | success_threshold: 0
26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
27 | vllm_port: 8000
28 | max_new_tokens: 10240
29 | vllm_config:
30 | gpu_memory_utilization: 0.9
31 | max_model_len: 12288
32 | max_num_batched_tokens: 98304
33 | generation_num_copies: 1
34 | temperature: 0.5
35 | repetition_penalty: 0.0
36 | max_tokens: 12288
37 | tensor_parallel_size: 1
38 |
39 |
--------------------------------------------------------------------------------
/experiments/u4_7B_gram/eval_pi_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_test"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | prompt_field: "prompt"
10 | label_field: "verification_info"
11 | debug: false
12 | reward_server_host: "0.0.0.0"
13 | reward_server_port: 5000
14 | response_begin_str: null
15 | code_start_str: "```python\n"
16 | code_end_str: "```"
17 | device_groups:
18 | - "cuda:0"
19 | - "cuda:1"
20 | - "cuda:2"
21 | - "cuda:3"
22 | - "cuda:4"
23 | - "cuda:5"
24 | code_format: "pi_verifiable"
25 | success_threshold: 0
26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
27 | vllm_port: 8000
28 | max_new_tokens: 10240
29 | vllm_config:
30 | gpu_memory_utilization: 0.9
31 | max_model_len: 12288
32 | max_num_batched_tokens: 98304
33 | generation_num_copies: 1
34 | temperature: 0.5
35 | repetition_penalty: 0.0
36 | max_tokens: 12288
37 | tensor_parallel_size: 1
38 |
39 |
--------------------------------------------------------------------------------
/experiments/u5_7B_gram/eval_pi_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_test"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | prompt_field: "prompt"
10 | label_field: "verification_info"
11 | debug: false
12 | reward_server_host: "0.0.0.0"
13 | reward_server_port: 5000
14 | response_begin_str: null
15 | code_start_str: "```python\n"
16 | code_end_str: "```"
17 | device_groups:
18 | - "cuda:0"
19 | - "cuda:1"
20 | - "cuda:2"
21 | - "cuda:3"
22 | - "cuda:4"
23 | - "cuda:5"
24 | code_format: "pi_verifiable"
25 | success_threshold: 0
26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
27 | vllm_port: 8000
28 | max_new_tokens: 10240
29 | vllm_config:
30 | gpu_memory_utilization: 0.9
31 | max_model_len: 12288
32 | max_num_batched_tokens: 98304
33 | generation_num_copies: 1
34 | temperature: 0.5
35 | repetition_penalty: 0.0
36 | max_tokens: 12288
37 | tensor_parallel_size: 1
38 |
39 |
--------------------------------------------------------------------------------
/experiments/u6_7B_form/eval_pi_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_test"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | prompt_field: "prompt"
10 | label_field: "verification_info"
11 | debug: false
12 | reward_server_host: "0.0.0.0"
13 | reward_server_port: 5000
14 | response_begin_str: null
15 | code_start_str: "```python\n"
16 | code_end_str: "```"
17 | device_groups:
18 | - "cuda:0"
19 | - "cuda:1"
20 | - "cuda:2"
21 | - "cuda:3"
22 | - "cuda:4"
23 | - "cuda:5"
24 | code_format: "pi_verifiable"
25 | success_threshold: 0
26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
27 | vllm_port: 8000
28 | max_new_tokens: 10240
29 | vllm_config:
30 | gpu_memory_utilization: 0.9
31 | max_model_len: 12288
32 | max_num_batched_tokens: 98304
33 | generation_num_copies: 1
34 | temperature: 0.5
35 | repetition_penalty: 0.0
36 | max_tokens: 12288
37 | tensor_parallel_size: 1
38 |
39 |
--------------------------------------------------------------------------------
/experiments/u6_7B_fs/eval_pi_50_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_test"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | prompt_field: "prompt"
10 | label_field: "verification_info"
11 | debug: false
12 | reward_server_host: "0.0.0.0"
13 | reward_server_port: 5000
14 | response_begin_str: null
15 | code_start_str: "```python\n"
16 | code_end_str: "```"
17 | device_groups:
18 | - "cuda:0"
19 | - "cuda:1"
20 | - "cuda:2"
21 | - "cuda:3"
22 | - "cuda:4"
23 | - "cuda:5"
24 | code_format: "pi_verifiable"
25 | success_threshold: 0
26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
27 | vllm_port: 8000
28 | max_new_tokens: 10240
29 | vllm_config:
30 | gpu_memory_utilization: 0.9
31 | max_model_len: 12288
32 | max_num_batched_tokens: 98304
33 | generation_num_copies: 1
34 | temperature: 0.5
35 | repetition_penalty: 0.0
36 | max_tokens: 12288
37 | tensor_parallel_size: 1
38 |
39 |
--------------------------------------------------------------------------------
/experiments/u6_7B_fs/eval_pi_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_test"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | prompt_field: "prompt"
10 | label_field: "verification_info"
11 | debug: false
12 | reward_server_host: "0.0.0.0"
13 | reward_server_port: 5000
14 | response_begin_str: null
15 | code_start_str: "```python\n"
16 | code_end_str: "```"
17 | device_groups:
18 | - "cuda:0"
19 | - "cuda:1"
20 | - "cuda:2"
21 | - "cuda:3"
22 | - "cuda:4"
23 | - "cuda:5"
24 | code_format: "pi_verifiable"
25 | success_threshold: 0
26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
27 | vllm_port: 8000
28 | max_new_tokens: 10240
29 | vllm_config:
30 | gpu_memory_utilization: 0.9
31 | max_model_len: 12288
32 | max_num_batched_tokens: 98304
33 | generation_num_copies: 1
34 | temperature: 0.5
35 | repetition_penalty: 0.0
36 | max_tokens: 12288
37 | tensor_parallel_size: 1
38 |
39 |
--------------------------------------------------------------------------------
/experiments/u6_7B_gram/eval_pi_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_test"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | prompt_field: "prompt"
10 | label_field: "verification_info"
11 | debug: false
12 | reward_server_host: "0.0.0.0"
13 | reward_server_port: 5000
14 | response_begin_str: null
15 | code_start_str: "```python\n"
16 | code_end_str: "```"
17 | device_groups:
18 | - "cuda:0"
19 | - "cuda:1"
20 | - "cuda:2"
21 | - "cuda:3"
22 | - "cuda:4"
23 | - "cuda:5"
24 | code_format: "pi_verifiable"
25 | success_threshold: 0
26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
27 | vllm_port: 8000
28 | max_new_tokens: 10240
29 | vllm_config:
30 | gpu_memory_utilization: 0.9
31 | max_model_len: 12288
32 | max_num_batched_tokens: 98304
33 | generation_num_copies: 1
34 | temperature: 0.5
35 | repetition_penalty: 0.0
36 | max_tokens: 12288
37 | tensor_parallel_size: 1
38 |
39 |
--------------------------------------------------------------------------------
/experiments/u6_7B_sft/eval_pi_50_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_test"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | prompt_field: "prompt"
10 | label_field: "verification_info"
11 | debug: false
12 | reward_server_host: "0.0.0.0"
13 | reward_server_port: 5000
14 | response_begin_str: null
15 | code_start_str: "```python\n"
16 | code_end_str: "```"
17 | device_groups:
18 | - "cuda:0"
19 | - "cuda:1"
20 | - "cuda:2"
21 | - "cuda:3"
22 | - "cuda:4"
23 | - "cuda:5"
24 | code_format: "pi_verifiable"
25 | success_threshold: 0
26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
27 | vllm_port: 8000
28 | max_new_tokens: 10240
29 | vllm_config:
30 | gpu_memory_utilization: 0.9
31 | max_model_len: 12288
32 | max_num_batched_tokens: 98304
33 | generation_num_copies: 1
34 | temperature: 0.5
35 | repetition_penalty: 0.0
36 | max_tokens: 12288
37 | tensor_parallel_size: 1
38 |
39 |
--------------------------------------------------------------------------------
/experiments/u6_7B_sft/eval_pi_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_test"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | prompt_field: "prompt"
10 | label_field: "verification_info"
11 | debug: false
12 | reward_server_host: "0.0.0.0"
13 | reward_server_port: 5000
14 | response_begin_str: null
15 | code_start_str: "```python\n"
16 | code_end_str: "```"
17 | device_groups:
18 | - "cuda:0"
19 | - "cuda:1"
20 | - "cuda:2"
21 | - "cuda:3"
22 | - "cuda:4"
23 | - "cuda:5"
24 | code_format: "pi_verifiable"
25 | success_threshold: 0
26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
27 | vllm_port: 8000
28 | max_new_tokens: 10240
29 | vllm_config:
30 | gpu_memory_utilization: 0.9
31 | max_model_len: 12288
32 | max_num_batched_tokens: 98304
33 | generation_num_copies: 1
34 | temperature: 0.5
35 | repetition_penalty: 0.0
36 | max_tokens: 12288
37 | tensor_parallel_size: 1
38 |
39 |
--------------------------------------------------------------------------------
/experiments/u6_colo/eval_pi_50_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_test"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | prompt_field: "prompt"
10 | label_field: "verification_info"
11 | debug: false
12 | reward_server_host: "0.0.0.0"
13 | reward_server_port: 5000
14 | response_begin_str: null
15 | code_start_str: "```python\n"
16 | code_end_str: "```"
17 | device_groups:
18 | - "cuda:0"
19 | - "cuda:1"
20 | - "cuda:2"
21 | - "cuda:3"
22 | - "cuda:4"
23 | - "cuda:5"
24 | code_format: "pi_verifiable"
25 | success_threshold: 0
26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
27 | vllm_port: 8000
28 | max_new_tokens: 10240
29 | vllm_config:
30 | gpu_memory_utilization: 0.9
31 | max_model_len: 12288
32 | max_num_batched_tokens: 98304
33 | generation_num_copies: 1
34 | temperature: 0.5
35 | repetition_penalty: 0.0
36 | max_tokens: 12288
37 | tensor_parallel_size: 1
38 |
39 |
--------------------------------------------------------------------------------
/experiments/u2_7B_3e-5/eval_pi_50_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_test"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | prompt_field: "prompt"
10 | label_field: "verification_info"
11 | debug: false
12 | reward_server_host: "0.0.0.0"
13 | reward_server_port: 5000
14 | response_begin_str: null
15 | code_start_str: "```python\n"
16 | code_end_str: "```"
17 | device_groups:
18 | - "cuda:0"
19 | - "cuda:1"
20 | - "cuda:2"
21 | - "cuda:3"
22 | - "cuda:4"
23 | - "cuda:5"
24 | code_format: "pi_verifiable"
25 | success_threshold: 0
26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
27 | vllm_port: 8000
28 | max_new_tokens: 10240
29 | vllm_config:
30 | gpu_memory_utilization: 0.9
31 | max_model_len: 12288
32 | max_num_batched_tokens: 98304
33 | generation_num_copies: 1
34 | temperature: 0.5
35 | repetition_penalty: 0.0
36 | max_tokens: 12288
37 | tensor_parallel_size: 1
38 |
39 |
--------------------------------------------------------------------------------
/experiments/u2_7B_base_sft/eval_pi_50_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_test"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | prompt_field: "prompt"
10 | label_field: "verification_info"
11 | debug: false
12 | reward_server_host: "0.0.0.0"
13 | reward_server_port: 5000
14 | response_begin_str: null
15 | code_start_str: "```python\n"
16 | code_end_str: "```"
17 | device_groups:
18 | - "cuda:0"
19 | - "cuda:1"
20 | - "cuda:2"
21 | - "cuda:3"
22 | - "cuda:4"
23 | - "cuda:5"
24 | code_format: "pi_verifiable"
25 | success_threshold: 0
26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
27 | vllm_port: 8000
28 | max_new_tokens: 10240
29 | vllm_config:
30 | gpu_memory_utilization: 0.9
31 | max_model_len: 12288
32 | max_num_batched_tokens: 98304
33 | generation_num_copies: 1
34 | temperature: 0.5
35 | repetition_penalty: 0.0
36 | max_tokens: 12288
37 | tensor_parallel_size: 1
38 |
39 |
--------------------------------------------------------------------------------
/experiments/u2_7B_base_sft/eval_pi_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_test"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | prompt_field: "prompt"
10 | label_field: "verification_info"
11 | debug: false
12 | reward_server_host: "0.0.0.0"
13 | reward_server_port: 5000
14 | response_begin_str: null
15 | code_start_str: "```python\n"
16 | code_end_str: "```"
17 | device_groups:
18 | - "cuda:0"
19 | - "cuda:1"
20 | - "cuda:2"
21 | - "cuda:3"
22 | - "cuda:4"
23 | - "cuda:5"
24 | code_format: "pi_verifiable"
25 | success_threshold: 0
26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
27 | vllm_port: 8000
28 | max_new_tokens: 10240
29 | vllm_config:
30 | gpu_memory_utilization: 0.9
31 | max_model_len: 12288
32 | max_num_batched_tokens: 98304
33 | generation_num_copies: 1
34 | temperature: 0.5
35 | repetition_penalty: 0.0
36 | max_tokens: 12288
37 | tensor_parallel_size: 1
38 |
39 |
--------------------------------------------------------------------------------
/experiments/u2_7B_few_shot/eval_pi_50_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_test"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | prompt_field: "prompt"
10 | label_field: "verification_info"
11 | debug: false
12 | reward_server_host: "0.0.0.0"
13 | reward_server_port: 5000
14 | response_begin_str: null
15 | code_start_str: "```python\n"
16 | code_end_str: "```"
17 | device_groups:
18 | - "cuda:0"
19 | - "cuda:1"
20 | - "cuda:2"
21 | - "cuda:3"
22 | - "cuda:4"
23 | - "cuda:5"
24 | code_format: "pi_verifiable"
25 | success_threshold: 0
26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
27 | vllm_port: 8000
28 | max_new_tokens: 10240
29 | vllm_config:
30 | gpu_memory_utilization: 0.9
31 | max_model_len: 12288
32 | max_num_batched_tokens: 98304
33 | generation_num_copies: 1
34 | temperature: 0.5
35 | repetition_penalty: 0.0
36 | max_tokens: 12288
37 | tensor_parallel_size: 1
38 |
39 |
--------------------------------------------------------------------------------
/experiments/u2_7B_few_shot/eval_pi_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_test"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | prompt_field: "prompt"
10 | label_field: "verification_info"
11 | debug: false
12 | reward_server_host: "0.0.0.0"
13 | reward_server_port: 5000
14 | response_begin_str: null
15 | code_start_str: "```python\n"
16 | code_end_str: "```"
17 | device_groups:
18 | - "cuda:0"
19 | - "cuda:1"
20 | - "cuda:2"
21 | - "cuda:3"
22 | - "cuda:4"
23 | - "cuda:5"
24 | code_format: "pi_verifiable"
25 | success_threshold: 0
26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
27 | vllm_port: 8000
28 | max_new_tokens: 10240
29 | vllm_config:
30 | gpu_memory_utilization: 0.9
31 | max_model_len: 12288
32 | max_num_batched_tokens: 98304
33 | generation_num_copies: 1
34 | temperature: 0.5
35 | repetition_penalty: 0.0
36 | max_tokens: 12288
37 | tensor_parallel_size: 1
38 |
39 |
--------------------------------------------------------------------------------
/experiments/u2_7B_gram/eval_pi_50_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_test"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | prompt_field: "prompt"
10 | label_field: "verification_info"
11 | debug: false
12 | reward_server_host: "0.0.0.0"
13 | reward_server_port: 5000
14 | response_begin_str: null
15 | code_start_str: "```python\n"
16 | code_end_str: "```"
17 | device_groups:
18 | - "cuda:0"
19 | - "cuda:1"
20 | - "cuda:2"
21 | - "cuda:3"
22 | - "cuda:4"
23 | - "cuda:5"
24 | code_format: "pi_verifiable"
25 | success_threshold: 0
26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
27 | vllm_port: 8000
28 | max_new_tokens: 10240
29 | vllm_config:
30 | gpu_memory_utilization: 0.9
31 | max_model_len: 12288
32 | max_num_batched_tokens: 98304
33 | generation_num_copies: 1
34 | temperature: 0.5
35 | repetition_penalty: 0.0
36 | max_tokens: 12288
37 | tensor_parallel_size: 1
38 |
39 |
--------------------------------------------------------------------------------
/experiments/u4_7B_gram/eval_pi_50_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_test"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | prompt_field: "prompt"
10 | label_field: "verification_info"
11 | debug: false
12 | reward_server_host: "0.0.0.0"
13 | reward_server_port: 5000
14 | response_begin_str: null
15 | code_start_str: "```python\n"
16 | code_end_str: "```"
17 | device_groups:
18 | - "cuda:0"
19 | - "cuda:1"
20 | - "cuda:2"
21 | - "cuda:3"
22 | - "cuda:4"
23 | - "cuda:5"
24 | code_format: "pi_verifiable"
25 | success_threshold: 0
26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
27 | vllm_port: 8000
28 | max_new_tokens: 10240
29 | vllm_config:
30 | gpu_memory_utilization: 0.9
31 | max_model_len: 12288
32 | max_num_batched_tokens: 98304
33 | generation_num_copies: 1
34 | temperature: 0.5
35 | repetition_penalty: 0.0
36 | max_tokens: 12288
37 | tensor_parallel_size: 1
38 |
39 |
--------------------------------------------------------------------------------
/experiments/u5_7B_gram/eval_pi_50_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_test"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | prompt_field: "prompt"
10 | label_field: "verification_info"
11 | debug: false
12 | reward_server_host: "0.0.0.0"
13 | reward_server_port: 5000
14 | response_begin_str: null
15 | code_start_str: "```python\n"
16 | code_end_str: "```"
17 | device_groups:
18 | - "cuda:0"
19 | - "cuda:1"
20 | - "cuda:2"
21 | - "cuda:3"
22 | - "cuda:4"
23 | - "cuda:5"
24 | code_format: "pi_verifiable"
25 | success_threshold: 0
26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
27 | vllm_port: 8000
28 | max_new_tokens: 10240
29 | vllm_config:
30 | gpu_memory_utilization: 0.9
31 | max_model_len: 12288
32 | max_num_batched_tokens: 98304
33 | generation_num_copies: 1
34 | temperature: 0.5
35 | repetition_penalty: 0.0
36 | max_tokens: 12288
37 | tensor_parallel_size: 1
38 |
39 |
--------------------------------------------------------------------------------
/experiments/u6_7B_form/eval_pi_50_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_test"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | prompt_field: "prompt"
10 | label_field: "verification_info"
11 | debug: false
12 | reward_server_host: "0.0.0.0"
13 | reward_server_port: 5000
14 | response_begin_str: null
15 | code_start_str: "```python\n"
16 | code_end_str: "```"
17 | device_groups:
18 | - "cuda:0"
19 | - "cuda:1"
20 | - "cuda:2"
21 | - "cuda:3"
22 | - "cuda:4"
23 | - "cuda:5"
24 | code_format: "pi_verifiable"
25 | success_threshold: 0
26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
27 | vllm_port: 8000
28 | max_new_tokens: 10240
29 | vllm_config:
30 | gpu_memory_utilization: 0.9
31 | max_model_len: 12288
32 | max_num_batched_tokens: 98304
33 | generation_num_copies: 1
34 | temperature: 0.5
35 | repetition_penalty: 0.0
36 | max_tokens: 12288
37 | tensor_parallel_size: 1
38 |
39 |
--------------------------------------------------------------------------------
/experiments/u6_7B_gram/eval_pi_50_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_test"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | prompt_field: "prompt"
10 | label_field: "verification_info"
11 | debug: false
12 | reward_server_host: "0.0.0.0"
13 | reward_server_port: 5000
14 | response_begin_str: null
15 | code_start_str: "```python\n"
16 | code_end_str: "```"
17 | device_groups:
18 | - "cuda:0"
19 | - "cuda:1"
20 | - "cuda:2"
21 | - "cuda:3"
22 | - "cuda:4"
23 | - "cuda:5"
24 | code_format: "pi_verifiable"
25 | success_threshold: 0
26 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
27 | vllm_port: 8000
28 | max_new_tokens: 10240
29 | vllm_config:
30 | gpu_memory_utilization: 0.9
31 | max_model_len: 12288
32 | max_num_batched_tokens: 98304
33 | generation_num_copies: 1
34 | temperature: 0.5
35 | repetition_penalty: 0.0
36 | max_tokens: 12288
37 | tensor_parallel_size: 1
38 |
39 |
--------------------------------------------------------------------------------
/experiments/u1_14B/eval_pi_train_50_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_train"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | max_num_samples: 252
10 | prompt_field: "prompt"
11 | label_field: "verification_info"
12 | debug: false
13 | reward_server_host: "0.0.0.0"
14 | reward_server_port: 5000
15 | response_begin_str: null
16 | code_start_str: "```python\n"
17 | code_end_str: "```"
18 | device_groups:
19 | - "cuda:0"
20 | - "cuda:1"
21 | - "cuda:2"
22 | - "cuda:3"
23 | - "cuda:4"
24 | - "cuda:5"
25 | code_format: "pi_verifiable"
26 | success_threshold: 0
27 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
28 | vllm_port: 8000
29 | max_new_tokens: 10240
30 | vllm_config:
31 | gpu_memory_utilization: 0.9
32 | max_model_len: 12288
33 | max_num_batched_tokens: 98304
34 | generation_num_copies: 1
35 | temperature: 0.5
36 | repetition_penalty: 0.0
37 | max_tokens: 12288
38 | tensor_parallel_size: 1
39 |
40 |
--------------------------------------------------------------------------------
/experiments/u1_14B/eval_pi_train_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_train"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | max_num_samples: 252
10 | prompt_field: "prompt"
11 | label_field: "verification_info"
12 | debug: false
13 | reward_server_host: "0.0.0.0"
14 | reward_server_port: 5000
15 | response_begin_str: null
16 | code_start_str: "```python\n"
17 | code_end_str: "```"
18 | device_groups:
19 | - "cuda:0"
20 | - "cuda:1"
21 | - "cuda:2"
22 | - "cuda:3"
23 | - "cuda:4"
24 | - "cuda:5"
25 | code_format: "pi_verifiable"
26 | success_threshold: 0
27 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
28 | vllm_port: 8000
29 | max_new_tokens: 10240
30 | vllm_config:
31 | gpu_memory_utilization: 0.9
32 | max_model_len: 12288
33 | max_num_batched_tokens: 98304
34 | generation_num_copies: 1
35 | temperature: 0.5
36 | repetition_penalty: 0.0
37 | max_tokens: 12288
38 | tensor_parallel_size: 1
39 |
40 |
--------------------------------------------------------------------------------
/experiments/u1_7B/eval_pi_train_50_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_train"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | max_num_samples: 252
10 | prompt_field: "prompt"
11 | label_field: "verification_info"
12 | debug: false
13 | reward_server_host: "0.0.0.0"
14 | reward_server_port: 5000
15 | response_begin_str: null
16 | code_start_str: "```python\n"
17 | code_end_str: "```"
18 | device_groups:
19 | - "cuda:0"
20 | - "cuda:1"
21 | - "cuda:2"
22 | - "cuda:3"
23 | - "cuda:4"
24 | - "cuda:5"
25 | code_format: "pi_verifiable"
26 | success_threshold: 0
27 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
28 | vllm_port: 8000
29 | max_new_tokens: 10240
30 | vllm_config:
31 | gpu_memory_utilization: 0.9
32 | max_model_len: 12288
33 | max_num_batched_tokens: 98304
34 | generation_num_copies: 1
35 | temperature: 0.5
36 | repetition_penalty: 0.0
37 | max_tokens: 12288
38 | tensor_parallel_size: 1
39 |
40 |
--------------------------------------------------------------------------------
/experiments/u1_7B/eval_pi_train_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_train"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | max_num_samples: 252
10 | prompt_field: "prompt"
11 | label_field: "verification_info"
12 | debug: false
13 | reward_server_host: "0.0.0.0"
14 | reward_server_port: 5000
15 | response_begin_str: null
16 | code_start_str: "```python\n"
17 | code_end_str: "```"
18 | device_groups:
19 | - "cuda:0"
20 | - "cuda:1"
21 | - "cuda:2"
22 | - "cuda:3"
23 | - "cuda:4"
24 | - "cuda:5"
25 | code_format: "pi_verifiable"
26 | success_threshold: 0
27 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
28 | vllm_port: 8000
29 | max_new_tokens: 10240
30 | vllm_config:
31 | gpu_memory_utilization: 0.9
32 | max_model_len: 12288
33 | max_num_batched_tokens: 98304
34 | generation_num_copies: 1
35 | temperature: 0.5
36 | repetition_penalty: 0.0
37 | max_tokens: 12288
38 | tensor_parallel_size: 1
39 |
40 |
--------------------------------------------------------------------------------
/experiments/u2_7B/eval_pi_train_50_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_train"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | max_num_samples: 252
10 | prompt_field: "prompt"
11 | label_field: "verification_info"
12 | debug: false
13 | reward_server_host: "0.0.0.0"
14 | reward_server_port: 5000
15 | response_begin_str: null
16 | code_start_str: "```python\n"
17 | code_end_str: "```"
18 | device_groups:
19 | - "cuda:0"
20 | - "cuda:1"
21 | - "cuda:2"
22 | - "cuda:3"
23 | - "cuda:4"
24 | - "cuda:5"
25 | code_format: "pi_verifiable"
26 | success_threshold: 0
27 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
28 | vllm_port: 8000
29 | max_new_tokens: 10240
30 | vllm_config:
31 | gpu_memory_utilization: 0.9
32 | max_model_len: 12288
33 | max_num_batched_tokens: 98304
34 | generation_num_copies: 1
35 | temperature: 0.5
36 | repetition_penalty: 0.0
37 | max_tokens: 12288
38 | tensor_parallel_size: 1
39 |
40 |
--------------------------------------------------------------------------------
/experiments/u2_7B/eval_pi_train_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_train"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | max_num_samples: 252
10 | prompt_field: "prompt"
11 | label_field: "verification_info"
12 | debug: false
13 | reward_server_host: "0.0.0.0"
14 | reward_server_port: 5000
15 | response_begin_str: null
16 | code_start_str: "```python\n"
17 | code_end_str: "```"
18 | device_groups:
19 | - "cuda:0"
20 | - "cuda:1"
21 | - "cuda:2"
22 | - "cuda:3"
23 | - "cuda:4"
24 | - "cuda:5"
25 | code_format: "pi_verifiable"
26 | success_threshold: 0
27 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
28 | vllm_port: 8000
29 | max_new_tokens: 10240
30 | vllm_config:
31 | gpu_memory_utilization: 0.9
32 | max_model_len: 12288
33 | max_num_batched_tokens: 98304
34 | generation_num_copies: 1
35 | temperature: 0.5
36 | repetition_penalty: 0.0
37 | max_tokens: 12288
38 | tensor_parallel_size: 1
39 |
40 |
--------------------------------------------------------------------------------
/experiments/u6_7B_fs/eval_pi_train_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_train"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | max_num_samples: 252
10 | prompt_field: "prompt"
11 | label_field: "verification_info"
12 | debug: false
13 | reward_server_host: "0.0.0.0"
14 | reward_server_port: 5000
15 | response_begin_str: null
16 | code_start_str: "```python\n"
17 | code_end_str: "```"
18 | device_groups:
19 | - "cuda:0"
20 | - "cuda:1"
21 | - "cuda:2"
22 | - "cuda:3"
23 | - "cuda:4"
24 | - "cuda:5"
25 | code_format: "pi_verifiable"
26 | success_threshold: 0
27 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
28 | vllm_port: 8000
29 | max_new_tokens: 10240
30 | vllm_config:
31 | gpu_memory_utilization: 0.9
32 | max_model_len: 12288
33 | max_num_batched_tokens: 98304
34 | generation_num_copies: 1
35 | temperature: 0.5
36 | repetition_penalty: 0.0
37 | max_tokens: 12288
38 | tensor_parallel_size: 1
39 |
40 |
--------------------------------------------------------------------------------
/experiments/u6_7B_sft/eval_pi_train_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_train"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | max_num_samples: 252
10 | prompt_field: "prompt"
11 | label_field: "verification_info"
12 | debug: false
13 | reward_server_host: "0.0.0.0"
14 | reward_server_port: 5000
15 | response_begin_str: null
16 | code_start_str: "```python\n"
17 | code_end_str: "```"
18 | device_groups:
19 | - "cuda:0"
20 | - "cuda:1"
21 | - "cuda:2"
22 | - "cuda:3"
23 | - "cuda:4"
24 | - "cuda:5"
25 | code_format: "pi_verifiable"
26 | success_threshold: 0
27 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
28 | vllm_port: 8000
29 | max_new_tokens: 10240
30 | vllm_config:
31 | gpu_memory_utilization: 0.9
32 | max_model_len: 12288
33 | max_num_batched_tokens: 98304
34 | generation_num_copies: 1
35 | temperature: 0.5
36 | repetition_penalty: 0.0
37 | max_tokens: 12288
38 | tensor_parallel_size: 1
39 |
40 |
--------------------------------------------------------------------------------
/experiments/u6_colo/eval_pi_train_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_train"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | max_num_samples: 252
10 | prompt_field: "prompt"
11 | label_field: "verification_info"
12 | debug: false
13 | reward_server_host: "0.0.0.0"
14 | reward_server_port: 5000
15 | response_begin_str: null
16 | code_start_str: "```python\n"
17 | code_end_str: "```"
18 | device_groups:
19 | - "cuda:0"
20 | - "cuda:1"
21 | - "cuda:2"
22 | - "cuda:3"
23 | - "cuda:4"
24 | - "cuda:5"
25 | code_format: "pi_verifiable"
26 | success_threshold: 0
27 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
28 | vllm_port: 8000
29 | max_new_tokens: 10240
30 | vllm_config:
31 | gpu_memory_utilization: 0.9
32 | max_model_len: 12288
33 | max_num_batched_tokens: 98304
34 | generation_num_copies: 1
35 | temperature: 0.5
36 | repetition_penalty: 0.0
37 | max_tokens: 12288
38 | tensor_parallel_size: 1
39 |
40 |
--------------------------------------------------------------------------------
/experiments/u2_7B_3e-5/eval_pi_train_50_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_train"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | max_num_samples: 252
10 | prompt_field: "prompt"
11 | label_field: "verification_info"
12 | debug: false
13 | reward_server_host: "0.0.0.0"
14 | reward_server_port: 5000
15 | response_begin_str: null
16 | code_start_str: "```python\n"
17 | code_end_str: "```"
18 | device_groups:
19 | - "cuda:0"
20 | - "cuda:1"
21 | - "cuda:2"
22 | - "cuda:3"
23 | - "cuda:4"
24 | - "cuda:5"
25 | code_format: "pi_verifiable"
26 | success_threshold: 0
27 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
28 | vllm_port: 8000
29 | max_new_tokens: 10240
30 | vllm_config:
31 | gpu_memory_utilization: 0.9
32 | max_model_len: 12288
33 | max_num_batched_tokens: 98304
34 | generation_num_copies: 1
35 | temperature: 0.5
36 | repetition_penalty: 0.0
37 | max_tokens: 12288
38 | tensor_parallel_size: 1
39 |
40 |
--------------------------------------------------------------------------------
/experiments/u2_7B_3e-5/eval_pi_train_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_train"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | max_num_samples: 252
10 | prompt_field: "prompt"
11 | label_field: "verification_info"
12 | debug: false
13 | reward_server_host: "0.0.0.0"
14 | reward_server_port: 5000
15 | response_begin_str: null
16 | code_start_str: "```python\n"
17 | code_end_str: "```"
18 | device_groups:
19 | - "cuda:0"
20 | - "cuda:1"
21 | - "cuda:2"
22 | - "cuda:3"
23 | - "cuda:4"
24 | - "cuda:5"
25 | code_format: "pi_verifiable"
26 | success_threshold: 0
27 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
28 | vllm_port: 8000
29 | max_new_tokens: 10240
30 | vllm_config:
31 | gpu_memory_utilization: 0.9
32 | max_model_len: 12288
33 | max_num_batched_tokens: 98304
34 | generation_num_copies: 1
35 | temperature: 0.5
36 | repetition_penalty: 0.0
37 | max_tokens: 12288
38 | tensor_parallel_size: 1
39 |
40 |
--------------------------------------------------------------------------------
/experiments/u2_7B_base_sft/eval_pi_train_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_train"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | max_num_samples: 252
10 | prompt_field: "prompt"
11 | label_field: "verification_info"
12 | debug: false
13 | reward_server_host: "0.0.0.0"
14 | reward_server_port: 5000
15 | response_begin_str: null
16 | code_start_str: "```python\n"
17 | code_end_str: "```"
18 | device_groups:
19 | - "cuda:0"
20 | - "cuda:1"
21 | - "cuda:2"
22 | - "cuda:3"
23 | - "cuda:4"
24 | - "cuda:5"
25 | code_format: "pi_verifiable"
26 | success_threshold: 0
27 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
28 | vllm_port: 8000
29 | max_new_tokens: 10240
30 | vllm_config:
31 | gpu_memory_utilization: 0.9
32 | max_model_len: 12288
33 | max_num_batched_tokens: 98304
34 | generation_num_copies: 1
35 | temperature: 0.5
36 | repetition_penalty: 0.0
37 | max_tokens: 12288
38 | tensor_parallel_size: 1
39 |
40 |
--------------------------------------------------------------------------------
/experiments/u2_7B_few_shot/eval_pi_train_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_train"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | max_num_samples: 252
10 | prompt_field: "prompt"
11 | label_field: "verification_info"
12 | debug: false
13 | reward_server_host: "0.0.0.0"
14 | reward_server_port: 5000
15 | response_begin_str: null
16 | code_start_str: "```python\n"
17 | code_end_str: "```"
18 | device_groups:
19 | - "cuda:0"
20 | - "cuda:1"
21 | - "cuda:2"
22 | - "cuda:3"
23 | - "cuda:4"
24 | - "cuda:5"
25 | code_format: "pi_verifiable"
26 | success_threshold: 0
27 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
28 | vllm_port: 8000
29 | max_new_tokens: 10240
30 | vllm_config:
31 | gpu_memory_utilization: 0.9
32 | max_model_len: 12288
33 | max_num_batched_tokens: 98304
34 | generation_num_copies: 1
35 | temperature: 0.5
36 | repetition_penalty: 0.0
37 | max_tokens: 12288
38 | tensor_parallel_size: 1
39 |
40 |
--------------------------------------------------------------------------------
/experiments/u2_7B_gram/eval_pi_train_50_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_train"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | max_num_samples: 252
10 | prompt_field: "prompt"
11 | label_field: "verification_info"
12 | debug: false
13 | reward_server_host: "0.0.0.0"
14 | reward_server_port: 5000
15 | response_begin_str: null
16 | code_start_str: "```python\n"
17 | code_end_str: "```"
18 | device_groups:
19 | - "cuda:0"
20 | - "cuda:1"
21 | - "cuda:2"
22 | - "cuda:3"
23 | - "cuda:4"
24 | - "cuda:5"
25 | code_format: "pi_verifiable"
26 | success_threshold: 0
27 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
28 | vllm_port: 8000
29 | max_new_tokens: 10240
30 | vllm_config:
31 | gpu_memory_utilization: 0.9
32 | max_model_len: 12288
33 | max_num_batched_tokens: 98304
34 | generation_num_copies: 1
35 | temperature: 0.5
36 | repetition_penalty: 0.0
37 | max_tokens: 12288
38 | tensor_parallel_size: 1
39 |
40 |
--------------------------------------------------------------------------------
/experiments/u2_7B_gram/eval_pi_train_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_train"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | max_num_samples: 252
10 | prompt_field: "prompt"
11 | label_field: "verification_info"
12 | debug: false
13 | reward_server_host: "0.0.0.0"
14 | reward_server_port: 5000
15 | response_begin_str: null
16 | code_start_str: "```python\n"
17 | code_end_str: "```"
18 | device_groups:
19 | - "cuda:0"
20 | - "cuda:1"
21 | - "cuda:2"
22 | - "cuda:3"
23 | - "cuda:4"
24 | - "cuda:5"
25 | code_format: "pi_verifiable"
26 | success_threshold: 0
27 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
28 | vllm_port: 8000
29 | max_new_tokens: 10240
30 | vllm_config:
31 | gpu_memory_utilization: 0.9
32 | max_model_len: 12288
33 | max_num_batched_tokens: 98304
34 | generation_num_copies: 1
35 | temperature: 0.5
36 | repetition_penalty: 0.0
37 | max_tokens: 12288
38 | tensor_parallel_size: 1
39 |
40 |
--------------------------------------------------------------------------------
/experiments/u4_7B_gram/eval_pi_train_50_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_train"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | max_num_samples: 252
10 | prompt_field: "prompt"
11 | label_field: "verification_info"
12 | debug: false
13 | reward_server_host: "0.0.0.0"
14 | reward_server_port: 5000
15 | response_begin_str: null
16 | code_start_str: "```python\n"
17 | code_end_str: "```"
18 | device_groups:
19 | - "cuda:0"
20 | - "cuda:1"
21 | - "cuda:2"
22 | - "cuda:3"
23 | - "cuda:4"
24 | - "cuda:5"
25 | code_format: "pi_verifiable"
26 | success_threshold: 0
27 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
28 | vllm_port: 8000
29 | max_new_tokens: 10240
30 | vllm_config:
31 | gpu_memory_utilization: 0.9
32 | max_model_len: 12288
33 | max_num_batched_tokens: 98304
34 | generation_num_copies: 1
35 | temperature: 0.5
36 | repetition_penalty: 0.0
37 | max_tokens: 12288
38 | tensor_parallel_size: 1
39 |
40 |
--------------------------------------------------------------------------------
/experiments/u4_7B_gram/eval_pi_train_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_train"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | max_num_samples: 252
10 | prompt_field: "prompt"
11 | label_field: "verification_info"
12 | debug: false
13 | reward_server_host: "0.0.0.0"
14 | reward_server_port: 5000
15 | response_begin_str: null
16 | code_start_str: "```python\n"
17 | code_end_str: "```"
18 | device_groups:
19 | - "cuda:0"
20 | - "cuda:1"
21 | - "cuda:2"
22 | - "cuda:3"
23 | - "cuda:4"
24 | - "cuda:5"
25 | code_format: "pi_verifiable"
26 | success_threshold: 0
27 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
28 | vllm_port: 8000
29 | max_new_tokens: 10240
30 | vllm_config:
31 | gpu_memory_utilization: 0.9
32 | max_model_len: 12288
33 | max_num_batched_tokens: 98304
34 | generation_num_copies: 1
35 | temperature: 0.5
36 | repetition_penalty: 0.0
37 | max_tokens: 12288
38 | tensor_parallel_size: 1
39 |
40 |
--------------------------------------------------------------------------------
/experiments/u5_7B_gram/eval_pi_train_50_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_train"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | max_num_samples: 252
10 | prompt_field: "prompt"
11 | label_field: "verification_info"
12 | debug: false
13 | reward_server_host: "0.0.0.0"
14 | reward_server_port: 5000
15 | response_begin_str: null
16 | code_start_str: "```python\n"
17 | code_end_str: "```"
18 | device_groups:
19 | - "cuda:0"
20 | - "cuda:1"
21 | - "cuda:2"
22 | - "cuda:3"
23 | - "cuda:4"
24 | - "cuda:5"
25 | code_format: "pi_verifiable"
26 | success_threshold: 0
27 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
28 | vllm_port: 8000
29 | max_new_tokens: 10240
30 | vllm_config:
31 | gpu_memory_utilization: 0.9
32 | max_model_len: 12288
33 | max_num_batched_tokens: 98304
34 | generation_num_copies: 1
35 | temperature: 0.5
36 | repetition_penalty: 0.0
37 | max_tokens: 12288
38 | tensor_parallel_size: 1
39 |
40 |
--------------------------------------------------------------------------------
/experiments/u5_7B_gram/eval_pi_train_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_train"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | max_num_samples: 252
10 | prompt_field: "prompt"
11 | label_field: "verification_info"
12 | debug: false
13 | reward_server_host: "0.0.0.0"
14 | reward_server_port: 5000
15 | response_begin_str: null
16 | code_start_str: "```python\n"
17 | code_end_str: "```"
18 | device_groups:
19 | - "cuda:0"
20 | - "cuda:1"
21 | - "cuda:2"
22 | - "cuda:3"
23 | - "cuda:4"
24 | - "cuda:5"
25 | code_format: "pi_verifiable"
26 | success_threshold: 0
27 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
28 | vllm_port: 8000
29 | max_new_tokens: 10240
30 | vllm_config:
31 | gpu_memory_utilization: 0.9
32 | max_model_len: 12288
33 | max_num_batched_tokens: 98304
34 | generation_num_copies: 1
35 | temperature: 0.5
36 | repetition_penalty: 0.0
37 | max_tokens: 12288
38 | tensor_parallel_size: 1
39 |
40 |
--------------------------------------------------------------------------------
/experiments/u6_7B_form/eval_pi_train_50_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_train"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | max_num_samples: 252
10 | prompt_field: "prompt"
11 | label_field: "verification_info"
12 | debug: false
13 | reward_server_host: "0.0.0.0"
14 | reward_server_port: 5000
15 | response_begin_str: null
16 | code_start_str: "```python\n"
17 | code_end_str: "```"
18 | device_groups:
19 | - "cuda:0"
20 | - "cuda:1"
21 | - "cuda:2"
22 | - "cuda:3"
23 | - "cuda:4"
24 | - "cuda:5"
25 | code_format: "pi_verifiable"
26 | success_threshold: 0
27 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
28 | vllm_port: 8000
29 | max_new_tokens: 10240
30 | vllm_config:
31 | gpu_memory_utilization: 0.9
32 | max_model_len: 12288
33 | max_num_batched_tokens: 98304
34 | generation_num_copies: 1
35 | temperature: 0.5
36 | repetition_penalty: 0.0
37 | max_tokens: 12288
38 | tensor_parallel_size: 1
39 |
40 |
--------------------------------------------------------------------------------
/experiments/u6_7B_form/eval_pi_train_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_train"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | max_num_samples: 252
10 | prompt_field: "prompt"
11 | label_field: "verification_info"
12 | debug: false
13 | reward_server_host: "0.0.0.0"
14 | reward_server_port: 5000
15 | response_begin_str: null
16 | code_start_str: "```python\n"
17 | code_end_str: "```"
18 | device_groups:
19 | - "cuda:0"
20 | - "cuda:1"
21 | - "cuda:2"
22 | - "cuda:3"
23 | - "cuda:4"
24 | - "cuda:5"
25 | code_format: "pi_verifiable"
26 | success_threshold: 0
27 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
28 | vllm_port: 8000
29 | max_new_tokens: 10240
30 | vllm_config:
31 | gpu_memory_utilization: 0.9
32 | max_model_len: 12288
33 | max_num_batched_tokens: 98304
34 | generation_num_copies: 1
35 | temperature: 0.5
36 | repetition_penalty: 0.0
37 | max_tokens: 12288
38 | tensor_parallel_size: 1
39 |
40 |
--------------------------------------------------------------------------------
/experiments/u6_7B_fs/eval_pi_train_50_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_train"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | max_num_samples: 252
10 | prompt_field: "prompt"
11 | label_field: "verification_info"
12 | debug: false
13 | reward_server_host: "0.0.0.0"
14 | reward_server_port: 5000
15 | response_begin_str: null
16 | code_start_str: "```python\n"
17 | code_end_str: "```"
18 | device_groups:
19 | - "cuda:0"
20 | - "cuda:1"
21 | - "cuda:2"
22 | - "cuda:3"
23 | - "cuda:4"
24 | - "cuda:5"
25 | code_format: "pi_verifiable"
26 | success_threshold: 0
27 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
28 | vllm_port: 8000
29 | max_new_tokens: 10240
30 | vllm_config:
31 | gpu_memory_utilization: 0.9
32 | max_model_len: 12288
33 | max_num_batched_tokens: 98304
34 | generation_num_copies: 1
35 | temperature: 0.5
36 | repetition_penalty: 0.0
37 | max_tokens: 12288
38 | tensor_parallel_size: 1
39 |
40 |
--------------------------------------------------------------------------------
/experiments/u6_7B_gram/eval_pi_train_50_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_train"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | max_num_samples: 252
10 | prompt_field: "prompt"
11 | label_field: "verification_info"
12 | debug: false
13 | reward_server_host: "0.0.0.0"
14 | reward_server_port: 5000
15 | response_begin_str: null
16 | code_start_str: "```python\n"
17 | code_end_str: "```"
18 | device_groups:
19 | - "cuda:0"
20 | - "cuda:1"
21 | - "cuda:2"
22 | - "cuda:3"
23 | - "cuda:4"
24 | - "cuda:5"
25 | code_format: "pi_verifiable"
26 | success_threshold: 0
27 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
28 | vllm_port: 8000
29 | max_new_tokens: 10240
30 | vllm_config:
31 | gpu_memory_utilization: 0.9
32 | max_model_len: 12288
33 | max_num_batched_tokens: 98304
34 | generation_num_copies: 1
35 | temperature: 0.5
36 | repetition_penalty: 0.0
37 | max_tokens: 12288
38 | tensor_parallel_size: 1
39 |
40 |
--------------------------------------------------------------------------------
/experiments/u6_7B_gram/eval_pi_train_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_train"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step63_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | max_num_samples: 252
10 | prompt_field: "prompt"
11 | label_field: "verification_info"
12 | debug: false
13 | reward_server_host: "0.0.0.0"
14 | reward_server_port: 5000
15 | response_begin_str: null
16 | code_start_str: "```python\n"
17 | code_end_str: "```"
18 | device_groups:
19 | - "cuda:0"
20 | - "cuda:1"
21 | - "cuda:2"
22 | - "cuda:3"
23 | - "cuda:4"
24 | - "cuda:5"
25 | code_format: "pi_verifiable"
26 | success_threshold: 0
27 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
28 | vllm_port: 8000
29 | max_new_tokens: 10240
30 | vllm_config:
31 | gpu_memory_utilization: 0.9
32 | max_model_len: 12288
33 | max_num_batched_tokens: 98304
34 | generation_num_copies: 1
35 | temperature: 0.5
36 | repetition_penalty: 0.0
37 | max_tokens: 12288
38 | tensor_parallel_size: 1
39 |
40 |
--------------------------------------------------------------------------------
/experiments/u6_7B_sft/eval_pi_train_50_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_train"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | max_num_samples: 252
10 | prompt_field: "prompt"
11 | label_field: "verification_info"
12 | debug: false
13 | reward_server_host: "0.0.0.0"
14 | reward_server_port: 5000
15 | response_begin_str: null
16 | code_start_str: "```python\n"
17 | code_end_str: "```"
18 | device_groups:
19 | - "cuda:0"
20 | - "cuda:1"
21 | - "cuda:2"
22 | - "cuda:3"
23 | - "cuda:4"
24 | - "cuda:5"
25 | code_format: "pi_verifiable"
26 | success_threshold: 0
27 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
28 | vllm_port: 8000
29 | max_new_tokens: 10240
30 | vllm_config:
31 | gpu_memory_utilization: 0.9
32 | max_model_len: 12288
33 | max_num_batched_tokens: 98304
34 | generation_num_copies: 1
35 | temperature: 0.5
36 | repetition_penalty: 0.0
37 | max_tokens: 12288
38 | tensor_parallel_size: 1
39 |
40 |
--------------------------------------------------------------------------------
/experiments/u6_colo/eval_pi_train_50_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_train"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | max_num_samples: 252
10 | prompt_field: "prompt"
11 | label_field: "verification_info"
12 | debug: false
13 | reward_server_host: "0.0.0.0"
14 | reward_server_port: 5000
15 | response_begin_str: null
16 | code_start_str: "```python\n"
17 | code_end_str: "```"
18 | device_groups:
19 | - "cuda:0"
20 | - "cuda:1"
21 | - "cuda:2"
22 | - "cuda:3"
23 | - "cuda:4"
24 | - "cuda:5"
25 | code_format: "pi_verifiable"
26 | success_threshold: 0
27 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
28 | vllm_port: 8000
29 | max_new_tokens: 10240
30 | vllm_config:
31 | gpu_memory_utilization: 0.9
32 | max_model_len: 12288
33 | max_num_batched_tokens: 98304
34 | generation_num_copies: 1
35 | temperature: 0.5
36 | repetition_penalty: 0.0
37 | max_tokens: 12288
38 | tensor_parallel_size: 1
39 |
40 |
--------------------------------------------------------------------------------
/experiments/u2_7B_base_sft/eval_pi_train_50_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_train"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | max_num_samples: 252
10 | prompt_field: "prompt"
11 | label_field: "verification_info"
12 | debug: false
13 | reward_server_host: "0.0.0.0"
14 | reward_server_port: 5000
15 | response_begin_str: null
16 | code_start_str: "```python\n"
17 | code_end_str: "```"
18 | device_groups:
19 | - "cuda:0"
20 | - "cuda:1"
21 | - "cuda:2"
22 | - "cuda:3"
23 | - "cuda:4"
24 | - "cuda:5"
25 | code_format: "pi_verifiable"
26 | success_threshold: 0
27 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
28 | vllm_port: 8000
29 | max_new_tokens: 10240
30 | vllm_config:
31 | gpu_memory_utilization: 0.9
32 | max_model_len: 12288
33 | max_num_batched_tokens: 98304
34 | generation_num_copies: 1
35 | temperature: 0.5
36 | repetition_penalty: 0.0
37 | max_tokens: 12288
38 | tensor_parallel_size: 1
39 |
40 |
--------------------------------------------------------------------------------
/experiments/u2_7B_few_shot/eval_pi_train_50_config.yaml:
--------------------------------------------------------------------------------
1 | responses_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated"
2 | reward_output_dir: "/home/rosmine/data2/rl_codegen/datasets/generated_with_reward"
3 | dataset: "/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_train"
4 | model: "/home/rosmine/data2/rl/x9_7B/checkpoint/global_step50_hf_merged"
5 | batch_size: 10
6 | max_tokens: 1024
7 | temperature: 0.7
8 | apply_chat_template: false
9 | max_num_samples: 252
10 | prompt_field: "prompt"
11 | label_field: "verification_info"
12 | debug: false
13 | reward_server_host: "0.0.0.0"
14 | reward_server_port: 5000
15 | response_begin_str: null
16 | code_start_str: "```python\n"
17 | code_end_str: "```"
18 | device_groups:
19 | - "cuda:0"
20 | - "cuda:1"
21 | - "cuda:2"
22 | - "cuda:3"
23 | - "cuda:4"
24 | - "cuda:5"
25 | code_format: "pi_verifiable"
26 | success_threshold: 0
27 | base_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
28 | vllm_port: 8000
29 | max_new_tokens: 10240
30 | vllm_config:
31 | gpu_memory_utilization: 0.9
32 | max_model_len: 12288
33 | max_num_batched_tokens: 98304
34 | generation_num_copies: 1
35 | temperature: 0.5
36 | repetition_penalty: 0.0
37 | max_tokens: 12288
38 | tensor_parallel_size: 1
39 |
40 |
--------------------------------------------------------------------------------
/unit_tests_server/convert_hf_to_verl_dataset.py:
--------------------------------------------------------------------------------
1 | # save as preprocess_mydata.py
2 | from datasets import load_from_disk
3 | import argparse, os, re
4 |
def parse_args():
    """Build the converter's command-line parser and parse ``sys.argv``.

    Returns:
        argparse.Namespace: carries ``dataset_path`` and ``out_dir``; both
        options are mandatory.
    """
    cli = argparse.ArgumentParser(description="Convert HuggingFace dataset to veRL format")
    mandatory_options = (
        ("--dataset_path", "Path to the HuggingFace dataset"),
        ("--out_dir", "Output directory for the converted dataset"),
    )
    for flag, help_text in mandatory_options:
        cli.add_argument(flag, required=True, help=help_text)
    return cli.parse_args()
10 |
def extract_dataset_name(dataset_path):
    """
    Derive a short dataset name from a local path or a HuggingFace dataset ID.

    Trailing path separators are ignored, so ``"data/my_ds/"`` yields
    ``"my_ds"`` instead of an empty string.

    Args:
        dataset_path (str): Filesystem path or hub-style ID (e.g. "myorg/mydata").

    Returns:
        str: For a path that exists on disk, its last component without the
        extension; otherwise the text after the last "/".
    """
    # Drop trailing separators; fall back to the raw value if nothing is left
    # (e.g. the path was just "/").
    trimmed = dataset_path.rstrip("/" + os.sep) or dataset_path

    # Check if it's a file path
    if os.path.exists(dataset_path):
        return os.path.splitext(os.path.basename(trimmed))[0]

    # Otherwise, assume it's a HuggingFace dataset ID (e.g., "myorg/mydata")
    return trimmed.split('/')[-1]
23 |
def main():
    """Convert the dataset at ``--dataset_path`` into a single veRL-style parquet file.

    The result is written to ``<out_dir>.parquet``: the ``--out_dir`` value is
    used as a path prefix, not as a containing directory.
    """
    args = parse_args()

    # Load the dataset
    ds = load_from_disk(args.dataset_path)

    output_path = f"{args.out_dir}.parquet"

    # Make sure the folder that will actually hold the parquet file exists.
    # (Creating `args.out_dir` itself would only leave an empty directory
    # next to the output file.)
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # NOTE(review): `to_parquet` is called directly on the mapped object, so
    # this presumably expects a single Dataset rather than a multi-split
    # DatasetDict at `dataset_path` - confirm.
    ds.map(
        make_map_fn(args),
        with_indices=True
    ).to_parquet(output_path)

    print(f"Dataset converted and saved to {output_path}")
46 |
47 |
# 1️⃣ optional helper: pull the ground-truth value out of one record
def extract_gt(example):
    """Return the value stored under the ``"label"`` key of ``example``.

    Adapt the key to your dataset.
    """
    ground_truth = example["label"]
    return ground_truth
51 |
# 2️⃣ factory for the per-row mapping function veRL asks for
def make_map_fn(args):
    """Create the row-conversion callable to pass to ``Dataset.map``.

    Args:
        args: Parsed CLI namespace; only ``args.dataset_path`` is read, to
            derive the ``data_source`` tag.

    Returns:
        callable: ``(example, idx) -> dict`` producing one veRL-style row.
    """
    source_name = extract_dataset_name(args.dataset_path)

    def _to_verl_row(example, idx):
        # `idx` is part of the two-argument signature but is not used in the row.
        # if your template needs a system msg, add it to the prompt list below
        user_message = {"role": "user", "content": example["prompt"].strip()}
        reward_spec = {
            "style": "rule",
            "ground_truth": example['verification_info'],
        }
        extras = {"gold_standard_solution": example['gold_standard_solution']}
        return {
            "data_source": source_name,
            "prompt": [user_message],
            "ability": "open_ended",
            "reward_model": reward_spec,
            "extra_info": extras,
        }

    return _to_verl_row
70 |
71 | if __name__ == "__main__":
72 | main()
73 |
--------------------------------------------------------------------------------
/check_grammar.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import xgrammar as xgr
3 | from transformers import AutoTokenizer, AutoConfig
4 | import pdb
5 |
def parse_arguments():
    """
    Parse command line arguments for grammar checking.

    ``--grammar_file`` and ``--string_file`` are mandatory: the script opens
    both unconditionally, so omitting one now produces a usage error instead
    of a ``TypeError`` from ``open(None)``.

    Returns:
        argparse.Namespace: The parsed command line arguments
        (``grammar_file``, ``string_file``, ``model_id``).
    """
    parser = argparse.ArgumentParser(description="Check if a string conforms to a specified grammar.")
    parser.add_argument("--grammar_file", type=str, required=True, help="Path to the file containing the grammar definition.")
    parser.add_argument("--string_file", type=str, required=True, help="Path to the file containing the string to check against the grammar.")
    parser.add_argument("--model_id", type=str, default="meta-llama/Llama-3.2-1B-Instruct", help="Model ID to use for tokenization.")

    return parser.parse_args()
19 |
if __name__ == "__main__":
    # Script entry point: tokenize the candidate string and feed it, token by
    # token, to an xgrammar matcher compiled from the EBNF grammar file.
    args = parse_arguments()

    with open(args.string_file, 'r') as f:
        candidate = f.read().strip()
    print(f"candidate: {candidate}")

    with open(args.grammar_file, 'r') as f:
        ebnf_source = f.read()

    print(f'Grammar: \n{ebnf_source}')

    tokenizer = AutoTokenizer.from_pretrained(args.model_id)
    # The vocab size is read from the model config rather than the tokenizer.
    full_vocab_size = AutoConfig.from_pretrained(args.model_id).vocab_size
    tok_info = xgr.TokenizerInfo.from_huggingface(tokenizer,
                                                  vocab_size=full_vocab_size)

    compiler = xgr.GrammarCompiler(tok_info)  #, allow_isolated_special=True)
    # three common options ⬇︎
    # compiled = compiler.compile_builtin_json_grammar()   # built-in JSON
    # compiled = compiler.compile_json_schema(schema_str)  # JSON schema
    compiled = compiler.compile_grammar(ebnf_source)       # any EBNF text

    matcher = xgr.GrammarMatcher(compiled)

    token_ids = tokenizer.encode(candidate, add_special_tokens=False)

    ok = True
    good_up_to = []
    failed_token = None
    for tok in token_ids:
        if not matcher.accept_token(tok):  # returns False = mismatch
            ok = False
            failed_token = tok
            break
        good_up_to.append(tok)

    # the string matches the grammar **iff**
    #   • every token was accepted, and
    #   • the matcher reached a terminal state.
    # NOTE(review): no stop token is fed and the matcher is built with default
    # options; confirm `is_terminated()` can become True in this setup.
    ok = ok and matcher.is_terminated()  # end must be legal
    print("valid!" if ok else "invalid!")
    if not ok:
        print("parsed up to:")
        print(tokenizer.decode(good_up_to))
        if failed_token is not None:
            print(f"failed token: {tokenizer.decode(failed_token)} {failed_token}")
        else:
            # Every token was accepted; only the termination check failed, so
            # there is no offending token to decode.
            print("failed token: none (all tokens accepted, but the matcher did not terminate)")

    print(f"ok: {ok}")
--------------------------------------------------------------------------------
/unit_tests_server/check_grammar.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import xgrammar as xgr
3 | from transformers import AutoTokenizer, AutoConfig
4 | import pdb
5 |
def parse_arguments():
    """
    Parse command line arguments for grammar checking.

    ``--grammar_file`` and ``--string_file`` are mandatory: the script opens
    both unconditionally, so omitting one now produces a usage error instead
    of a ``TypeError`` from ``open(None)``.

    Returns:
        argparse.Namespace: The parsed command line arguments
        (``grammar_file``, ``string_file``, ``model_id``).
    """
    parser = argparse.ArgumentParser(description="Check if a string conforms to a specified grammar.")
    parser.add_argument("--grammar_file", type=str, required=True, help="Path to the file containing the grammar definition.")
    parser.add_argument("--string_file", type=str, required=True, help="Path to the file containing the string to check against the grammar.")
    parser.add_argument("--model_id", type=str, default="meta-llama/Llama-3.2-1B-Instruct", help="Model ID to use for tokenization.")

    return parser.parse_args()
19 |
if __name__ == "__main__":
    # Script entry point: tokenize the candidate string and feed it, token by
    # token, to an xgrammar matcher compiled from the EBNF grammar file.
    args = parse_arguments()

    with open(args.string_file, 'r') as f:
        candidate = f.read().strip()
    print(f"candidate: {candidate}")

    with open(args.grammar_file, 'r') as f:
        ebnf_source = f.read()

    print(f'Grammar: \n{ebnf_source}')

    tokenizer = AutoTokenizer.from_pretrained(args.model_id)
    # The vocab size is read from the model config rather than the tokenizer.
    full_vocab_size = AutoConfig.from_pretrained(args.model_id).vocab_size
    tok_info = xgr.TokenizerInfo.from_huggingface(tokenizer,
                                                  vocab_size=full_vocab_size)

    compiler = xgr.GrammarCompiler(tok_info)  #, allow_isolated_special=True)
    # three common options ⬇︎
    # compiled = compiler.compile_builtin_json_grammar()   # built-in JSON
    # compiled = compiler.compile_json_schema(schema_str)  # JSON schema
    compiled = compiler.compile_grammar(ebnf_source)       # any EBNF text

    matcher = xgr.GrammarMatcher(compiled)

    token_ids = tokenizer.encode(candidate, add_special_tokens=False)

    ok = True
    good_up_to = []
    failed_token = None
    for tok in token_ids:
        if not matcher.accept_token(tok):  # returns False = mismatch
            ok = False
            failed_token = tok
            break
        good_up_to.append(tok)

    # the string matches the grammar **iff**
    #   • every token was accepted, and
    #   • the matcher reached a terminal state.
    # NOTE(review): no stop token is fed and the matcher is built with default
    # options; confirm `is_terminated()` can become True in this setup.
    ok = ok and matcher.is_terminated()  # end must be legal
    print("valid!" if ok else "invalid!")
    if not ok:
        print("parsed up to:")
        print(tokenizer.decode(good_up_to))
        if failed_token is not None:
            print(f"failed token: {tokenizer.decode(failed_token)} {failed_token}")
        else:
            # Every token was accepted; only the termination check failed, so
            # there is no offending token to decode.
            print("failed token: none (all tokens accepted, but the matcher did not terminate)")
        # The interactive `pdb.set_trace()` breakpoint that used to sit here was
        # dropped: it blocked non-interactive runs of the checker.

    print(f"ok: {ok}")
--------------------------------------------------------------------------------
/unit_tests_server/test_unit_test.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import json
3 | import pdb
4 | from datasets import load_from_disk
5 | from transformers import AutoTokenizer
6 | import concurrent.futures
7 |
def make_api_call(payload, url="http://0.0.0.0:5000/get_reward", timeout=None):
    """POST ``payload`` as JSON to the reward endpoint and report success.

    Args:
        payload: JSON-serializable request body.
        url (str): Endpoint to call.
        timeout (float | None): Seconds handed to ``requests.post``. The
            default ``None`` waits indefinitely, matching the previous behavior.

    Returns:
        bool: True when the server answers with HTTP 200; False for any other
        status code or when the request could not be completed at all.
    """
    headers = {
        "Content-Type": "application/json"
    }
    try:
        response = requests.post(url, data=json.dumps(payload), headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Connection refused, timeout, etc.: report it like any other failed call
        # instead of aborting the script with a traceback.
        print(f"API call failed before a response was received: {exc}")
        return False

    if response.status_code == 200:
        response_json = response.json()
        print(f"API call successful. Response: {response_json}")
        return True
    else:
        print(f"API call failed with status code {response.status_code}. Response:")
        print(response.text)
        return False
22 |
if __name__ == "__main__":
    # Smoke test for the reward server: wraps a tiny stdin/stdout program in a
    # chat transcript and posts it to the /get_reward endpoint.

    correct_code = r"""
a = int(input())
b = int(input())
print(a + b)
"""

    incorrect_code = r"""
a = int(input())
b = int(input())
print(a * b)
"""

    unit_test = r"""1
2
"""

    unit_test_correct_output = r"""3
"""

    unit_test_incorrect_output = r"""2
"""

    # (code, test input, expected output) triples; the trailing number is the
    # annotation carried by each case.
    pairs = [
        (correct_code, unit_test, unit_test_correct_output),  # 1
        # (incorrect_code, unit_test, unit_test_correct_output),  # 0
        # (incorrect_code, unit_test, unit_test_incorrect_output),  # -1
    ]

    # Label string in Python-literal form; newlines inside the reference
    # implementation are escaped so it stays on one line.
    labels = "{'test_cases': [{'type': 'stdin_stdout', 'input': '1\\n2', 'output': '3'}], " + "'reference_implementation': " + '"' + correct_code.replace("\n", "\\n") + '"}'

    # Loop-invariant setup: load the tokenizer once instead of once per pair.
    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Coder-0.5B-Instruct")
    url = "http://0.0.0.0:5432/get_reward"

    for code, test_input, expected_output in pairs:
        # code = code.replace("\n", "\\n")
        unit_tests = f"{test_input}{expected_output}"
        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "solve plz"},
            {"role": "assistant", "content": f"```python\n{code}```\n{unit_tests}"}
        ]

        inputs = tokenizer.apply_chat_template(messages, tokenize=False)

        payload = {
            "query": [inputs],
            "prompts": ["asdf"],
            "labels": [labels],
            "step": 2,
            "reward_config": {
                "n_steps": 200,
                "target_precision": 0.01,
                "warmup_steps": 10000,
                "max_time": 100000,
                "code_format":"pi_verifiable",
                "thinking_length_weight": 0.0
            }
        }

        make_api_call(payload, url=url)
88 |
89 |
--------------------------------------------------------------------------------
/experiments/u2_7B/run.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
# Starts a single-node Ray head, submits the OpenRLHF `train_ppo_ray` job for
# this experiment, and stops Ray again when the script exits.
#
# NOTE(review): RM_CONFIG_save_file points under /home/rosmine/data2/ray/ while
# the other outputs live under DATA_DIR (/home/rosmine/data2/rl/) - confirm.
set -euo pipefail

exp_name=u2_7B
DATA_DIR="/home/rosmine/data2/rl/${exp_name}"
OPENRLHF_DIR=/home/rosmine/projects/OpenRLHF

# Stop Ray on every exit path (including a failed start or submit), and let the
# failing command's exit status propagate instead of being masked.
trap 'ray stop' EXIT

ray start --head --port=6380 --node-ip-address=127.0.0.1 --temp-dir="${DATA_DIR}"

echo "ray start"

ray job submit --address="http://127.0.0.1:8265" \
   --runtime-env-json="{\"working_dir\": \"${OPENRLHF_DIR}\"}" \
   -- python3 -m openrlhf.cli.train_ppo_ray \
   --ref_num_nodes 1 \
   --ref_num_gpus_per_node 4 \
   --actor_num_nodes 1 \
   --actor_num_gpus_per_node 4 \
   --vllm_num_engines 1 \
   --vllm_tensor_parallel_size 2 \
   --vllm_gpu_memory_utilization 0.95 \
   --save_path "${DATA_DIR}/save_path" \
   --ckpt_path "${DATA_DIR}/checkpoint" \
   --pretrain Qwen/Qwen2.5-Coder-7B-Instruct \
   --save_steps 1 \
   --logging_steps 1 \
   --eval_steps -1 \
   --micro_train_batch_size 2 \
   --train_batch_size 16 \
   --micro_rollout_batch_size 4 \
   --rollout_batch_size 32 \
   --n_samples_per_prompt 16 \
   --rm_batch_size 1 \
   --max_epochs 2 \
   --prompt_max_len 2048 \
   --generate_max_len 2048 \
   --zero_stage 2 \
   --bf16 \
   --actor_learning_rate 3e-6 \
   --gradient_checkpointing \
   --init_kl_coef 0.0 \
   --apply_chat_template \
   --prompt_data /home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_with_unit_tests_v2_train \
   --input_key prompt \
   --label_key verification_info \
   --max_samples 100000 \
   --normalize_reward \
   --load_checkpoint \
   --advantage_estimator group_norm \
   --remote_rm_url http://localhost:5432/get_reward \
   --use_tensorboard "logs/${exp_name}" \
   --vllm_sync_backend nccl \
   --enforce_eager \
   --save_hf_ckpt \
   --disable_ds_ckpt \
   --RM_CONFIG_save_threshold 60 \
   --RM_CONFIG_n_steps 100 \
   --RM_CONFIG_save_file "/home/rosmine/data2/ray/${exp_name}/optimizer_code_output" \
   --RM_CONFIG_aux_decay 1 \
   --RM_CONFIG_n_trials 10 \
   --RM_CONFIG_dfg_complexity_weight 0.5 \
   --RM_CONFIG_code_bleu_weight 0.5 \
   --RM_CONFIG_timeout_seconds 120 \
   --RM_CONFIG_aux_coef 0.0 \
   --RM_CONFIG_aux_coef_warmup 0 \
   --RM_CONFIG_useful_line_ratio_coef 0.0 \
   --RM_CONFIG_batch_size 128 \
   --RM_CONFIG_use_input_format_reward true \
   --RM_CONFIG_code_format "pi_verifiable" \
   --RM_CONFIG_max_time 10.0 \
   --RM_CONFIG_thinking_length_weight 0 \
   --entropy_coef 0.0 \
   --num_episodes 1 \
   --outlier_reward_filter -10.0 \
   --ring_attn_size 4 \
   --lora_alpha 128 \
   --lora_rank 64 \
   --packing_samples \
   --flash_attn \
   --adam_offload \
   --lr_warmup_ratio 0.001

# Disabled flags, kept for reference (append to the command above to enable):
#   --colocate_all_models
#   --vllm_enable_sleep
#   --deepspeed_enable_sleep
#   --deepspeed_enable_super_sleep

# `ray stop` runs via the EXIT trap installed above.
87 | # --lora_alpha 128 \
88 | # --lora_rank 64
89 | #--pretrain $(pwd)/checkpoint/7B_sft_v2 \
90 | #--flash_attn \
91 | #--pretrain deepseek-ai/DeepSeek-R1-Distill-Qwen-14B \
92 | #--vllm_sync_with_ray \
93 | #--pretrain Qwen/Qwen2.5-Coder-0.5B-Instruct \
94 |
--------------------------------------------------------------------------------
/experiments/u2_7B_3e-5/run.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
# Starts a single-node Ray head, submits the OpenRLHF `train_ppo_ray` job for
# this experiment, and stops Ray again when the script exits.
#
# NOTE(review): RM_CONFIG_save_file points under /home/rosmine/data2/ray/ while
# the other outputs live under DATA_DIR (/home/rosmine/data2/rl/) - confirm.
set -euo pipefail

exp_name=u2_7B_3e-5
DATA_DIR="/home/rosmine/data2/rl/${exp_name}"
OPENRLHF_DIR=/home/rosmine/projects/OpenRLHF

# Stop Ray on every exit path (including a failed start or submit), and let the
# failing command's exit status propagate instead of being masked.
trap 'ray stop' EXIT

ray start --head --port=6380 --node-ip-address=127.0.0.1 --temp-dir="${DATA_DIR}"

echo "ray start"

ray job submit --address="http://127.0.0.1:8265" \
   --runtime-env-json="{\"working_dir\": \"${OPENRLHF_DIR}\"}" \
   -- python3 -m openrlhf.cli.train_ppo_ray \
   --ref_num_nodes 1 \
   --ref_num_gpus_per_node 4 \
   --actor_num_nodes 1 \
   --actor_num_gpus_per_node 4 \
   --vllm_num_engines 1 \
   --vllm_tensor_parallel_size 2 \
   --vllm_gpu_memory_utilization 0.95 \
   --save_path "${DATA_DIR}/save_path" \
   --ckpt_path "${DATA_DIR}/checkpoint" \
   --pretrain Qwen/Qwen2.5-Coder-7B-Instruct \
   --save_steps 1 \
   --logging_steps 1 \
   --eval_steps -1 \
   --micro_train_batch_size 2 \
   --train_batch_size 16 \
   --micro_rollout_batch_size 4 \
   --rollout_batch_size 32 \
   --n_samples_per_prompt 16 \
   --rm_batch_size 1 \
   --max_epochs 2 \
   --prompt_max_len 2048 \
   --generate_max_len 2048 \
   --zero_stage 2 \
   --bf16 \
   --actor_learning_rate 3e-5 \
   --gradient_checkpointing \
   --init_kl_coef 0.0 \
   --apply_chat_template \
   --prompt_data /home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_with_unit_tests_v2_train \
   --input_key prompt \
   --label_key verification_info \
   --max_samples 100000 \
   --normalize_reward \
   --load_checkpoint \
   --advantage_estimator group_norm \
   --remote_rm_url http://localhost:5432/get_reward \
   --use_tensorboard "logs/${exp_name}" \
   --vllm_sync_backend nccl \
   --enforce_eager \
   --save_hf_ckpt \
   --disable_ds_ckpt \
   --RM_CONFIG_save_threshold 60 \
   --RM_CONFIG_n_steps 100 \
   --RM_CONFIG_save_file "/home/rosmine/data2/ray/${exp_name}/optimizer_code_output" \
   --RM_CONFIG_aux_decay 1 \
   --RM_CONFIG_n_trials 10 \
   --RM_CONFIG_dfg_complexity_weight 0.5 \
   --RM_CONFIG_code_bleu_weight 0.5 \
   --RM_CONFIG_timeout_seconds 120 \
   --RM_CONFIG_aux_coef 0.0 \
   --RM_CONFIG_aux_coef_warmup 0 \
   --RM_CONFIG_useful_line_ratio_coef 0.0 \
   --RM_CONFIG_batch_size 128 \
   --RM_CONFIG_use_input_format_reward true \
   --RM_CONFIG_code_format "pi_verifiable" \
   --RM_CONFIG_max_time 10.0 \
   --RM_CONFIG_thinking_length_weight 0 \
   --entropy_coef 0.0 \
   --num_episodes 1 \
   --outlier_reward_filter -10.0 \
   --ring_attn_size 4 \
   --lora_alpha 128 \
   --lora_rank 64 \
   --packing_samples \
   --flash_attn \
   --adam_offload \
   --lr_warmup_ratio 0.001

# Disabled flags, kept for reference (append to the command above to enable):
#   --colocate_all_models
#   --vllm_enable_sleep
#   --deepspeed_enable_sleep
#   --deepspeed_enable_super_sleep

# `ray stop` runs via the EXIT trap installed above.
87 | # --lora_alpha 128 \
88 | # --lora_rank 64
89 | #--pretrain $(pwd)/checkpoint/7B_sft_v2 \
90 | #--flash_attn \
91 | #--pretrain deepseek-ai/DeepSeek-R1-Distill-Qwen-14B \
92 | #--vllm_sync_with_ray \
93 | #--pretrain Qwen/Qwen2.5-Coder-0.5B-Instruct \
94 |
--------------------------------------------------------------------------------
/experiments/u2_7B_base_sft/run.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
# Starts a single-node Ray head, submits the OpenRLHF `train_ppo_ray` job for
# this experiment (initialized from a local SFT checkpoint), and stops Ray
# again when the script exits.
#
# NOTE(review): RM_CONFIG_save_file points under /home/rosmine/data2/ray/ while
# the other outputs live under DATA_DIR (/home/rosmine/data2/rl/) - confirm.
set -euo pipefail

exp_name=u2_7B_bs
DATA_DIR="/home/rosmine/data2/rl/${exp_name}"
OPENRLHF_DIR=/home/rosmine/projects/OpenRLHF

# Stop Ray on every exit path (including a failed start or submit), and let the
# failing command's exit status propagate instead of being masked.
trap 'ray stop' EXIT

ray start --head --port=6380 --node-ip-address=127.0.0.1 --temp-dir="${DATA_DIR}"

echo "ray start"

ray job submit --address="http://127.0.0.1:8265" \
   --runtime-env-json="{\"working_dir\": \"${OPENRLHF_DIR}\"}" \
   -- python3 -m openrlhf.cli.train_ppo_ray \
   --ref_num_nodes 1 \
   --ref_num_gpus_per_node 4 \
   --actor_num_nodes 1 \
   --actor_num_gpus_per_node 4 \
   --vllm_num_engines 1 \
   --vllm_tensor_parallel_size 2 \
   --vllm_gpu_memory_utilization 0.95 \
   --save_path "${DATA_DIR}/save_path" \
   --ckpt_path "${DATA_DIR}/checkpoint" \
   --pretrain /home/rosmine/data2/rl/u3_sft_1e-5_vllm/ \
   --save_steps 1 \
   --logging_steps 1 \
   --eval_steps -1 \
   --micro_train_batch_size 2 \
   --train_batch_size 16 \
   --micro_rollout_batch_size 4 \
   --rollout_batch_size 32 \
   --n_samples_per_prompt 16 \
   --rm_batch_size 1 \
   --max_epochs 2 \
   --prompt_max_len 2048 \
   --generate_max_len 2048 \
   --zero_stage 2 \
   --bf16 \
   --actor_learning_rate 1e-5 \
   --gradient_checkpointing \
   --init_kl_coef 0.0 \
   --apply_chat_template \
   --prompt_data /home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_with_unit_tests_v2_train \
   --input_key prompt \
   --label_key verification_info \
   --max_samples 100000 \
   --normalize_reward \
   --load_checkpoint \
   --advantage_estimator group_norm \
   --remote_rm_url http://localhost:5432/get_reward \
   --use_tensorboard "logs/${exp_name}" \
   --vllm_sync_backend nccl \
   --enforce_eager \
   --save_hf_ckpt \
   --disable_ds_ckpt \
   --RM_CONFIG_save_threshold 60 \
   --RM_CONFIG_n_steps 100 \
   --RM_CONFIG_save_file "/home/rosmine/data2/ray/${exp_name}/optimizer_code_output" \
   --RM_CONFIG_aux_decay 1 \
   --RM_CONFIG_n_trials 10 \
   --RM_CONFIG_dfg_complexity_weight 0.5 \
   --RM_CONFIG_code_bleu_weight 0.5 \
   --RM_CONFIG_timeout_seconds 120 \
   --RM_CONFIG_aux_coef 0.0 \
   --RM_CONFIG_aux_coef_warmup 0 \
   --RM_CONFIG_useful_line_ratio_coef 0.0 \
   --RM_CONFIG_batch_size 128 \
   --RM_CONFIG_use_input_format_reward true \
   --RM_CONFIG_code_format "pi_verifiable" \
   --RM_CONFIG_max_time 10.0 \
   --RM_CONFIG_thinking_length_weight 0 \
   --entropy_coef 0.0 \
   --num_episodes 1 \
   --outlier_reward_filter -10.0 \
   --ring_attn_size 4 \
   --lora_alpha 128 \
   --lora_rank 64 \
   --packing_samples \
   --flash_attn \
   --adam_offload \
   --lr_warmup_ratio 0.001

# Disabled flags, kept for reference (append to the command above to enable):
#   --colocate_all_models
#   --vllm_enable_sleep
#   --deepspeed_enable_sleep
#   --deepspeed_enable_super_sleep

# `ray stop` runs via the EXIT trap installed above.
87 | # --lora_alpha 128 \
88 | # --lora_rank 64
89 | #--pretrain $(pwd)/checkpoint/7B_sft_v2 \
90 | #--flash_attn \
91 | #--pretrain deepseek-ai/DeepSeek-R1-Distill-Qwen-14B \
92 | #--vllm_sync_with_ray \
93 | #--pretrain Qwen/Qwen2.5-Coder-0.5B-Instruct \
94 |
--------------------------------------------------------------------------------
/experiments/u2_7B_few_shot/run.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
# Starts a single-node Ray head, submits the OpenRLHF `train_ppo_ray` job for
# this experiment (few-shot prompt dataset), and stops Ray again when the
# script exits.
#
# NOTE(review): RM_CONFIG_save_file points under /home/rosmine/data2/ray/ while
# the other outputs live under DATA_DIR (/home/rosmine/data2/rl/) - confirm.
set -euo pipefail

exp_name=u2_7B_fs
DATA_DIR="/home/rosmine/data2/rl/${exp_name}"
OPENRLHF_DIR=/home/rosmine/projects/OpenRLHF

# Stop Ray on every exit path (including a failed start or submit), and let the
# failing command's exit status propagate instead of being masked.
trap 'ray stop' EXIT

ray start --head --port=6380 --node-ip-address=127.0.0.1 --temp-dir="${DATA_DIR}"

echo "ray start"

ray job submit --address="http://127.0.0.1:8265" \
   --runtime-env-json="{\"working_dir\": \"${OPENRLHF_DIR}\"}" \
   -- python3 -m openrlhf.cli.train_ppo_ray \
   --ref_num_nodes 1 \
   --ref_num_gpus_per_node 4 \
   --actor_num_nodes 1 \
   --actor_num_gpus_per_node 4 \
   --vllm_num_engines 1 \
   --vllm_tensor_parallel_size 2 \
   --vllm_gpu_memory_utilization 0.95 \
   --save_path "${DATA_DIR}/save_path" \
   --ckpt_path "${DATA_DIR}/checkpoint" \
   --pretrain Qwen/Qwen2.5-Coder-7B-Instruct \
   --save_steps 1 \
   --logging_steps 1 \
   --eval_steps -1 \
   --micro_train_batch_size 2 \
   --train_batch_size 16 \
   --micro_rollout_batch_size 4 \
   --rollout_batch_size 32 \
   --n_samples_per_prompt 16 \
   --rm_batch_size 1 \
   --max_epochs 2 \
   --prompt_max_len 2048 \
   --generate_max_len 2048 \
   --zero_stage 2 \
   --bf16 \
   --actor_learning_rate 3e-5 \
   --gradient_checkpointing \
   --init_kl_coef 0.0 \
   --apply_chat_template \
   --prompt_data /home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_with_unit_tests_v2_train_few_shot \
   --input_key prompt \
   --label_key verification_info \
   --max_samples 100000 \
   --normalize_reward \
   --load_checkpoint \
   --advantage_estimator group_norm \
   --remote_rm_url http://localhost:5432/get_reward \
   --use_tensorboard "logs/${exp_name}" \
   --vllm_sync_backend nccl \
   --enforce_eager \
   --save_hf_ckpt \
   --disable_ds_ckpt \
   --RM_CONFIG_save_threshold 60 \
   --RM_CONFIG_n_steps 100 \
   --RM_CONFIG_save_file "/home/rosmine/data2/ray/${exp_name}/optimizer_code_output" \
   --RM_CONFIG_aux_decay 1 \
   --RM_CONFIG_n_trials 10 \
   --RM_CONFIG_dfg_complexity_weight 0.5 \
   --RM_CONFIG_code_bleu_weight 0.5 \
   --RM_CONFIG_timeout_seconds 120 \
   --RM_CONFIG_aux_coef 0.0 \
   --RM_CONFIG_aux_coef_warmup 0 \
   --RM_CONFIG_useful_line_ratio_coef 0.0 \
   --RM_CONFIG_batch_size 128 \
   --RM_CONFIG_use_input_format_reward true \
   --RM_CONFIG_code_format "pi_verifiable" \
   --RM_CONFIG_max_time 10.0 \
   --RM_CONFIG_thinking_length_weight 0 \
   --entropy_coef 0.0 \
   --num_episodes 1 \
   --outlier_reward_filter -10.0 \
   --ring_attn_size 4 \
   --lora_alpha 128 \
   --lora_rank 64 \
   --packing_samples \
   --flash_attn \
   --adam_offload \
   --lr_warmup_ratio 0.001

# Disabled flags, kept for reference (append to the command above to enable):
#   --colocate_all_models
#   --vllm_enable_sleep
#   --deepspeed_enable_sleep
#   --deepspeed_enable_super_sleep

# `ray stop` runs via the EXIT trap installed above.
87 | # --lora_alpha 128 \
88 | # --lora_rank 64
89 | #--pretrain $(pwd)/checkpoint/7B_sft_v2 \
90 | #--flash_attn \
91 | #--pretrain deepseek-ai/DeepSeek-R1-Distill-Qwen-14B \
92 | #--vllm_sync_with_ray \
93 | #--pretrain Qwen/Qwen2.5-Coder-0.5B-Instruct \
94 |
--------------------------------------------------------------------------------
/experiments/u1_7B/run.sh:
--------------------------------------------------------------------------------
1 | exp_name=u1_7B
2 |
3 | ray start --head --port=6380 --node-ip-address=127.0.0.1 --temp-dir=/home/rosmine/data2/rl/${exp_name}
4 |
5 | DATA_DIR=/home/rosmine/data2/rl/${exp_name}
6 |
7 | echo "ray start"
8 |
9 | OPENRLHF_DIR=/home/rosmine/projects/OpenRLHF
10 |
11 | ray job submit --address="http://127.0.0.1:8265" \
12 | --runtime-env-json='{"working_dir": "'${OPENRLHF_DIR}'"}' \
13 | -- python3 -m openrlhf.cli.train_ppo_ray \
14 | --ref_num_nodes 1 \
15 | --ref_num_gpus_per_node 4 \
16 | --actor_num_nodes 1 \
17 | --actor_num_gpus_per_node 4 \
18 | --vllm_num_engines 1 \
19 | --vllm_tensor_parallel_size 2 \
20 | --vllm_gpu_memory_utilization 0.95 \
21 | --save_path ${DATA_DIR}/save_path \
22 | --ckpt_path ${DATA_DIR}/checkpoint \
23 | --pretrain Qwen/Qwen2.5-Coder-7B-Instruct \
24 | --save_steps 1 \
25 | --logging_steps 1 \
26 | --eval_steps -1 \
27 | --micro_train_batch_size 2 \
28 | --train_batch_size 16 \
29 | --micro_rollout_batch_size 4 \
30 | --rollout_batch_size 4 \
31 | --n_samples_per_prompt 16 \
32 | --rm_batch_size 1 \
33 | --max_epochs 2 \
34 | --prompt_max_len 2048 \
35 | --generate_max_len 2048 \
36 | --zero_stage 2 \
37 | --bf16 \
38 | --actor_learning_rate 4e-6 \
39 | --gradient_checkpointing \
40 | --init_kl_coef 0.0 \
41 | --prompt_data /home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_train_with_unit_tests_v1 \
42 | --input_key prompt \
43 | --label_key verification_info \
44 | --max_samples 100000 \
45 | --normalize_reward \
46 | --load_checkpoint \
47 | --advantage_estimator group_norm \
48 | --remote_rm_url http://localhost:5432/get_reward \
49 | --use_tensorboard logs/${exp_name} \
50 | --vllm_sync_backend nccl \
51 | --enforce_eager \
52 | --save_hf_ckpt \
53 | --disable_ds_ckpt \
54 | --RM_CONFIG_save_threshold 60 \
55 | --RM_CONFIG_n_steps 100 \
56 | --RM_CONFIG_save_file /home/rosmine/data2/ray/${exp_name}/optimizer_code_output \
57 | --RM_CONFIG_aux_decay 1 \
58 | --RM_CONFIG_n_trials 10 \
59 | --RM_CONFIG_dfg_complexity_weight 0.5 \
60 | --RM_CONFIG_code_bleu_weight 0.5 \
61 | --RM_CONFIG_timeout_seconds 120 \
62 | --RM_CONFIG_aux_coef 0.0 \
63 | --RM_CONFIG_aux_coef_warmup 0 \
64 | --RM_CONFIG_useful_line_ratio_coef 0.0 \
65 | --RM_CONFIG_batch_size 128 \
66 | --RM_CONFIG_use_input_format_reward true \
67 | --RM_CONFIG_code_format "pi_verifiable" \
68 | --RM_CONFIG_max_time 10.0 \
69 | --RM_CONFIG_response_begin_str "Please make sure the code is efficient." \
70 | --RM_CONFIG_thinking_length_weight 0 \
71 | --entropy_coef 0.0 \
72 | --num_episodes 1 \
73 | --outlier_reward_filter -10.0 \
74 | --ring_attn_size 4 \
75 | --lora_alpha 128 \
76 | --lora_rank 64 \
77 | --packing_samples \
78 | --flash_attn \
79 | --adam_offload \
80 | --lr_warmup_ratio 0.001 \
81 | #--colocate_all_models \
82 | #--vllm_enable_sleep \
83 | #--deepspeed_enable_sleep \
84 | #--deepspeed_enable_super_sleep
85 | # Shut down Ray on this machine (the head node started at the top of this script) once the submit command returns.
86 | ray stop
87 | # --lora_alpha 128 \
88 | # --lora_rank 64
89 | #--pretrain $(pwd)/checkpoint/7B_sft_v2 \
90 | #--flash_attn \
91 | #--pretrain deepseek-ai/DeepSeek-R1-Distill-Qwen-14B \
92 | #--vllm_sync_with_ray \
93 | #--pretrain Qwen/Qwen2.5-Coder-0.5B-Instruct \
94 | # The commented-out flags above are unused alternatives kept for reference; nothing after `ray stop` is executed.
--------------------------------------------------------------------------------
/experiments/u1_14B/run.sh:
--------------------------------------------------------------------------------
1 | # Experiment u1_14B: start a local Ray head node, then launch PPO training via `ray job submit`.
2 | exp_name=u1_14B
3 | DATA_DIR="/home/rosmine/data2/rl/${exp_name}"
4 | OPENRLHF_DIR="/home/rosmine/projects/OpenRLHF"
5 |
6 | # Ray's temp dir is the same directory that later receives save_path/ and checkpoint/.
7 | ray start --head --port=6380 --node-ip-address=127.0.0.1 --temp-dir="${DATA_DIR}"
8 | echo "ray start"
9 |
10 | # Submit openrlhf.cli.train_ppo_ray to the job server at 127.0.0.1:8265 with OPENRLHF_DIR as working_dir.
11 | ray job submit --address="http://127.0.0.1:8265" \
12 | --runtime-env-json='{"working_dir": "'${OPENRLHF_DIR}'"}' \
13 | -- python3 -m openrlhf.cli.train_ppo_ray \
14 | --ref_num_nodes 1 \
15 | --ref_num_gpus_per_node 4 \
16 | --actor_num_nodes 1 \
17 | --actor_num_gpus_per_node 4 \
18 | --vllm_num_engines 2 \
19 | --vllm_tensor_parallel_size 1 \
20 | --vllm_gpu_memory_utilization 0.90 \
21 | --save_path ${DATA_DIR}/save_path \
22 | --ckpt_path ${DATA_DIR}/checkpoint \
23 | --pretrain Qwen/Qwen2.5-Coder-14B-Instruct \
24 | --save_steps 10 \
25 | --logging_steps 1 \
26 | --eval_steps -1 \
27 | --micro_train_batch_size 1 \
28 | --train_batch_size 48 \
29 | --micro_rollout_batch_size 2 \
30 | --rollout_batch_size 32 \
31 | --n_samples_per_prompt 16 \
32 | --rm_batch_size 1 \
33 | --max_epochs 2 \
34 | --prompt_max_len 2048 \
35 | --generate_max_len 2048 \
36 | --zero_stage 2 \
37 | --bf16 \
38 | --actor_learning_rate 3e-6 \
39 | --gradient_checkpointing \
40 | --init_kl_coef 0.0 \
41 | --prompt_data /home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_train_with_unit_tests_v1 \
42 | --input_key prompt \
43 | --label_key verification_info \
44 | --max_samples 100000 \
45 | --normalize_reward \
46 | --load_checkpoint \
47 | --advantage_estimator group_norm \
48 | --remote_rm_url http://localhost:5432/get_reward \
49 | --use_tensorboard logs/${exp_name} \
50 | --vllm_sync_backend nccl \
51 | --enforce_eager \
52 | --save_hf_ckpt \
53 | --disable_ds_ckpt \
54 | --RM_CONFIG_save_threshold 60 \
55 | --RM_CONFIG_n_steps 100 \
56 | --RM_CONFIG_save_file /home/rosmine/data2/ray/${exp_name}/optimizer_code_output \
57 | --RM_CONFIG_aux_decay 1 \
58 | --RM_CONFIG_n_trials 10 \
59 | --RM_CONFIG_dfg_complexity_weight 0.5 \
60 | --RM_CONFIG_code_bleu_weight 0.5 \
61 | --RM_CONFIG_timeout_seconds 120 \
62 | --RM_CONFIG_aux_coef 0.0 \
63 | --RM_CONFIG_aux_coef_warmup 0 \
64 | --RM_CONFIG_useful_line_ratio_coef 0.0 \
65 | --RM_CONFIG_batch_size 128 \
66 | --RM_CONFIG_use_input_format_reward true \
67 | --RM_CONFIG_code_format "pi_verifiable" \
68 | --RM_CONFIG_max_time 10.0 \
69 | --RM_CONFIG_response_begin_str "Please make sure the code is efficient." \
70 | --RM_CONFIG_thinking_length_weight 0 \
71 | --entropy_coef 0.0 \
72 | --num_episodes 1 \
73 | --outlier_reward_filter -10.0 \
74 | --ring_attn_size 4 \
75 | --lora_alpha 128 \
76 | --lora_rank 64 \
77 | --packing_samples \
78 | --flash_attn \
79 | --adam_offload \
80 | --lr_warmup_ratio 0.001 \
81 | #--colocate_all_models \
82 | #--vllm_enable_sleep \
83 | #--deepspeed_enable_sleep \
84 | #--deepspeed_enable_super_sleep
85 | # Shut down Ray on this machine (the head node started at the top of this script) once the submit command returns.
86 | ray stop
87 | # --lora_alpha 128 \
88 | # --lora_rank 64
89 | #--pretrain $(pwd)/checkpoint/7B_sft_v2 \
90 | #--flash_attn \
91 | #--pretrain deepseek-ai/DeepSeek-R1-Distill-Qwen-14B \
92 | #--vllm_sync_with_ray \
93 | #--pretrain Qwen/Qwen2.5-Coder-0.5B-Instruct \
94 | # The commented-out flags above are unused alternatives kept for reference; nothing after `ray stop` is executed.
--------------------------------------------------------------------------------
/experiments/u6_7B_sft/run.sh:
--------------------------------------------------------------------------------
1 | # Experiment u6_7B_sft: start a local Ray head node, then launch PPO training via `ray job submit`.
2 | exp_name=u6_7B_sft
3 | DATA_DIR="/home/rosmine/data2/rl/${exp_name}"
4 | OPENRLHF_DIR="/home/rosmine/projects/OpenRLHF"
5 |
6 | # Ray's temp dir is the same directory that later receives save_path/ and checkpoint/.
7 | ray start --head --port=6380 --node-ip-address=127.0.0.1 --temp-dir="${DATA_DIR}"
8 | echo "ray start"
9 |
10 | # Submit openrlhf.cli.train_ppo_ray to the job server at 127.0.0.1:8265 with OPENRLHF_DIR as working_dir.
11 | ray job submit --address="http://127.0.0.1:8265" \
12 | --runtime-env-json='{"working_dir": "'${OPENRLHF_DIR}'"}' \
13 | -- python3 -m openrlhf.cli.train_ppo_ray \
14 | --ref_num_nodes 1 \
15 | --ref_num_gpus_per_node 4 \
16 | --actor_num_nodes 1 \
17 | --actor_num_gpus_per_node 4 \
18 | --vllm_num_engines 1 \
19 | --vllm_tensor_parallel_size 2 \
20 | --vllm_gpu_memory_utilization 0.95 \
21 | --save_path ${DATA_DIR}/save_path \
22 | --ckpt_path ${DATA_DIR}/checkpoint \
23 | --pretrain /home/rosmine/data2/rl/u3_sft_1e-5_vllm/ \
24 | --save_steps 1 \
25 | --logging_steps 1 \
26 | --eval_steps -1 \
27 | --micro_train_batch_size 2 \
28 | --train_batch_size 16 \
29 | --micro_rollout_batch_size 4 \
30 | --rollout_batch_size 32 \
31 | --n_samples_per_prompt 16 \
32 | --rm_batch_size 1 \
33 | --max_epochs 2 \
34 | --prompt_max_len 2048 \
35 | --generate_max_len 2048 \
36 | --zero_stage 2 \
37 | --bf16 \
38 | --actor_learning_rate 1e-5 \
39 | --gradient_checkpointing \
40 | --init_kl_coef 0.0 \
41 | --apply_chat_template \
42 | --prompt_data /home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_with_unit_tests_v2_train_with_reference_no_examples \
43 | --input_key prompt \
44 | --label_key verification_info \
45 | --max_samples 100000 \
46 | --normalize_reward \
47 | --load_checkpoint \
48 | --advantage_estimator group_norm \
49 | --remote_rm_url http://localhost:5432/get_reward \
50 | --use_tensorboard logs/${exp_name} \
51 | --vllm_sync_backend nccl \
52 | --enforce_eager \
53 | --save_hf_ckpt \
54 | --disable_ds_ckpt \
55 | --early_stopping_steps 62 \
56 | --RM_CONFIG_save_threshold 60 \
57 | --RM_CONFIG_n_steps 100 \
58 | --RM_CONFIG_save_file /home/rosmine/data2/ray/${exp_name}/optimizer_code_output \
59 | --RM_CONFIG_aux_decay 1 \
60 | --RM_CONFIG_n_trials 10 \
61 | --RM_CONFIG_dfg_complexity_weight 0.5 \
62 | --RM_CONFIG_code_bleu_weight 0.5 \
63 | --RM_CONFIG_timeout_seconds 120 \
64 | --RM_CONFIG_aux_coef 0.0 \
65 | --RM_CONFIG_aux_coef_warmup 0 \
66 | --RM_CONFIG_useful_line_ratio_coef 0.0 \
67 | --RM_CONFIG_batch_size 128 \
68 | --RM_CONFIG_use_input_format_reward true \
69 | --RM_CONFIG_code_format "pi_verifiable" \
70 | --RM_CONFIG_max_time 10.0 \
71 | --RM_CONFIG_thinking_length_weight 0 \
72 | --entropy_coef 0.0 \
73 | --num_episodes 1 \
74 | --outlier_reward_filter -10.0 \
75 | --ring_attn_size 4 \
76 | --lora_alpha 128 \
77 | --lora_rank 64 \
78 | --packing_samples \
79 | --flash_attn \
80 | --adam_offload \
81 | --lr_warmup_ratio 0.001 \
82 | #--colocate_all_models \
83 | #--vllm_enable_sleep \
84 | #--deepspeed_enable_sleep \
85 | #--deepspeed_enable_super_sleep
86 | # Shut down Ray on this machine (the head node started at the top of this script) once the submit command returns.
87 | ray stop
88 | # --lora_alpha 128 \
89 | # --lora_rank 64
90 | #--pretrain $(pwd)/checkpoint/7B_sft_v2 \
91 | #--flash_attn \
92 | #--pretrain deepseek-ai/DeepSeek-R1-Distill-Qwen-14B \
93 | #--vllm_sync_with_ray \
94 | #--pretrain Qwen/Qwen2.5-Coder-0.5B-Instruct \
95 | # The commented-out flags above are unused alternatives kept for reference; nothing after `ray stop` is executed.
--------------------------------------------------------------------------------
/experiments/u6_7B_fs/run.sh:
--------------------------------------------------------------------------------
1 | # Experiment u6_7B_fs: start a local Ray head node, then launch PPO training via `ray job submit`.
2 | exp_name=u6_7B_fs
3 | DATA_DIR="/home/rosmine/data2/rl/${exp_name}"
4 | OPENRLHF_DIR="/home/rosmine/projects/OpenRLHF"
5 |
6 | # Ray's temp dir is the same directory that later receives save_path/ and checkpoint/.
7 | ray start --head --port=6380 --node-ip-address=127.0.0.1 --temp-dir="${DATA_DIR}"
8 | echo "ray start"
9 |
10 | # Submit openrlhf.cli.train_ppo_ray to the job server at 127.0.0.1:8265 with OPENRLHF_DIR as working_dir.
11 | ray job submit --address="http://127.0.0.1:8265" \
12 | --runtime-env-json='{"working_dir": "'${OPENRLHF_DIR}'"}' \
13 | -- python3 -m openrlhf.cli.train_ppo_ray \
14 | --ref_num_nodes 1 \
15 | --ref_num_gpus_per_node 4 \
16 | --actor_num_nodes 1 \
17 | --actor_num_gpus_per_node 4 \
18 | --vllm_num_engines 1 \
19 | --vllm_tensor_parallel_size 2 \
20 | --vllm_gpu_memory_utilization 0.95 \
21 | --save_path ${DATA_DIR}/save_path \
22 | --ckpt_path ${DATA_DIR}/checkpoint \
23 | --pretrain Qwen/Qwen2.5-Coder-7B-Instruct \
24 | --save_steps 1 \
25 | --logging_steps 1 \
26 | --eval_steps -1 \
27 | --micro_train_batch_size 2 \
28 | --train_batch_size 16 \
29 | --micro_rollout_batch_size 4 \
30 | --rollout_batch_size 32 \
31 | --n_samples_per_prompt 16 \
32 | --rm_batch_size 1 \
33 | --max_epochs 2 \
34 | --prompt_max_len 2048 \
35 | --generate_max_len 2048 \
36 | --zero_stage 2 \
37 | --bf16 \
38 | --actor_learning_rate 1e-5 \
39 | --gradient_checkpointing \
40 | --init_kl_coef 0.0 \
41 | --apply_chat_template \
42 | --prompt_data /home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_with_unit_tests_v2_train_with_reference_no_examples_few_shot_with_reference \
43 | --input_key prompt \
44 | --label_key verification_info \
45 | --max_samples 100000 \
46 | --normalize_reward \
47 | --load_checkpoint \
48 | --advantage_estimator group_norm \
49 | --remote_rm_url http://localhost:5432/get_reward \
50 | --use_tensorboard logs/${exp_name} \
51 | --vllm_sync_backend nccl \
52 | --enforce_eager \
53 | --save_hf_ckpt \
54 | --disable_ds_ckpt \
55 | --early_stopping_steps 62 \
56 | --RM_CONFIG_save_threshold 60 \
57 | --RM_CONFIG_n_steps 100 \
58 | --RM_CONFIG_save_file /home/rosmine/data2/ray/${exp_name}/optimizer_code_output \
59 | --RM_CONFIG_aux_decay 1 \
60 | --RM_CONFIG_n_trials 10 \
61 | --RM_CONFIG_dfg_complexity_weight 0.5 \
62 | --RM_CONFIG_code_bleu_weight 0.5 \
63 | --RM_CONFIG_timeout_seconds 120 \
64 | --RM_CONFIG_aux_coef 0.0 \
65 | --RM_CONFIG_aux_coef_warmup 0 \
66 | --RM_CONFIG_useful_line_ratio_coef 0.0 \
67 | --RM_CONFIG_batch_size 128 \
68 | --RM_CONFIG_use_input_format_reward true \
69 | --RM_CONFIG_code_format "pi_verifiable" \
70 | --RM_CONFIG_max_time 10.0 \
71 | --RM_CONFIG_thinking_length_weight 0 \
72 | --entropy_coef 0.0 \
73 | --num_episodes 1 \
74 | --outlier_reward_filter -10.0 \
75 | --ring_attn_size 4 \
76 | --lora_alpha 128 \
77 | --lora_rank 64 \
78 | --packing_samples \
79 | --flash_attn \
80 | --adam_offload \
81 | --lr_warmup_ratio 0.001 \
82 | #--colocate_all_models \
83 | #--vllm_enable_sleep \
84 | #--deepspeed_enable_sleep \
85 | #--deepspeed_enable_super_sleep
86 | # Shut down Ray on this machine (the head node started at the top of this script) once the submit command returns.
87 | ray stop
88 | # --lora_alpha 128 \
89 | # --lora_rank 64
90 | #--pretrain $(pwd)/checkpoint/7B_sft_v2 \
91 | #--flash_attn \
92 | #--pretrain deepseek-ai/DeepSeek-R1-Distill-Qwen-14B \
93 | #--vllm_sync_with_ray \
94 | #--pretrain Qwen/Qwen2.5-Coder-0.5B-Instruct \
95 | # The commented-out flags above are unused alternatives kept for reference; nothing after `ray stop` is executed.
--------------------------------------------------------------------------------
/experiments/u2_7B_gram/run.sh:
--------------------------------------------------------------------------------
1 | # Experiment u2_7B_gram: start a local Ray head node, then launch PPO training via `ray job submit`.
2 | exp_name=u2_7B_gram
3 | DATA_DIR="/home/rosmine/data2/rl/${exp_name}"
4 | OPENRLHF_DIR="/home/rosmine/projects/OpenRLHF"
5 |
6 | # Ray's temp dir is the same directory that later receives save_path/ and checkpoint/.
7 | ray start --head --port=6380 --node-ip-address=127.0.0.1 --temp-dir="${DATA_DIR}"
8 | echo "ray start"
9 |
10 | # Submit openrlhf.cli.train_ppo_ray to the job server at 127.0.0.1:8265 with OPENRLHF_DIR as working_dir.
11 | ray job submit --address="http://127.0.0.1:8265" \
12 | --runtime-env-json='{"working_dir": "'${OPENRLHF_DIR}'"}' \
13 | -- python3 -m openrlhf.cli.train_ppo_ray \
14 | --ref_num_nodes 1 \
15 | --ref_num_gpus_per_node 4 \
16 | --actor_num_nodes 1 \
17 | --actor_num_gpus_per_node 4 \
18 | --vllm_num_engines 1 \
19 | --vllm_tensor_parallel_size 2 \
20 | --vllm_gpu_memory_utilization 0.95 \
21 | --save_path ${DATA_DIR}/save_path \
22 | --ckpt_path ${DATA_DIR}/checkpoint \
23 | --pretrain Qwen/Qwen2.5-Coder-7B-Instruct \
24 | --save_steps 1 \
25 | --logging_steps 1 \
26 | --eval_steps -1 \
27 | --micro_train_batch_size 2 \
28 | --train_batch_size 16 \
29 | --micro_rollout_batch_size 4 \
30 | --rollout_batch_size 32 \
31 | --n_samples_per_prompt 16 \
32 | --rm_batch_size 1 \
33 | --max_epochs 2 \
34 | --prompt_max_len 2048 \
35 | --generate_max_len 2048 \
36 | --zero_stage 2 \
37 | --bf16 \
38 | --actor_learning_rate 1e-5 \
39 | --gradient_checkpointing \
40 | --init_kl_coef 0.0 \
41 | --apply_chat_template \
42 | --prompt_data /home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_with_unit_tests_v2_train \
43 | --input_key prompt \
44 | --label_key verification_info \
45 | --max_samples 100000 \
46 | --normalize_reward \
47 | --load_checkpoint \
48 | --advantage_estimator group_norm \
49 | --remote_rm_url http://localhost:5432/get_reward \
50 | --use_tensorboard logs/${exp_name} \
51 | --vllm_sync_backend nccl \
52 | --enforce_eager \
53 | --save_hf_ckpt \
54 | --disable_ds_ckpt \
55 | --RM_CONFIG_save_threshold 60 \
56 | --RM_CONFIG_n_steps 100 \
57 | --RM_CONFIG_save_file /home/rosmine/data2/ray/${exp_name}/optimizer_code_output \
58 | --RM_CONFIG_aux_decay 1 \
59 | --RM_CONFIG_n_trials 10 \
60 | --RM_CONFIG_dfg_complexity_weight 0.5 \
61 | --RM_CONFIG_code_bleu_weight 0.5 \
62 | --RM_CONFIG_timeout_seconds 120 \
63 | --RM_CONFIG_aux_coef 0.0 \
64 | --RM_CONFIG_aux_coef_warmup 0 \
65 | --RM_CONFIG_useful_line_ratio_coef 0.0 \
66 | --RM_CONFIG_batch_size 128 \
67 | --RM_CONFIG_use_input_format_reward true \
68 | --RM_CONFIG_code_format "pi_verifiable" \
69 | --RM_CONFIG_max_time 10.0 \
70 | --RM_CONFIG_thinking_length_weight 0 \
71 | --entropy_coef 0.0 \
72 | --num_episodes 1 \
73 | --outlier_reward_filter -10.0 \
74 | --ring_attn_size 4 \
75 | --lora_alpha 128 \
76 | --lora_rank 64 \
77 | --packing_samples \
78 | --flash_attn \
79 | --adam_offload \
80 | --lr_warmup_ratio 0.001 \
81 | --grammar_file /home/rosmine/projects/rl_codegen/experiments/unit_test/grammar/unit_test_grammar.txt
82 | #--colocate_all_models \
83 | #--vllm_enable_sleep \
84 | #--deepspeed_enable_sleep \
85 | #--deepspeed_enable_super_sleep
86 | # Shut down Ray on this machine (the head node started at the top of this script) once the submit command returns.
87 | ray stop
88 | # --lora_alpha 128 \
89 | # --lora_rank 64
90 | #--pretrain $(pwd)/checkpoint/7B_sft_v2 \
91 | #--flash_attn \
92 | #--pretrain deepseek-ai/DeepSeek-R1-Distill-Qwen-14B \
93 | #--vllm_sync_with_ray \
94 | #--pretrain Qwen/Qwen2.5-Coder-0.5B-Instruct \
95 | # The commented-out flags above are unused alternatives kept for reference; nothing after `ray stop` is executed.
--------------------------------------------------------------------------------
/experiments/u4_7B_gram/run.sh:
--------------------------------------------------------------------------------
1 | # Experiment u4_7B_gram: start a local Ray head node, then launch PPO training via `ray job submit`.
2 | exp_name=u4_7B_gram
3 | DATA_DIR="/home/rosmine/data2/rl/${exp_name}"
4 | OPENRLHF_DIR="/home/rosmine/projects/OpenRLHF"
5 |
6 | # Ray's temp dir is the same directory that later receives save_path/ and checkpoint/.
7 | ray start --head --port=6380 --node-ip-address=127.0.0.1 --temp-dir="${DATA_DIR}"
8 | echo "ray start"
9 |
10 | # Submit openrlhf.cli.train_ppo_ray to the job server at 127.0.0.1:8265 with OPENRLHF_DIR as working_dir.
11 | ray job submit --address="http://127.0.0.1:8265" \
12 | --runtime-env-json='{"working_dir": "'${OPENRLHF_DIR}'"}' \
13 | -- python3 -m openrlhf.cli.train_ppo_ray \
14 | --ref_num_nodes 1 \
15 | --ref_num_gpus_per_node 4 \
16 | --actor_num_nodes 1 \
17 | --actor_num_gpus_per_node 4 \
18 | --vllm_num_engines 1 \
19 | --vllm_tensor_parallel_size 2 \
20 | --vllm_gpu_memory_utilization 0.95 \
21 | --save_path ${DATA_DIR}/save_path \
22 | --ckpt_path ${DATA_DIR}/checkpoint \
23 | --pretrain Qwen/Qwen2.5-Coder-7B-Instruct \
24 | --save_steps 1 \
25 | --logging_steps 1 \
26 | --eval_steps -1 \
27 | --micro_train_batch_size 2 \
28 | --train_batch_size 16 \
29 | --micro_rollout_batch_size 4 \
30 | --rollout_batch_size 32 \
31 | --n_samples_per_prompt 16 \
32 | --rm_batch_size 1 \
33 | --max_epochs 2 \
34 | --prompt_max_len 2048 \
35 | --generate_max_len 2048 \
36 | --zero_stage 2 \
37 | --bf16 \
38 | --actor_learning_rate 1e-5 \
39 | --gradient_checkpointing \
40 | --init_kl_coef 0.0 \
41 | --apply_chat_template \
42 | --prompt_data /home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_with_unit_tests_v2_train_with_reference \
43 | --input_key prompt \
44 | --label_key verification_info \
45 | --max_samples 100000 \
46 | --normalize_reward \
47 | --load_checkpoint \
48 | --advantage_estimator group_norm \
49 | --remote_rm_url http://localhost:5432/get_reward \
50 | --use_tensorboard logs/${exp_name} \
51 | --vllm_sync_backend nccl \
52 | --enforce_eager \
53 | --save_hf_ckpt \
54 | --disable_ds_ckpt \
55 | --RM_CONFIG_save_threshold 60 \
56 | --RM_CONFIG_n_steps 100 \
57 | --RM_CONFIG_save_file /home/rosmine/data2/ray/${exp_name}/optimizer_code_output \
58 | --RM_CONFIG_aux_decay 1 \
59 | --RM_CONFIG_n_trials 10 \
60 | --RM_CONFIG_dfg_complexity_weight 0.5 \
61 | --RM_CONFIG_code_bleu_weight 0.5 \
62 | --RM_CONFIG_timeout_seconds 120 \
63 | --RM_CONFIG_aux_coef 0.0 \
64 | --RM_CONFIG_aux_coef_warmup 0 \
65 | --RM_CONFIG_useful_line_ratio_coef 0.0 \
66 | --RM_CONFIG_batch_size 128 \
67 | --RM_CONFIG_use_input_format_reward true \
68 | --RM_CONFIG_code_format "pi_verifiable" \
69 | --RM_CONFIG_max_time 10.0 \
70 | --RM_CONFIG_thinking_length_weight 0 \
71 | --entropy_coef 0.0 \
72 | --num_episodes 1 \
73 | --outlier_reward_filter -10.0 \
74 | --ring_attn_size 4 \
75 | --lora_alpha 128 \
76 | --lora_rank 64 \
77 | --packing_samples \
78 | --flash_attn \
79 | --adam_offload \
80 | --lr_warmup_ratio 0.001 \
81 | --grammar_file /home/rosmine/projects/rl_codegen/experiments/unit_test/grammar/unit_test_grammar.txt
82 | #--colocate_all_models \
83 | #--vllm_enable_sleep \
84 | #--deepspeed_enable_sleep \
85 | #--deepspeed_enable_super_sleep
86 | # Shut down Ray on this machine (the head node started at the top of this script) once the submit command returns.
87 | ray stop
88 | # --lora_alpha 128 \
89 | # --lora_rank 64
90 | #--pretrain $(pwd)/checkpoint/7B_sft_v2 \
91 | #--flash_attn \
92 | #--pretrain deepseek-ai/DeepSeek-R1-Distill-Qwen-14B \
93 | #--vllm_sync_with_ray \
94 | #--pretrain Qwen/Qwen2.5-Coder-0.5B-Instruct \
95 | # The commented-out flags above are unused alternatives kept for reference; nothing after `ray stop` is executed.
--------------------------------------------------------------------------------
/experiments/u5_7B_gram/run.sh:
--------------------------------------------------------------------------------
1 | # Experiment u5_7B_gram: start a local Ray head node, then launch PPO training via `ray job submit`.
2 | exp_name=u5_7B_gram
3 | DATA_DIR="/home/rosmine/data2/rl/${exp_name}"
4 | OPENRLHF_DIR="/home/rosmine/projects/OpenRLHF"
5 |
6 | # Ray's temp dir is the same directory that later receives save_path/ and checkpoint/.
7 | ray start --head --port=6380 --node-ip-address=127.0.0.1 --temp-dir="${DATA_DIR}"
8 | echo "ray start"
9 |
10 | # Submit openrlhf.cli.train_ppo_ray to the job server at 127.0.0.1:8265 with OPENRLHF_DIR as working_dir.
11 | ray job submit --address="http://127.0.0.1:8265" \
12 | --runtime-env-json='{"working_dir": "'${OPENRLHF_DIR}'"}' \
13 | -- python3 -m openrlhf.cli.train_ppo_ray \
14 | --ref_num_nodes 1 \
15 | --ref_num_gpus_per_node 4 \
16 | --actor_num_nodes 1 \
17 | --actor_num_gpus_per_node 4 \
18 | --vllm_num_engines 1 \
19 | --vllm_tensor_parallel_size 2 \
20 | --vllm_gpu_memory_utilization 0.95 \
21 | --save_path ${DATA_DIR}/save_path \
22 | --ckpt_path ${DATA_DIR}/checkpoint \
23 | --pretrain Qwen/Qwen2.5-Coder-7B-Instruct \
24 | --save_steps 1 \
25 | --logging_steps 1 \
26 | --eval_steps -1 \
27 | --micro_train_batch_size 2 \
28 | --train_batch_size 16 \
29 | --micro_rollout_batch_size 4 \
30 | --rollout_batch_size 32 \
31 | --n_samples_per_prompt 16 \
32 | --rm_batch_size 1 \
33 | --max_epochs 2 \
34 | --prompt_max_len 2048 \
35 | --generate_max_len 2048 \
36 | --zero_stage 2 \
37 | --bf16 \
38 | --actor_learning_rate 1e-5 \
39 | --gradient_checkpointing \
40 | --init_kl_coef 0.0 \
41 | --apply_chat_template \
42 | --prompt_data /home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_with_unit_tests_v2_train_with_reference \
43 | --input_key prompt \
44 | --label_key verification_info \
45 | --max_samples 100000 \
46 | --normalize_reward \
47 | --load_checkpoint \
48 | --advantage_estimator group_norm \
49 | --remote_rm_url http://localhost:5432/get_reward \
50 | --use_tensorboard logs/${exp_name} \
51 | --vllm_sync_backend nccl \
52 | --enforce_eager \
53 | --save_hf_ckpt \
54 | --disable_ds_ckpt \
55 | --RM_CONFIG_save_threshold 60 \
56 | --RM_CONFIG_n_steps 100 \
57 | --RM_CONFIG_save_file /home/rosmine/data2/ray/${exp_name}/optimizer_code_output \
58 | --RM_CONFIG_aux_decay 1 \
59 | --RM_CONFIG_n_trials 10 \
60 | --RM_CONFIG_dfg_complexity_weight 0.5 \
61 | --RM_CONFIG_code_bleu_weight 0.5 \
62 | --RM_CONFIG_timeout_seconds 120 \
63 | --RM_CONFIG_aux_coef 0.0 \
64 | --RM_CONFIG_aux_coef_warmup 0 \
65 | --RM_CONFIG_useful_line_ratio_coef 0.0 \
66 | --RM_CONFIG_batch_size 128 \
67 | --RM_CONFIG_use_input_format_reward true \
68 | --RM_CONFIG_code_format "pi_verifiable" \
69 | --RM_CONFIG_max_time 10.0 \
70 | --RM_CONFIG_thinking_length_weight 0 \
71 | --entropy_coef 0.0 \
72 | --num_episodes 1 \
73 | --outlier_reward_filter -10.0 \
74 | --ring_attn_size 4 \
75 | --lora_alpha 128 \
76 | --lora_rank 64 \
77 | --packing_samples \
78 | --flash_attn \
79 | --adam_offload \
80 | --lr_warmup_ratio 0.001 \
81 | --grammar_file /home/rosmine/projects/rl_codegen/experiments/unit_test/grammar/unit_test_grammar.txt
82 | #--colocate_all_models \
83 | #--vllm_enable_sleep \
84 | #--deepspeed_enable_sleep \
85 | #--deepspeed_enable_super_sleep
86 | # Shut down Ray on this machine (the head node started at the top of this script) once the submit command returns.
87 | ray stop
88 | # --lora_alpha 128 \
89 | # --lora_rank 64
90 | #--pretrain $(pwd)/checkpoint/7B_sft_v2 \
91 | #--flash_attn \
92 | #--pretrain deepseek-ai/DeepSeek-R1-Distill-Qwen-14B \
93 | #--vllm_sync_with_ray \
94 | #--pretrain Qwen/Qwen2.5-Coder-0.5B-Instruct \
95 | # The commented-out flags above are unused alternatives kept for reference; nothing after `ray stop` is executed.
--------------------------------------------------------------------------------
/experiments/u6_colo/run.sh:
--------------------------------------------------------------------------------
1 | # Experiment u6_colo: start a local Ray head node, then launch PPO training via `ray job submit`.
2 | exp_name=u6_colo
3 | DATA_DIR="/home/rosmine/data2/rl/${exp_name}"
4 | OPENRLHF_DIR="/home/rosmine/projects/OpenRLHF"
5 |
6 | # Ray's temp dir is the same directory that later receives save_path/ and checkpoint/.
7 | ray start --head --port=6380 --node-ip-address=127.0.0.1 --temp-dir="${DATA_DIR}"
8 | echo "ray start"
9 | # NOTE(review): the job below passes --ring_attn_size 4 with 6 actor GPUs; confirm 6 need not be a multiple of 4.
10 | # Submit openrlhf.cli.train_ppo_ray to the job server at 127.0.0.1:8265 with OPENRLHF_DIR as working_dir.
11 | ray job submit --address="http://127.0.0.1:8265" \
12 | --runtime-env-json='{"working_dir": "'${OPENRLHF_DIR}'"}' \
13 | -- python3 -m openrlhf.cli.train_ppo_ray \
14 | --ref_num_nodes 1 \
15 | --ref_num_gpus_per_node 6 \
16 | --actor_num_nodes 1 \
17 | --actor_num_gpus_per_node 6 \
18 | --vllm_num_engines 6 \
19 | --vllm_tensor_parallel_size 1 \
20 | --vllm_gpu_memory_utilization 0.7 \
21 | --save_path ${DATA_DIR}/save_path \
22 | --ckpt_path ${DATA_DIR}/checkpoint \
23 | --pretrain Qwen/Qwen2.5-Coder-7B-Instruct \
24 | --save_steps 1 \
25 | --logging_steps 1 \
26 | --eval_steps -1 \
27 | --micro_train_batch_size 2 \
28 | --train_batch_size 12 \
29 | --micro_rollout_batch_size 6 \
30 | --rollout_batch_size 6 \
31 | --n_samples_per_prompt 2 \
32 | --rm_batch_size 1 \
33 | --max_epochs 2 \
34 | --prompt_max_len 2048 \
35 | --generate_max_len 1024 \
36 | --zero_stage 2 \
37 | --bf16 \
38 | --actor_learning_rate 1e-5 \
39 | --gradient_checkpointing \
40 | --init_kl_coef 0.0 \
41 | --apply_chat_template \
42 | --prompt_data /home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_with_unit_tests_v2_train_with_reference_no_examples \
43 | --input_key prompt \
44 | --label_key verification_info \
45 | --max_samples 100000 \
46 | --normalize_reward \
47 | --load_checkpoint \
48 | --advantage_estimator group_norm \
49 | --remote_rm_url http://localhost:5432/get_reward \
50 | --use_tensorboard logs/${exp_name} \
51 | --vllm_sync_backend nccl \
52 | --enforce_eager \
53 | --save_hf_ckpt \
54 | --disable_ds_ckpt \
55 | --RM_CONFIG_save_threshold 60 \
56 | --RM_CONFIG_n_steps 100 \
57 | --RM_CONFIG_save_file /home/rosmine/data2/ray/${exp_name}/optimizer_code_output \
58 | --RM_CONFIG_aux_decay 1 \
59 | --RM_CONFIG_n_trials 10 \
60 | --RM_CONFIG_dfg_complexity_weight 0.5 \
61 | --RM_CONFIG_code_bleu_weight 0.5 \
62 | --RM_CONFIG_timeout_seconds 120 \
63 | --RM_CONFIG_aux_coef 0.0 \
64 | --RM_CONFIG_aux_coef_warmup 0 \
65 | --RM_CONFIG_useful_line_ratio_coef 0.0 \
66 | --RM_CONFIG_batch_size 128 \
67 | --RM_CONFIG_use_input_format_reward true \
68 | --RM_CONFIG_code_format "pi_verifiable" \
69 | --RM_CONFIG_max_time 10.0 \
70 | --RM_CONFIG_thinking_length_weight 0 \
71 | --entropy_coef 0.0 \
72 | --num_episodes 1 \
73 | --outlier_reward_filter -10.0 \
74 | --ring_attn_size 4 \
75 | --lora_alpha 128 \
76 | --lora_rank 64 \
77 | --packing_samples \
78 | --flash_attn \
79 | --adam_offload \
80 | --lr_warmup_ratio 0.001 \
81 | --colocate_all_models \
82 | --vllm_enable_sleep \
83 | --deepspeed_enable_sleep \
84 | #--grammar_file /home/rosmine/projects/rl_codegen/experiments/unit_test/grammar/unit_test_grammar.txt
85 | #--deepspeed_enable_super_sleep
86 | # Shut down Ray on this machine (the head node started at the top of this script) once the submit command returns.
87 | ray stop
88 | # --lora_alpha 128 \
89 | # --lora_rank 64
90 | #--pretrain $(pwd)/checkpoint/7B_sft_v2 \
91 | #--flash_attn \
92 | #--pretrain deepseek-ai/DeepSeek-R1-Distill-Qwen-14B \
93 | #--vllm_sync_with_ray \
94 | #--pretrain Qwen/Qwen2.5-Coder-0.5B-Instruct \
95 | # The commented-out flags above are unused alternatives kept for reference; nothing after `ray stop` is executed.
--------------------------------------------------------------------------------
/experiments/u6_7B_form/run.sh:
--------------------------------------------------------------------------------
1 | # Experiment u6_7B_form: start a local Ray head node, then launch PPO training via `ray job submit`.
2 | exp_name=u6_7B_form
3 | DATA_DIR="/home/rosmine/data2/rl/${exp_name}"
4 | OPENRLHF_DIR="/home/rosmine/projects/OpenRLHF"
5 |
6 | # Ray's temp dir is the same directory that later receives save_path/ and checkpoint/.
7 | ray start --head --port=6380 --node-ip-address=127.0.0.1 --temp-dir="${DATA_DIR}"
8 | echo "ray start"
9 |
10 | # Submit openrlhf.cli.train_ppo_ray to the job server at 127.0.0.1:8265 with OPENRLHF_DIR as working_dir.
11 | ray job submit --address="http://127.0.0.1:8265" \
12 | --runtime-env-json='{"working_dir": "'${OPENRLHF_DIR}'"}' \
13 | -- python3 -m openrlhf.cli.train_ppo_ray \
14 | --ref_num_nodes 1 \
15 | --ref_num_gpus_per_node 4 \
16 | --actor_num_nodes 1 \
17 | --actor_num_gpus_per_node 4 \
18 | --vllm_num_engines 1 \
19 | --vllm_tensor_parallel_size 2 \
20 | --vllm_gpu_memory_utilization 0.95 \
21 | --save_path ${DATA_DIR}/save_path \
22 | --ckpt_path ${DATA_DIR}/checkpoint \
23 | --pretrain Qwen/Qwen2.5-Coder-7B-Instruct \
24 | --save_steps 1 \
25 | --logging_steps 1 \
26 | --eval_steps -1 \
27 | --micro_train_batch_size 2 \
28 | --train_batch_size 16 \
29 | --micro_rollout_batch_size 4 \
30 | --rollout_batch_size 32 \
31 | --n_samples_per_prompt 16 \
32 | --rm_batch_size 1 \
33 | --max_epochs 2 \
34 | --prompt_max_len 2048 \
35 | --generate_max_len 2048 \
36 | --zero_stage 2 \
37 | --bf16 \
38 | --actor_learning_rate 1e-5 \
39 | --gradient_checkpointing \
40 | --init_kl_coef 0.0 \
41 | --apply_chat_template \
42 | --prompt_data /home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_with_unit_tests_v2_train_with_reference_no_examples \
43 | --input_key prompt \
44 | --label_key verification_info \
45 | --max_samples 100000 \
46 | --normalize_reward \
47 | --load_checkpoint \
48 | --advantage_estimator group_norm \
49 | --remote_rm_url http://localhost:5432/get_reward \
50 | --use_tensorboard logs/${exp_name} \
51 | --vllm_sync_backend nccl \
52 | --enforce_eager \
53 | --save_hf_ckpt \
54 | --disable_ds_ckpt \
55 | --RM_CONFIG_save_threshold 60 \
56 | --RM_CONFIG_n_steps 100 \
57 | --RM_CONFIG_save_file /home/rosmine/data2/ray/${exp_name}/optimizer_code_output \
58 | --RM_CONFIG_aux_decay 1 \
59 | --RM_CONFIG_n_trials 10 \
60 | --RM_CONFIG_dfg_complexity_weight 0.5 \
61 | --RM_CONFIG_code_bleu_weight 0.5 \
62 | --RM_CONFIG_timeout_seconds 120 \
63 | --RM_CONFIG_aux_coef 0.0 \
64 | --RM_CONFIG_aux_coef_warmup 0 \
65 | --RM_CONFIG_useful_line_ratio_coef 0.0 \
66 | --RM_CONFIG_batch_size 128 \
67 | --RM_CONFIG_use_input_format_reward true \
68 | --RM_CONFIG_code_format "pi_verifiable" \
69 | --RM_CONFIG_max_time 10.0 \
70 | --RM_CONFIG_thinking_length_weight 0 \
71 | --entropy_coef 0.0 \
72 | --num_episodes 1 \
73 | --outlier_reward_filter -10.0 \
74 | --ring_attn_size 4 \
75 | --lora_alpha 128 \
76 | --lora_rank 64 \
77 | --packing_samples \
78 | --flash_attn \
79 | --adam_offload \
80 | --lr_warmup_ratio 0.001 \
81 | #--grammar_file /home/rosmine/projects/rl_codegen/experiments/unit_test/grammar/unit_test_grammar.txt
82 | #--colocate_all_models \
83 | #--vllm_enable_sleep \
84 | #--deepspeed_enable_sleep \
85 | #--deepspeed_enable_super_sleep
86 | # Shut down Ray on this machine (the head node started at the top of this script) once the submit command returns.
87 | ray stop
88 | # --lora_alpha 128 \
89 | # --lora_rank 64
90 | #--pretrain $(pwd)/checkpoint/7B_sft_v2 \
91 | #--flash_attn \
92 | #--pretrain deepseek-ai/DeepSeek-R1-Distill-Qwen-14B \
93 | #--vllm_sync_with_ray \
94 | #--pretrain Qwen/Qwen2.5-Coder-0.5B-Instruct \
95 | # The commented-out flags above are unused alternatives kept for reference; nothing after `ray stop` is executed.
--------------------------------------------------------------------------------
/experiments/u6_7B_gram/run.sh:
--------------------------------------------------------------------------------
1 | # Experiment u6_7B_gram: start a local Ray head node, then launch PPO training via `ray job submit`.
2 | exp_name=u6_7B_gram
3 | DATA_DIR="/home/rosmine/data2/rl/${exp_name}"
4 | OPENRLHF_DIR="/home/rosmine/projects/OpenRLHF"
5 |
6 | # Ray's temp dir is the same directory that later receives save_path/ and checkpoint/.
7 | ray start --head --port=6380 --node-ip-address=127.0.0.1 --temp-dir="${DATA_DIR}"
8 | echo "ray start"
9 |
10 | # Submit openrlhf.cli.train_ppo_ray to the job server at 127.0.0.1:8265 with OPENRLHF_DIR as working_dir.
11 | ray job submit --address="http://127.0.0.1:8265" \
12 | --runtime-env-json='{"working_dir": "'${OPENRLHF_DIR}'"}' \
13 | -- python3 -m openrlhf.cli.train_ppo_ray \
14 | --ref_num_nodes 1 \
15 | --ref_num_gpus_per_node 4 \
16 | --actor_num_nodes 1 \
17 | --actor_num_gpus_per_node 4 \
18 | --vllm_num_engines 1 \
19 | --vllm_tensor_parallel_size 2 \
20 | --vllm_gpu_memory_utilization 0.95 \
21 | --save_path ${DATA_DIR}/save_path \
22 | --ckpt_path ${DATA_DIR}/checkpoint \
23 | --pretrain Qwen/Qwen2.5-Coder-7B-Instruct \
24 | --save_steps 1 \
25 | --logging_steps 1 \
26 | --eval_steps -1 \
27 | --micro_train_batch_size 2 \
28 | --train_batch_size 16 \
29 | --micro_rollout_batch_size 4 \
30 | --rollout_batch_size 32 \
31 | --n_samples_per_prompt 16 \
32 | --rm_batch_size 1 \
33 | --max_epochs 2 \
34 | --prompt_max_len 2048 \
35 | --generate_max_len 2048 \
36 | --zero_stage 2 \
37 | --bf16 \
38 | --actor_learning_rate 1e-5 \
39 | --gradient_checkpointing \
40 | --init_kl_coef 0.0 \
41 | --apply_chat_template \
42 | --prompt_data /home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_with_unit_tests_v2_train_with_reference_no_examples \
43 | --input_key prompt \
44 | --label_key verification_info \
45 | --max_samples 100000 \
46 | --normalize_reward \
47 | --load_checkpoint \
48 | --advantage_estimator group_norm \
49 | --remote_rm_url http://localhost:5432/get_reward \
50 | --use_tensorboard logs/${exp_name} \
51 | --vllm_sync_backend nccl \
52 | --enforce_eager \
53 | --save_hf_ckpt \
54 | --disable_ds_ckpt \
55 | --RM_CONFIG_save_threshold 60 \
56 | --RM_CONFIG_n_steps 100 \
57 | --RM_CONFIG_save_file /home/rosmine/data2/ray/${exp_name}/optimizer_code_output \
58 | --RM_CONFIG_aux_decay 1 \
59 | --RM_CONFIG_n_trials 10 \
60 | --RM_CONFIG_dfg_complexity_weight 0.5 \
61 | --RM_CONFIG_code_bleu_weight 0.5 \
62 | --RM_CONFIG_timeout_seconds 120 \
63 | --RM_CONFIG_aux_coef 0.0 \
64 | --RM_CONFIG_aux_coef_warmup 0 \
65 | --RM_CONFIG_useful_line_ratio_coef 0.0 \
66 | --RM_CONFIG_batch_size 128 \
67 | --RM_CONFIG_use_input_format_reward true \
68 | --RM_CONFIG_code_format "pi_verifiable" \
69 | --RM_CONFIG_max_time 10.0 \
70 | --RM_CONFIG_thinking_length_weight 0 \
71 | --entropy_coef 0.0 \
72 | --num_episodes 1 \
73 | --outlier_reward_filter -10.0 \
74 | --ring_attn_size 4 \
75 | --lora_alpha 128 \
76 | --lora_rank 64 \
77 | --packing_samples \
78 | --flash_attn \
79 | --adam_offload \
80 | --lr_warmup_ratio 0.001 \
81 | --grammar_file /home/rosmine/projects/rl_codegen/experiments/unit_test/grammar/unit_test_grammar.txt
82 | #--colocate_all_models \
83 | #--vllm_enable_sleep \
84 | #--deepspeed_enable_sleep \
85 | #--deepspeed_enable_super_sleep
86 |
87 | ray stop
88 | # --lora_alpha 128 \
89 | # --lora_rank 64
90 | #--pretrain $(pwd)/checkpoint/7B_sft_v2 \
91 | #--flash_attn \
92 | #--pretrain deepseek-ai/DeepSeek-R1-Distill-Qwen-14B \
93 | #--vllm_sync_with_ray \
94 | #--pretrain Qwen/Qwen2.5-Coder-0.5B-Instruct \
95 |
--------------------------------------------------------------------------------
/unit_tests_server/verl_unit.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import json
3 | import pdb
4 | from datasets import load_from_disk
5 | from transformers import AutoTokenizer
6 | import concurrent.futures
7 | import pdb
8 |
def hf_reward_fn(prompts, completions, **kwargs):
    """Score a batch of completions by querying the reward server in parallel.

    One request is built per completion and sent through ``make_api_call``
    from a thread pool; the first entry of each response's ``rewards`` list
    is collected, in the same order as ``completions``.

    Args:
        prompts: Sequence of prompts, indexed in parallel with ``completions``.
        completions: Sequence of completions to score.
        **kwargs: Must contain ``verification_info`` (indexed in parallel
            with ``completions``). The whole mapping is also forwarded to the
            server as ``extra_info``.

    Returns:
        list: One reward per completion.

    Raises:
        KeyError: If ``verification_info`` is missing from ``kwargs``.
        RuntimeError: If the reward server call fails, i.e. ``make_api_call``
            returned ``False``. Previously this surfaced as an opaque
            ``TypeError`` from subscripting ``False``.
    """
    verification_info = kwargs["verification_info"]
    url = "http://0.0.0.0:5431/get_reward"

    # One single-item payload per completion so requests can run concurrently.
    payloads = [
        {
            "query": [completion],
            "prompts": [prompts[i]],
            "labels": [verification_info[i]],
            "step": 2,
            "extra_info": kwargs,
            "reward_config": {"n_steps": 200,
                              "target_precision": 0.01,
                              "warmup_steps": 10000,
                              "max_time": 5,
                              "code_format":"pi_verifiable",
                              "thinking_length_weight": 0.0,
                              "use_input_format_reward": True}
        }
        for i, completion in enumerate(completions)
    ]

    def process_request(payload):
        result = make_api_call(payload, url=url)
        # make_api_call signals an HTTP failure by returning False.
        if result is False:
            raise RuntimeError(f"Reward server call to {url} failed; see the log output above")
        return result['rewards'][0]

    # executor.map preserves input order, so rewards line up with completions.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        rewards = list(executor.map(process_request, payloads))

    print(f"returning rewards: {rewards}")
    return rewards
45 |
def reward_fn(data_source, solution_str, ground_truth, extra_info=None):
    """Send a single solution to the reward server and return its raw reply.

    Args:
        data_source (str): Sent to the server in the ``prompts`` field.
        solution_str (str): Sent to the server in the ``query`` field.
        ground_truth (str): Sent to the server in the ``labels`` field.
        extra_info (dict): Forwarded unchanged as ``extra_info``.

    Returns:
        Whatever ``make_api_call`` returns for the request.
    """
    reward_settings = {"n_steps": 200,
                       "target_precision": 0.01,
                       "warmup_steps": 10000,
                       "max_time": 5,
                       "code_format":"pi_verifiable",
                       "thinking_length_weight": 0.0,
                       "use_input_format_reward": True}

    request_body = {
        "query": [solution_str],
        "prompts": [data_source],
        "labels": [ground_truth],
        "step": 2,
        "extra_info": extra_info,
        "reward_config": reward_settings,
    }

    return make_api_call(request_body, url="http://0.0.0.0:5431/get_reward")
71 |
72 |
def make_api_call(payload, url="http://0.0.0.0:5000/get_reward", timeout=None):
    """POST ``payload`` as JSON to ``url`` and return the decoded JSON reply.

    Args:
        payload: JSON-serialisable request body.
        url (str): Endpoint to POST to.
        timeout: Optional timeout forwarded to ``requests.post``. The default
            ``None`` keeps the original behaviour of waiting indefinitely;
            pass a number of seconds to fail fast on an unresponsive server.

    Returns:
        The parsed JSON response when the status code is 200, otherwise
        ``False`` (after printing the status code and response text).
    """
    headers = {
        "Content-Type": "application/json"
    }
    response = requests.post(url, data=json.dumps(payload), headers=headers, timeout=timeout)

    if response.status_code == 200:
        return response.json()

    print(f"API call failed with status code {response.status_code}. Response:")
    print(response.text)
    return False
87 |
if __name__ == "__main__":
    # Manual smoke test: render a tiny program plus a unit test through the
    # chat template and post it to a reward server on localhost:5432.

    correct_code = r"""
a = int(input())
b = int(input())
print(a + b)
"""

    incorrect_code = r"""
a = int(input())
b = int(input())
print(a * b)
"""

    unit_test = r"""1
2
"""

    unit_test_correct_output = r"""3
"""

    unit_test_incorrect_output = r"""2
"""

    # Trailing comments are the author's annotation for each combination.
    pairs = [
        # (correct_code, unit_test, unit_test_correct_output), # 1
        # (incorrect_code, unit_test, unit_test_correct_output), # 0
        (incorrect_code, unit_test, unit_test_incorrect_output), # -1
    ]

    labels = "{'test_cases': [{'type': 'stdin_stdout', 'input': '1\n2', 'output': '3'}]}"

    # Loop-invariant setup: load the tokenizer once rather than per pair.
    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Coder-0.5B-Instruct")
    url = "http://0.0.0.0:5432/get_reward"

    for code, unit_test, unit_test_output in pairs:
        unit_tests = f"{unit_test}{unit_test_output}"
        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "solve plz"},
            {"role": "assistant", "content": f"{code}\n{unit_tests}"}
        ]

        inputs = tokenizer.apply_chat_template(messages, tokenize=False)

        payload = {
            "query": [inputs],
            "prompts": ["asdf"],
            "labels": [labels],
            "step": 2,
            "reward_config": {
                "n_steps": 200,
                "target_precision": 0.01,
                "warmup_steps": 10000,
                "max_time": 100000,
                "code_format":"pi_verifiable",
                "thinking_length_weight": 0.0
            }
        }

        # Show the server's reply; otherwise a successful run prints nothing.
        response = make_api_call(payload, url=url)
        print(f"reward server response: {response}")
152 |
153 |
--------------------------------------------------------------------------------
/unit_tests_server/train_unsloth.py:
--------------------------------------------------------------------------------
1 | from unsloth import FastLanguageModel
2 | import torch
3 | import yaml
4 | from datasets import load_dataset, load_from_disk
5 | from trl import GRPOConfig, GRPOTrainer
6 |
7 | import argparse
8 |
9 | import os
10 | import importlib.util
11 | import sys
12 | import pdb
13 |
14 | from unsloth_train_patch import train
15 |
def load_and_call_function(file_path, function_name, *args, **kwargs):
    """
    Loads a Python file from the given path and returns the named function.

    Despite its name, this helper does NOT call the function: it returns the
    function object so the caller can invoke it (or hand it to a trainer).

    Args:
        file_path (str): Path to the Python file
        function_name (str): Name of the attribute to look up in the module
        *args: Accepted for backward compatibility; currently unused
        **kwargs: Accepted for backward compatibility; currently unused

    Returns:
        The attribute named ``function_name`` from the loaded module

    Raises:
        FileNotFoundError: If ``file_path`` does not exist
        ImportError: If no import spec/loader can be built for the file
        AttributeError: If the module has no attribute ``function_name``
    """
    abs_path = os.path.abspath(file_path)

    if not os.path.exists(abs_path):
        raise FileNotFoundError(f"File not found: {abs_path}")

    # The module is registered under the file's base name (without extension).
    module_name = os.path.splitext(os.path.basename(abs_path))[0]

    spec = importlib.util.spec_from_file_location(module_name, abs_path)
    if spec is None or spec.loader is None:
        raise ImportError(f"Could not load spec for module at {abs_path}")

    module = importlib.util.module_from_spec(spec)

    # Register before executing so the module can be found by name while it
    # runs, but roll the registration back if execution fails so a
    # half-initialised module is not left behind in sys.modules.
    previous = sys.modules.get(module_name)
    sys.modules[module_name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        if previous is None:
            sys.modules.pop(module_name, None)
        else:
            sys.modules[module_name] = previous
        raise

    if not hasattr(module, function_name):
        raise AttributeError(f"Function '{function_name}' not found in {abs_path}")

    return getattr(module, function_name)
62 |
def parse_args():
    """Parse the command line for the training script.

    Returns:
        argparse.Namespace: Namespace whose ``config`` attribute holds the
        path given via ``--config`` / ``-c``.

    The option is required: the script opens ``args.config`` immediately, so
    a missing value now yields a usage error from argparse instead of a
    ``TypeError`` from ``open(None)``.
    """
    parser = argparse.ArgumentParser(description="Training script for Unsloth model")
    parser.add_argument("--config", '-c', type=str, required=True, help="Path to configuration file")
    return parser.parse_args()
67 |
def main(args: argparse.Namespace):
    """Run GRPO fine-tuning of a LoRA adapter using the settings on ``args``.

    Loads the base model and tokenizer, wraps the model with a PEFT/LoRA
    adapter, loads the dataset from disk, builds the GRPO configuration,
    resolves the reward function from a file path, trains, and finally saves
    the model to ``args.output_dir``.

    Args:
        args: Namespace carrying the attributes read below (model_name,
            max_seq_length, load_in_4bit, load_in_8bit, lora_rank,
            lora_alpha, dataset_path, learning_rate,
            per_device_train_batch_size, gradient_accumulation_steps,
            num_generations, max_prompt_length, max_steps, save_steps,
            output_dir, reward_function_path, reward_function_name).
    """
    base_model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=args.model_name,
        max_seq_length=args.max_seq_length,
        load_in_4bit=args.load_in_4bit,  # False for LoRA 16bit
        load_in_8bit=args.load_in_8bit,  # False for LoRA 16bit
        fast_inference=True,  # Enable vLLM fast inference
        max_lora_rank=args.lora_rank,
        gpu_memory_utilization=0.6,  # Reduce if out of memory
    )

    # Projection layers that receive LoRA adapters (remove QKVO if out of memory).
    lora_targets = [
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ]
    model = FastLanguageModel.get_peft_model(
        base_model,
        r=args.lora_rank,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
        target_modules=lora_targets,
        lora_alpha=args.lora_alpha,
        use_gradient_checkpointing="unsloth",  # Enable long context finetuning
        random_state=3407,
    )

    dataset = load_from_disk(args.dataset_path)

    grpo_settings = dict(
        learning_rate=args.learning_rate,
        optim="paged_adamw_8bit",
        logging_steps=1,
        per_device_train_batch_size=args.per_device_train_batch_size,
        gradient_accumulation_steps=args.gradient_accumulation_steps,  # Increase to 4 for smoother training
        num_generations=args.num_generations,  # Decrease if out of memory
        max_prompt_length=args.max_prompt_length,
        # The completion gets whatever sequence budget the prompt leaves over.
        max_completion_length=args.max_seq_length - args.max_prompt_length,
        max_steps=args.max_steps,
        save_steps=args.save_steps,
        report_to="tensorboard",  # Can use Weights & Biases
        output_dir=args.output_dir,
        use_vllm=True,
        beta=0.001,
    )
    training_args = GRPOConfig(**grpo_settings)

    reward_function = load_and_call_function(args.reward_function_path, args.reward_function_name)

    trainer = GRPOTrainer(
        model=model,
        processing_class=tokenizer,
        reward_funcs=[reward_function],
        args=training_args,
        train_dataset=dataset,
    )

    # The patched loop from unsloth_train_patch (``train(trainer)``) is not used here.
    trainer.train()

    model.save_pretrained(args.output_dir)
139 |
if __name__ == "__main__":
    args = parse_args()
    with open(args.config, "r") as f:
        config = yaml.safe_load(f)
    # An empty file parses to None and a top-level list/scalar has no
    # .items(); fail with a clear message instead of an AttributeError.
    if not isinstance(config, dict):
        raise ValueError(
            f"Configuration file {args.config} must contain a top-level mapping, "
            f"got {type(config).__name__}"
        )
    # Every top-level key of the YAML file becomes an attribute on args.
    for key, value in config.items():
        setattr(args, key, value)
    main(args)
--------------------------------------------------------------------------------
/unit_tests_server/convert_dataset_to_chat.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 5,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "name": "stderr",
10 | "output_type": "stream",
11 | "text": [
12 | "Map: 0%| | 0/6065 [00:00, ? examples/s]"
13 | ]
14 | },
15 | {
16 | "name": "stderr",
17 | "output_type": "stream",
18 | "text": [
19 | "Map: 100%|██████████| 6065/6065 [00:01<00:00, 3500.88 examples/s]\n",
20 | "Saving the dataset (3/3 shards): 100%|██████████| 6065/6065 [00:01<00:00, 4524.54 examples/s]\n"
21 | ]
22 | }
23 | ],
24 | "source": [
25 | "from datasets import load_from_disk\n",
26 | "\n",
27 | "dataset_name = \"/home/rosmine/data2/rl_codegen/datasets/pi_verifiable_no_fn_call_with_unit_tests_v2_train_with_reference_no_examples\"\n",
28 | "\n",
29 | "system_prompt = \"You are Qwen, created by Alibaba Cloud. You are a helpful assistant.\"\n",
30 | "\n",
31 | "dataset = load_from_disk(dataset_name)\n",
32 | "\n",
33 | "def convert_dataset_to_chat(example):\n",
34 | " example['prompt'] = [{'role': 'system', 'content': system_prompt},\n",
35 | " {'role': 'user', 'content': example['prompt']}]\n",
36 | " return example\n",
37 | "\n",
38 | "dataset = dataset.map(convert_dataset_to_chat)\n",
39 | "\n",
40 | "dataset.save_to_disk(dataset_name + \"_chat\")\n",
41 | "\n",
42 | "\n"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": 4,
48 | "metadata": {},
49 | "outputs": [
50 | {
51 | "data": {
52 | "text/plain": [
53 | "{'messages': [{'content': 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.',\n",
54 | " 'role': 'system'},\n",
55 | " {'content': \"Solve the following coding problem using the programming language python:\\n\\nGalois is one of the strongest chess players of Byteforces. He has even invented a new variant of chess, which he named «PawnChess».\\n\\nThis new game is played on a board consisting of 8 rows and 8 columns. At the beginning of every game some black and white pawns are placed on the board. The number of black pawns placed is not necessarily equal to the number of white pawns placed. \\n\\n\\n\\nLets enumerate rows and columns with integers from 1 to 8. Rows are numbered from top to bottom, while columns are numbered from left to right. Now we denote as (r, c) the cell located at the row r and at the column c.\\n\\nThere are always two players A and B playing the game. Player A plays with white pawns, while player B plays with black ones. The goal of player A is to put any of his pawns to the row 1, while player B tries to put any of his pawns to the row 8. As soon as any of the players completes his goal the game finishes immediately and the succeeded player is declared a winner.\\n\\nPlayer A moves first and then they alternate turns. On his move player A must choose exactly one white pawn and move it one step upward and player B (at his turn) must choose exactly one black pawn and move it one step down. Any move is possible only if the targeted cell is empty. It's guaranteed that for any scenario of the game there will always be at least one move available for any of the players.\\n\\nMoving upward means that the pawn located in (r, c) will go to the cell (r - 1, c), while moving down means the pawn located in (r, c) will go to the cell (r + 1, c). Again, the corresponding cell must be empty, i.e. not occupied by any other pawn of any color.\\n\\nGiven the initial disposition of the board, determine who wins the game if both players play optimally. 
Note that there will always be a winner due to the restriction that for any game scenario both players will have some moves available.\\n\\nInput\\n\\nThe input consists of the board description given in eight lines, each line contains eight characters. Character 'B' is used to denote a black pawn, and character 'W' represents a white pawn. Empty cell is marked with '.'. \\n\\nIt's guaranteed that there will not be white pawns on the first row neither black pawns on the last row.\\n\\nOutput\\n\\nPrint 'A' if player A wins the game on the given board, and 'B' if player B will claim the victory. Again, it's guaranteed that there will always be a winner on the given board.\\nNote\\n\\nIn the first sample player A is able to complete his goal in 3 steps by always moving a pawn initially located at (4, 5). Player B needs at least 5 steps for any of his pawns to reach the row 8. Hence, player A will be the winner.\\n\\nThe input will be given via stdin and the output should be printed to stdout by your code.\\n\\nNow solve the problem by providing the code.\\nReturn the code within ```python ... ``` markup. Write unit tests for the code with the standard input/output like this: (stdin)(stdout) tags. If you make multiple unit tests, use the ... ... tags to write multiple unit tests one after another.\",\n",
56 | " 'role': 'user'}]}"
57 | ]
58 | },
59 | "execution_count": 4,
60 | "metadata": {},
61 | "output_type": "execute_result"
62 | }
63 | ],
64 | "source": [
65 | "dataset[0]['prompt']"
66 | ]
67 | }
68 | ],
69 | "metadata": {
70 | "kernelspec": {
71 | "display_name": "transformers",
72 | "language": "python",
73 | "name": "python3"
74 | },
75 | "language_info": {
76 | "codemirror_mode": {
77 | "name": "ipython",
78 | "version": 3
79 | },
80 | "file_extension": ".py",
81 | "mimetype": "text/x-python",
82 | "name": "python",
83 | "nbconvert_exporter": "python",
84 | "pygments_lexer": "ipython3",
85 | "version": "3.11.0"
86 | }
87 | },
88 | "nbformat": 4,
89 | "nbformat_minor": 2
90 | }
91 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | absl-py==2.1.0
2 | accelerate==1.3.0
3 | aiohappyeyeballs==2.6.1
4 | aiohttp==3.11.18
5 | aiohttp-cors==0.7.0
6 | aiosignal==1.3.2
7 | airportsdata==20241001
8 | annotated-types==0.7.0
9 | anyio==4.9.0
10 | argon2-cffi==23.1.0
11 | argon2-cffi-bindings==21.2.0
12 | arrow==1.3.0
13 | astor==0.8.1
14 | asttokens==3.0.0
15 | async-lru==2.0.4
16 | attrs==25.3.0
17 | babel==2.17.0
18 | beautifulsoup4==4.13.3
19 | bitsandbytes==0.45.2
20 | blake3==1.0.4
21 | bleach==6.2.0
22 | cachetools==5.5.1
23 | cairocffi==1.7.1
24 | CairoSVG==2.7.1
25 | certifi==2025.1.31
26 | cffi==1.17.1
27 | charset-normalizer==3.4.1
28 | click==8.1.8
29 | -e git+https://github.com/openai/CLIP.git@dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1#egg=clip
30 | cloudpickle==3.1.1
31 | cmake==4.0.0
32 | colorful==0.5.6
33 | comm==0.2.2
34 | compressed-tensors==0.9.1
35 | contourpy==1.3.1
36 | cssselect2==0.8.0
37 | cupy-cuda12x==13.3.0
38 | cycler==0.12.1
39 | datasets==3.5.0
40 | debugpy==1.8.12
41 | decorator==5.1.1
42 | deepspeed==0.16.7
43 | deepspeed-kernels==0.0.1.dev1698255861
44 | defusedxml==0.7.1
45 | Deprecated==1.2.18
46 | depyf==0.18.0
47 | dill==0.3.8
48 | diskcache==5.6.3
49 | distlib==0.3.9
50 | distro==1.9.0
51 | dnspython==2.7.0
52 | docker-pycreds==0.4.0
53 | einops==0.8.1
54 | email_validator==2.2.0
55 | executing==2.2.0
56 | fastapi==0.115.12
57 | fastapi-cli==0.0.7
58 | fastjsonschema==2.21.1
59 | fastrlock==0.8.3
60 | filelock==3.18.0
61 | flash-attn==2.7.0.post2
62 | fonttools==4.56.0
63 | fqdn==1.5.1
64 | frozenlist==1.6.0
65 | fsspec==2024.12.0
66 | ftfy==6.3.1
67 | gguf==0.10.0
68 | gitdb==4.0.12
69 | GitPython==3.1.44
70 | google-api-core==2.24.1
71 | google-auth==2.38.0
72 | googleapis-common-protos==1.67.0
73 | grpcio==1.70.0
74 | h11==0.14.0
75 | hf-xet==1.0.3
76 | hjson==3.1.0
77 | httpcore==1.0.8
78 | httptools==0.6.4
79 | httpx==0.28.1
80 | huggingface-hub==0.30.2
81 | idna==3.10
82 | importlib_metadata==8.0.0
83 | iniconfig==2.0.0
84 | interegular==0.3.3
85 | ipykernel==6.29.5
86 | ipython==8.32.0
87 | ipywidgets==8.1.5
88 | isoduration==20.11.0
89 | isort==6.0.0
90 | jedi==0.19.2
91 | Jinja2==3.1.6
92 | jiter==0.9.0
93 | joblib==1.4.2
94 | json5==0.10.0
95 | jsonlines==4.0.0
96 | jsonpointer==3.0.0
97 | jsonschema==4.23.0
98 | jsonschema-specifications==2025.4.1
99 | jupyter==1.1.1
100 | jupyter-console==6.6.3
101 | jupyter-events==0.12.0
102 | jupyter-lsp==2.2.5
103 | jupyter_client==8.6.3
104 | jupyter_core==5.7.2
105 | jupyter_server==2.15.0
106 | jupyter_server_terminals==0.5.3
107 | jupyterlab==4.3.5
108 | jupyterlab_pygments==0.3.0
109 | jupyterlab_server==2.27.3
110 | jupyterlab_widgets==3.0.13
111 | kiwisolver==1.4.8
112 | lark==1.2.2
113 | liger_kernel==0.5.6
114 | lightning-utilities==0.12.0
115 | llguidance==0.7.13
116 | llvmlite==0.43.0
117 | lm-format-enforcer==0.10.11
118 | loralib==0.1.2
119 | Markdown==3.7
120 | markdown-it-py==3.0.0
121 | MarkupSafe==3.0.2
122 | matplotlib==3.10.0
123 | matplotlib-inline==0.1.7
124 | mdurl==0.1.2
125 | mistral_common==1.5.4
126 | mistune==3.1.2
127 | mnist1d==0.0.2.post1
128 | mpi4py==4.0.2
129 | mpmath==1.3.0
130 | msgpack==1.1.0
131 | msgspec==0.19.0
132 | multidict==6.4.3
133 | multiprocess==0.70.16
134 | munkres==1.1.4
135 | nanobind==2.6.1
136 | nbclient==0.10.2
137 | nbconvert==7.16.6
138 | nbformat==5.10.4
139 | nest-asyncio==1.6.0
140 | networkx==3.4.2
141 | ninja==1.11.1.3
142 | nltk==3.9.1
143 | notebook==7.3.2
144 | notebook_shim==0.2.4
145 | numba==0.60.0
146 | numpy==1.26.4
147 | nvidia-cublas-cu11==11.11.3.6
148 | nvidia-cublas-cu12==12.4.5.8
149 | nvidia-cuda-cupti-cu11==11.8.87
150 | nvidia-cuda-cupti-cu12==12.4.127
151 | nvidia-cuda-nvrtc-cu11==11.8.89
152 | nvidia-cuda-nvrtc-cu12==12.4.127
153 | nvidia-cuda-runtime-cu11==11.8.89
154 | nvidia-cuda-runtime-cu12==12.4.127
155 | nvidia-cudnn-cu11==9.1.0.70
156 | nvidia-cudnn-cu12==9.1.0.70
157 | nvidia-cufft-cu11==10.9.0.58
158 | nvidia-cufft-cu12==11.2.1.3
159 | nvidia-cufile-cu12==1.11.1.6
160 | nvidia-curand-cu11==10.3.0.86
161 | nvidia-curand-cu12==10.3.5.147
162 | nvidia-cusolver-cu11==11.4.1.48
163 | nvidia-cusolver-cu12==11.6.1.9
164 | nvidia-cusparse-cu11==11.7.5.86
165 | nvidia-cusparse-cu12==12.3.1.170
166 | nvidia-cusparselt-cu12==0.6.2
167 | nvidia-ml-py==12.570.86
168 | nvidia-nccl-cu11==2.21.5
169 | nvidia-nccl-cu12==2.21.5
170 | nvidia-nvjitlink-cu12==12.4.127
171 | nvidia-nvtx-cu11==11.8.86
172 | nvidia-nvtx-cu12==12.4.127
173 | openai==1.76.0
174 | opencensus==0.11.4
175 | opencensus-context==0.1.3
176 | opencv-python-headless==4.11.0.86
177 | -e git+https://github.com/rosmineb/OpenRLHF.git@047bbfaa0d29063801720b961b64d068ae2ea7cd#egg=openrlhf
178 | opentelemetry-api==1.26.0
179 | opentelemetry-exporter-otlp==1.26.0
180 | opentelemetry-exporter-otlp-proto-common==1.26.0
181 | opentelemetry-exporter-otlp-proto-grpc==1.26.0
182 | opentelemetry-exporter-otlp-proto-http==1.26.0
183 | opentelemetry-proto==1.26.0
184 | opentelemetry-sdk==1.26.0
185 | opentelemetry-semantic-conventions==0.47b0
186 | opentelemetry-semantic-conventions-ai==0.4.3
187 | optimum==1.24.0
188 | outlines==0.1.11
189 | outlines_core==0.1.26
190 | overrides==7.7.0
191 | packaging==25.0
192 | pandas==2.2.3
193 | pandocfilters==1.5.1
194 | parso==0.8.4
195 | partial-json-parser==0.2.1.1.post5
196 | peft==0.14.0
197 | pexpect==4.9.0
198 | pillow==11.2.1
199 | platformdirs==4.3.6
200 | pluggy==1.5.0
201 | prometheus-fastapi-instrumentator==7.1.0
202 | prometheus_client==0.21.1
203 | prompt_toolkit==3.0.50
204 | propcache==0.3.1
205 | proto-plus==1.26.0
206 | protobuf==4.25.7
207 | psutil==7.0.0
208 | ptyprocess==0.7.0
209 | pure_eval==0.2.3
210 | py-cpuinfo==9.0.0
211 | py-spy==0.4.0
212 | pyairports==2.1.1
213 | pyarrow==19.0.1
214 | pyasn1==0.6.1
215 | pyasn1_modules==0.4.1
216 | pybind11==2.13.6
217 | pycountry==24.6.1
218 | pycparser==2.22
219 | pydantic==2.11.3
220 | pydantic_core==2.33.1
221 | Pygments==2.19.1
222 | pynvml==12.0.0
223 | pyparsing==3.2.1
224 | pytest==8.3.4
225 | python-dateutil==2.9.0.post0
226 | python-dotenv==1.1.0
227 | python-json-logger==3.2.1
228 | python-multipart==0.0.20
229 | pytz==2025.2
230 | PyYAML==6.0.2
231 | pyzmq==26.4.0
232 | RapidFuzz==3.12.1
233 | ray==2.40.0
234 | referencing==0.36.2
235 | regex==2024.11.6
236 | requests==2.32.3
237 | rfc3339-validator==0.1.4
238 | rfc3986-validator==0.1.1
239 | rich==13.9.4
240 | rich-toolkit==0.13.2
241 | rpds-py==0.24.0
242 | rsa==4.9
243 | safetensors==0.5.3
244 | scipy==1.15.2
245 | Send2Trash==1.8.3
246 | sentencepiece==0.2.0
247 | sentry-sdk==2.21.0
248 | setproctitle==1.3.4
249 | shellingham==1.5.4
250 | six==1.17.0
251 | smart-open==7.1.0
252 | smmap==5.0.2
253 | sniffio==1.3.1
254 | soupsieve==2.6
255 | stack-data==0.6.3
256 | starlette==0.46.2
257 | sympy==1.13.1
258 | tensorboard==2.19.0
259 | tensorboard-data-server==0.7.2
260 | terminado==0.18.1
261 | thefuzz==0.22.1
262 | tiktoken==0.9.0
263 | tinycss2==1.4.0
264 | tokenizers==0.21.1
265 | torch==2.5.1
266 | torchaudio==2.5.1
267 | torchmetrics==1.6.1
268 | torchvision==0.20.1
269 | tornado==6.4.2
270 | tqdm==4.67.1
271 | traitlets==5.14.3
272 | transformers==4.51.3
273 | transformers-stream-generator==0.0.5
274 | tree-sitter==0.24.0
275 | tree-sitter-python==0.23.6
276 | triton==3.1.0
277 | typer==0.15.1
278 | types-python-dateutil==2.9.0.20241206
279 | typing-inspection==0.4.0
280 | typing_extensions==4.13.2
281 | tzdata==2025.2
282 | uri-template==1.3.0
283 | urllib3==2.4.0
284 | uvicorn==0.34.2
285 | uvloop==0.21.0
286 | virtualenv==20.29.2
287 | vllm==0.7.3
288 | wandb==0.19.6
289 | watchfiles==1.0.5
290 | wcwidth==0.2.13
291 | webcolors==24.11.1
292 | webencodings==0.5.1
293 | websocket-client==1.8.0
294 | websockets==15.0.1
295 | Werkzeug==3.1.3
296 | widgetsnbextension==4.0.13
297 | wrapt==1.17.2
298 | xformers==0.0.28.post3
299 | xgrammar==0.1.11
300 | xxhash==3.5.0
301 | yarl==1.20.0
302 | zipp==3.21.0
303 |
--------------------------------------------------------------------------------
/unit_tests_server/unsloth_train_patch.py:
--------------------------------------------------------------------------------
1 | # stolen from ART https://github.com/OpenPipe/ART/blob/5f3dea20069ee8e4afbd482e529df5ee80d81b81/src/art/local/train.py
2 |
3 | import torch
4 | import torch.nn as nn
5 | import torch.optim as optim
6 | from typing import Callable, Dict, List, Optional, Tuple, Union
7 |
8 | from trl import GRPOTrainer
9 | from peft import PeftModel
10 | import numpy as np
11 | import os
12 |
13 | import pdb
14 |
# Option mapping passed as ``options=`` to ``torch.compile`` in this module.
torch_compile_options = {
    "epilogue_fusion": True,
    "max_autotune": False,
    "shape_padding": True,
    "trace.enabled": False,
    "triton.cudagraphs": False,
}
22 |
def train(
    trainer: "GRPOTrainer",
) -> None:
    """Run ``trainer.train()`` with ``trainer.compute_loss`` temporarily patched.

    While training runs, ``trainer.compute_loss`` forwards to this module's
    ``compute_loss`` with the trainer passed as first argument. The original
    attribute is put back afterwards, even if training raises.
    """
    saved_compute_loss = trainer.compute_loss

    def _patched_compute_loss(*call_args, **call_kwargs):
        return compute_loss(trainer, *call_args, **call_kwargs)

    trainer.compute_loss = _patched_compute_loss
    try:
        trainer.train()
    finally:
        # Always undo the patch so the trainer is left as it was found.
        trainer.compute_loss = saved_compute_loss
34 |
@torch.compile(dynamic = True, fullgraph = True, options = torch_compile_options)
def grpo_compute_loss_slow(old_logits, new_logits, input_ids, mask, beta, advantages):
    """Compute the GRPO loss and its logging metrics from full logits.

    Args:
        old_logits: Reference logits, or ``None`` to use a zero KL term.
        new_logits: Current-policy logits; gathered along the last dimension
            at ``input_ids``.
        input_ids: Token ids whose log-probabilities are scored.
        mask: Per-token mask; summed along dim 1 to normalise each row.
        beta: Weight of the KL term.
        advantages: One advantage per row (broadcast over dim 1).

    Returns:
        tuple: ``(loss, completion_length, mean_kl)``. The original returned
        only the first two values although ``compute_loss`` in this module
        unpacks three, so ``mean_kl`` was computed and then dropped.
    """
    # All Unsloth Zoo code licensed under LGPLv3
    input_ids = input_ids.unsqueeze(-1)
    new_logits = new_logits.to(torch.float32)
    # log-softmax at the chosen token: x_i - logsumexp(x)
    new_x = torch.gather(new_logits, dim = -1, index = input_ids).squeeze(-1)
    new = new_x - torch.logsumexp(new_logits, dim = -1)

    if old_logits is None:
        kl_i = torch.zeros_like(mask)
    else:
        old_logits = old_logits.to(torch.float32)

        # x_i - logsumexp(x_i)
        old_x = torch.gather(old_logits, dim = -1, index = input_ids).squeeze(-1)
        old = old_x - torch.logsumexp(old_logits, dim = -1)

        # Reverse KL
        kl_i = torch.exp(old - new) - (old - new) - 1.0
        # Full correct reverse KL divergence?? Missing term maybe?
        # kl_i = torch.exp(new) * kl_i

        # Below is forward KL (normal KL)
        # kl_i = torch.exp(old) * (old - new)

    # Must detach - otherwise gradients are not propagated correctly!
    # exp(x - x) == 1
    loss_i = torch.exp(new - new.detach()) * advantages.unsqueeze(1)
    loss_i = -(loss_i - beta * kl_i)

    mask = mask.to(torch.float32)
    n_mask_per_reward = mask.sum(1)
    # A fully masked row has a zero numerator as well, so dividing by 1
    # instead of 0 yields 0 rather than NaN. Assumes a 0/1 mask, for which
    # every non-empty row already sums to >= 1 and is unaffected.
    safe_n_mask = n_mask_per_reward.clamp(min = 1.0)

    # See https://github.com/huggingface/trl/pull/2881
    loss_per_reward = (loss_i * mask).sum(1) / safe_n_mask
    loss = loss_per_reward.mean()
    # loss = (loss_i * mask).sum() / mask.sum()

    # Get metrics as well which are folded
    with torch.inference_mode():
        completion_length = n_mask_per_reward.mean()
        mean_kl_per_reward = (kl_i * mask).sum(1) / safe_n_mask
        mean_kl = mean_kl_per_reward.mean()

    return loss, completion_length, mean_kl
80 |
def grpo_accumulated_loss(
    trainer,
    input_ids,
    logits_to_keep,
    completion_mask,
    advantages,
    n_chunks = -1,
):
    """Compute the GRPO loss through ``UnslothEfficientGRPO``.

    Runs the model twice on ``input_ids``, once with the adapter disabled
    under ``torch.inference_mode()`` and once normally, then passes both
    outputs plus the output-embedding weight to ``UnslothEfficientGRPO.apply``.

    Args:
        trainer: Object exposing ``model``, ``accelerator`` and ``beta``.
        input_ids: 2-D tensor; the last ``logits_to_keep`` columns are scored
            as completion tokens.
        logits_to_keep: Number of trailing positions to score.
        completion_mask: Forwarded unchanged to the loss.
        advantages: Forwarded unchanged to the loss.
        n_chunks: Requested chunk count; ``-1`` means the batch size. It is
            snapped to a divisor of the batch size.

    Returns:
        tuple: ``(loss, completion_length, mean_kl)``.

    Side effects:
        Sets the environment variable ``UNSLOTH_RETURN_HIDDEN_STATES`` to "1".

    NOTE(review): ``UnslothEfficientGRPO`` is neither defined nor imported in
    this module, so this call raises ``NameError`` unless the name is put in
    module scope some other way. Confirm where it should be imported from.
    """
    # All Unsloth Zoo code licensed under LGPLv3
    bsz, _ = input_ids.shape
    # Snap n_chunks to the first divisor of bsz that is >= the requested
    # value (or to the largest divisor if the request exceeds bsz).
    factors = [i for i in range(1, bsz + 1) if bsz % i == 0]
    if n_chunks == -1: n_chunks = bsz
    n_chunks = factors[min(np.searchsorted(factors, n_chunks), len(factors)-1)]

    mixed_dtype = torch.float16 if os.environ.get('ACCELERATE_MIXED_PRECISION', 'fp16') == 'fp16' else torch.bfloat16
    os.environ["UNSLOTH_RETURN_HIDDEN_STATES"] = "1"

    completion_input_ids = input_ids[:, -logits_to_keep:]
    lm_head = trainer.model.get_output_embeddings().weight

    with torch.amp.autocast(device_type = "cuda", dtype = mixed_dtype):
        # Reference pass: adapter disabled, no autograd tracking.
        with torch.inference_mode(), trainer.accelerator.unwrap_model(trainer.model, keep_fp32_wrapper = False).disable_adapter():
            old_hidden_states = trainer.model(input_ids = input_ids, logits_to_keep = logits_to_keep + 1).logits

        # Policy pass with the adapter active.
        new_hidden_states = trainer.model(input_ids = input_ids, logits_to_keep = logits_to_keep + 1).logits

        loss, completion_length, mean_kl = UnslothEfficientGRPO.apply(
            new_hidden_states, old_hidden_states, lm_head,
            completion_input_ids, completion_mask, advantages, trainer.beta,
            trainer.accelerator.scaler,
            n_chunks,
        )
        # The former "old non efficient code path" that followed this return
        # was unreachable (and referenced an undefined grpo_compute_loss), so
        # it has been removed.
        return loss, completion_length, mean_kl
127 |
def compute_loss(self, model, inputs, return_outputs = False, num_items_in_batch = None):
    """Replacement for the trainer's ``compute_loss`` used while patched.

    Concatenates prompt and completion tensors, asks the trainer for
    per-token values via ``self._get_per_token_logps`` and then computes the
    loss with ``grpo_compute_loss_slow`` (when that call returned something)
    or ``grpo_accumulated_loss`` (when it returned ``None``). Completion
    length and KL are appended to ``self._metrics``.

    Args:
        self: The trainer instance (this is a module-level function that is
            passed the trainer explicitly).
        model: Forwarded to ``self._get_per_token_logps``.
        inputs: Mapping with ``prompt_ids``, ``prompt_mask``,
            ``completion_ids``, ``completion_mask``, ``advantages`` and
            optionally ``ref_per_token_logps``.
        return_outputs: Must be falsy.
        num_items_in_batch: Accepted for interface compatibility; unused.

    Returns:
        The loss produced by the selected loss function.

    Raises:
        ValueError: If ``return_outputs`` is truthy.
    """
    if return_outputs:
        raise ValueError("The GRPOTrainer does not support returning outputs")

    prompt_ids, prompt_mask = inputs["prompt_ids"], inputs["prompt_mask"]
    completion_ids, completion_mask = inputs["completion_ids"], inputs["completion_mask"]
    full_input_ids = torch.cat([prompt_ids, completion_ids], dim=1)
    attention_mask = torch.cat([prompt_mask, completion_mask], dim=1)
    logits_to_keep = completion_ids.size(1)  # we only need to compute the logits for the completion tokens

    per_token_logps = self._get_per_token_logps(model, full_input_ids, attention_mask, logits_to_keep)

    # Reference values are optional; None makes the slow path use a zero KL term.
    ref_per_token_logps = inputs["ref_per_token_logps"] if "ref_per_token_logps" in inputs else None
    advantages = inputs["advantages"]

    if per_token_logps is not None:
        # The slow path scores only the completion tokens.
        loss, completion_length, mean_kl = grpo_compute_loss_slow(
            ref_per_token_logps, per_token_logps, full_input_ids[:, -logits_to_keep:],
            completion_mask, self.beta, advantages,
        )
    else:
        # The accumulated path receives the full prompt + completion ids.
        loss, completion_length, mean_kl = grpo_accumulated_loss(
            self, full_input_ids, logits_to_keep, completion_mask, advantages,
            n_chunks = self.args.unsloth_num_chunks,
        )

    # Log the metrics. self._metrics is either split by mode ("train"/"eval")
    # or a flat mapping; pick the right bucket once.
    if "train" in self._metrics:
        mode = "eval" if self.control.should_evaluate else "train"
        metrics = self._metrics[mode]
    else:
        metrics = self._metrics
    metrics["completion_length"].append(completion_length.item())
    metrics["kl"].append(mean_kl.item())
    return loss
179 |
--------------------------------------------------------------------------------