├── .gitignore
├── LICENSE
├── README.md
├── assets
├── costs-1.5b.png
├── costs-7b.png
├── overall.png
└── performances.png
├── eval.sh
├── logs
├── evals
│ ├── Exp1_100
│ │ └── results
│ │ │ └── quyanh
│ │ │ └── OpenRS-GRPO
│ │ │ ├── results_2025-03-14T23-12-53.542208.json
│ │ │ ├── results_2025-03-14T23-36-50.272791.json
│ │ │ ├── results_2025-03-14T23-48-54.788605.json
│ │ │ ├── results_2025-03-15T13-49-09.435323.json
│ │ │ ├── results_2025-03-15T14-13-14.418707.json
│ │ │ └── results_2025-03-15T14-25-17.786800.json
│ ├── Exp1_200
│ │ └── results
│ │ │ └── quyanh
│ │ │ └── OpenRS-GRPO
│ │ │ ├── results_2025-03-15T00-02-07.142924.json
│ │ │ ├── results_2025-03-15T00-35-55.755190.json
│ │ │ ├── results_2025-03-15T00-48-41.771959.json
│ │ │ └── results_2025-03-15T14-37-59.982413.json
│ ├── Exp1_300
│ │ └── results
│ │ │ └── quyanh
│ │ │ └── OpenRS-GRPO
│ │ │ ├── results_2025-03-15T01-02-29.088363.json
│ │ │ ├── results_2025-03-15T01-51-01.120880.json
│ │ │ └── results_2025-03-15T02-04-41.905474.json
│ ├── Exp1_400
│ │ └── results
│ │ │ └── quyanh
│ │ │ └── OpenRS-GRPO
│ │ │ ├── results_2025-03-15T02-14-11.184777.json
│ │ │ ├── results_2025-03-15T02-36-02.705517.json
│ │ │ └── results_2025-03-15T02-44-26.970544.json
│ ├── Exp1_500
│ │ └── results
│ │ │ └── quyanh
│ │ │ └── OpenRS-GRPO
│ │ │ ├── results_2025-03-15T02-52-04.589810.json
│ │ │ ├── results_2025-03-15T03-11-26.478898.json
│ │ │ └── results_2025-03-15T03-18-43.114223.json
│ ├── Exp2_100
│ │ └── results
│ │ │ └── quyanh
│ │ │ └── OpenRS-GRPO
│ │ │ ├── results_2025-03-15T04-13-19.506547.json
│ │ │ ├── results_2025-03-15T04-35-14.751095.json
│ │ │ └── results_2025-03-15T04-44-39.651229.json
│ ├── Exp2_150
│ │ └── results
│ │ │ └── quyanh
│ │ │ └── OpenRS-GRPO
│ │ │ ├── results_2025-03-15T04-54-03.406254.json
│ │ │ ├── results_2025-03-15T05-13-57.844846.json
│ │ │ └── results_2025-03-15T05-20-07.618719.json
│ ├── Exp2_200
│ │ └── results
│ │ │ └── quyanh
│ │ │ └── OpenRS-GRPO
│ │ │ ├── results_2025-03-15T05-31-02.871828.json
│ │ │ ├── results_2025-03-15T05-51-04.119358.json
│ │ │ └── results_2025-03-15T06-00-23.069698.json
│ ├── Exp2_250
│ │ └── results
│ │ │ └── quyanh
│ │ │ └── OpenRS-GRPO
│ │ │ ├── results_2025-03-15T06-09-49.021006.json
│ │ │ ├── results_2025-03-15T06-28-55.702973.json
│ │ │ └── results_2025-03-15T06-34-45.013493.json
│ ├── Exp2_300
│ │ └── results
│ │ │ └── quyanh
│ │ │ └── OpenRS-GRPO
│ │ │ ├── results_2025-03-15T06-42-52.541003.json
│ │ │ ├── results_2025-03-15T06-56-52.023106.json
│ │ │ └── results_2025-03-15T07-04-05.239254.json
│ ├── Exp2_350
│ │ └── results
│ │ │ └── quyanh
│ │ │ └── OpenRS-GRPO
│ │ │ ├── results_2025-03-15T07-10-27.028672.json
│ │ │ ├── results_2025-03-15T07-24-32.807123.json
│ │ │ └── results_2025-03-15T07-29-47.478300.json
│ ├── Exp2_400
│ │ └── results
│ │ │ └── quyanh
│ │ │ └── OpenRS-GRPO
│ │ │ ├── results_2025-03-15T07-36-30.994685.json
│ │ │ ├── results_2025-03-15T07-49-32.062451.json
│ │ │ └── results_2025-03-15T07-55-51.416032.json
│ ├── Exp2_50
│ │ └── results
│ │ │ └── quyanh
│ │ │ └── OpenRS-GRPO
│ │ │ ├── results_2025-03-15T03-30-59.655917.json
│ │ │ ├── results_2025-03-15T03-53-53.011460.json
│ │ │ └── results_2025-03-15T04-01-54.627135.json
│ ├── Exp3_100
│ │ └── results
│ │ │ └── quyanh
│ │ │ └── OpenRS-GRPO
│ │ │ ├── results_2025-03-15T08-52-22.004802.json
│ │ │ ├── results_2025-03-15T09-16-52.530386.json
│ │ │ └── results_2025-03-15T09-26-36.766492.json
│ ├── Exp3_150
│ │ └── results
│ │ │ └── quyanh
│ │ │ └── OpenRS-GRPO
│ │ │ ├── results_2025-03-15T09-36-59.825027.json
│ │ │ ├── results_2025-03-15T09-55-14.957560.json
│ │ │ └── results_2025-03-15T10-04-33.528650.json
│ ├── Exp3_200
│ │ └── results
│ │ │ └── quyanh
│ │ │ └── OpenRS-GRPO
│ │ │ ├── results_2025-03-15T10-15-17.421583.json
│ │ │ ├── results_2025-03-15T10-33-42.826955.json
│ │ │ └── results_2025-03-15T10-39-54.446838.json
│ ├── Exp3_250
│ │ └── results
│ │ │ └── quyanh
│ │ │ └── OpenRS-GRPO
│ │ │ ├── results_2025-03-15T10-50-49.693635.json
│ │ │ ├── results_2025-03-15T11-11-55.345666.json
│ │ │ └── results_2025-03-15T11-22-45.088452.json
│ ├── Exp3_300
│ │ └── results
│ │ │ └── quyanh
│ │ │ └── OpenRS-GRPO
│ │ │ ├── results_2025-03-15T11-33-37.815028.json
│ │ │ ├── results_2025-03-15T11-56-26.434504.json
│ │ │ └── results_2025-03-15T12-07-33.637126.json
│ ├── Exp3_350
│ │ └── results
│ │ │ └── quyanh
│ │ │ └── OpenRS-GRPO
│ │ │ ├── results_2025-03-15T12-19-09.426338.json
│ │ │ ├── results_2025-03-15T12-45-32.618178.json
│ │ │ └── results_2025-03-15T12-54-08.288591.json
│ ├── Exp3_400
│ │ └── results
│ │ │ └── quyanh
│ │ │ └── OpenRS-GRPO
│ │ │ ├── results_2025-03-15T13-02-26.672622.json
│ │ │ ├── results_2025-03-15T13-24-00.237541.json
│ │ │ └── results_2025-03-15T13-32-38.296980.json
│ └── Exp3_50
│ │ └── results
│ │ └── quyanh
│ │ └── OpenRS-GRPO
│ │ ├── results_2025-03-15T08-07-05.491769.json
│ │ ├── results_2025-03-15T08-31-16.022306.json
│ │ └── results_2025-03-15T08-41-29.644364.json
├── logs.txt
└── overall
│ ├── Exp1_100
│ └── results
│ │ └── quyanh
│ │ └── OpenRS-GRPO
│ │ ├── results_2025-03-15T18-04-10.796422.json
│ │ ├── results_2025-03-15T18-27-09.526559.json
│ │ ├── results_2025-03-15T18-37-46.790845.json
│ │ ├── results_2025-03-15T18-56-57.793761.json
│ │ └── results_2025-03-15T20-14-03.383896.json
│ ├── Exp2_50
│ └── results
│ │ └── quyanh
│ │ └── OpenRS-GRPO
│ │ ├── results_2025-03-15T20-26-02.445125.json
│ │ ├── results_2025-03-15T20-48-43.218853.json
│ │ ├── results_2025-03-15T20-56-24.803840.json
│ │ ├── results_2025-03-15T21-15-57.124823.json
│ │ └── results_2025-03-15T22-27-30.404581.json
│ └── Exp3_50
│ └── results
│ └── quyanh
│ └── OpenRS-GRPO
│ ├── results_2025-03-15T22-38-52.294521.json
│ ├── results_2025-03-15T23-02-19.308882.json
│ ├── results_2025-03-15T23-12-28.069489.json
│ ├── results_2025-03-15T23-32-02.283638.json
│ └── results_2025-03-16T00-44-02.504832.json
├── recipes
├── accelerate_configs
│ ├── ddp.yaml
│ ├── fsdp.yaml
│ ├── zero2.yaml
│ └── zero3.yaml
├── data_cleaner.yaml
└── grpo.yaml
├── setup.cfg
├── setup.py
└── src
└── open_r1
├── __init__.py
├── configs.py
├── evaluate.py
├── generate.py
├── grpo.py
├── rewards.py
├── sft.py
└── utils
├── __init__.py
├── callbacks.py
├── evaluation.py
├── hub.py
├── import_utils.py
├── model_utils.py
└── wandb_logging.py
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2025 Knovel Engineering
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/assets/costs-1.5b.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/knoveleng/open-rs/887a66e4572ceb9f3d41bc49139a7f90832025c8/assets/costs-1.5b.png
--------------------------------------------------------------------------------
/assets/costs-7b.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/knoveleng/open-rs/887a66e4572ceb9f3d41bc49139a7f90832025c8/assets/costs-7b.png
--------------------------------------------------------------------------------
/assets/overall.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/knoveleng/open-rs/887a66e4572ceb9f3d41bc49139a7f90832025c8/assets/overall.png
--------------------------------------------------------------------------------
/assets/performances.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/knoveleng/open-rs/887a66e4572ceb9f3d41bc49139a7f90832025c8/assets/performances.png
--------------------------------------------------------------------------------
/logs/evals/Exp1_100/results/quyanh/OpenRS-GRPO/results_2025-03-14T23-12-53.542208.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 2551784.221348991,
9 | "end_time": 2552485.226336305,
10 | "total_evaluation_time_secondes": "701.0049873138778",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|aime24|0": {
18 | "extractive_match": 0.3,
19 | "extractive_match_stderr": 0.0850962943396763
20 | },
21 | "all": {
22 | "extractive_match": 0.3,
23 | "extractive_match_stderr": 0.0850962943396763
24 | }
25 | },
26 | "versions": {
27 | "custom|aime24|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|aime24": {
31 | "name": "aime24",
32 | "prompt_function": "aime_prompt_fn",
33 | "hf_repo": "HuggingFaceH4/aime_2024",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "train"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "train"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 30,
64 | "effective_num_docs": 30,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|aime24|0": {
71 | "hashes": {
72 | "hash_examples": "ddec8fc79d0a014b",
73 | "hash_full_prompts": "253167becf0dfed7",
74 | "hash_input_tokens": "bf1cc75b5f12dfb8",
75 | "hash_cont_tokens": "5141f36c632674a8"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 30,
79 | "padded": 0,
80 | "non_padded": 30,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "c903e836a519cf98",
88 | "hash_full_prompts": "84ff409b6bbf7cc0",
89 | "hash_input_tokens": "9a8c7e54ce09af84",
90 | "hash_cont_tokens": "f8465f893370f47e"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 30,
94 | "padded": 0,
95 | "non_padded": 30,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/evals/Exp1_100/results/quyanh/OpenRS-GRPO/results_2025-03-14T23-36-50.272791.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 2552567.750112632,
9 | "end_time": 2553921.892538854,
10 | "total_evaluation_time_secondes": "1354.1424262220971",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|math_500|0": {
18 | "extractive_match": 0.838,
19 | "extractive_match_stderr": 0.016494123566423505
20 | },
21 | "all": {
22 | "extractive_match": 0.838,
23 | "extractive_match_stderr": 0.016494123566423505
24 | }
25 | },
26 | "versions": {
27 | "custom|math_500|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|math_500": {
31 | "name": "math_500",
32 | "prompt_function": "math_prompt_fn",
33 | "hf_repo": "HuggingFaceH4/MATH-500",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "test"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "test"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 500,
64 | "effective_num_docs": 500,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|math_500|0": {
71 | "hashes": {
72 | "hash_examples": "adf0cc8311011db2",
73 | "hash_full_prompts": "63c902dbdbaf1552",
74 | "hash_input_tokens": "2af397a095a31139",
75 | "hash_cont_tokens": "4e007e6ddae6bd47"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 500,
79 | "padded": 0,
80 | "non_padded": 500,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "bfaad1993ff37a60",
88 | "hash_full_prompts": "3ceaaade5cf43911",
89 | "hash_input_tokens": "c663dbac8a64d3e4",
90 | "hash_cont_tokens": "fa9f7bbd65512b23"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 500,
94 | "padded": 0,
95 | "non_padded": 500,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/evals/Exp1_100/results/quyanh/OpenRS-GRPO/results_2025-03-14T23-48-54.788605.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 2554015.285077489,
9 | "end_time": 2554646.476080566,
10 | "total_evaluation_time_secondes": "631.1910030771978",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|amc23|0": {
18 | "extractive_match": 0.7,
19 | "extractive_match_stderr": 0.07337993857053426
20 | },
21 | "all": {
22 | "extractive_match": 0.7,
23 | "extractive_match_stderr": 0.07337993857053426
24 | }
25 | },
26 | "versions": {
27 | "custom|amc23|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|amc23": {
31 | "name": "amc23",
32 | "prompt_function": "amc_prompt_fn",
33 | "hf_repo": "knoveleng/AMC-23",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "train"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "train"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 40,
64 | "effective_num_docs": 40,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|amc23|0": {
71 | "hashes": {
72 | "hash_examples": "57f3ead69f601b6a",
73 | "hash_full_prompts": "64c05856286ef8dc",
74 | "hash_input_tokens": "b64afe4485ef61c0",
75 | "hash_cont_tokens": "b589babd0b65f01d"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 40,
79 | "padded": 0,
80 | "non_padded": 40,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "fe1f1f4512256bec",
88 | "hash_full_prompts": "882107366cadc5ce",
89 | "hash_input_tokens": "638cd1079164f374",
90 | "hash_cont_tokens": "605820c49e2b0f60"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 40,
94 | "padded": 0,
95 | "non_padded": 40,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/evals/Exp1_100/results/quyanh/OpenRS-GRPO/results_2025-03-15T13-49-09.435323.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 2604391.1116118,
9 | "end_time": 2605061.115226257,
10 | "total_evaluation_time_secondes": "670.0036144573241",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|aime24|0": {
18 | "extractive_match": 0.3,
19 | "extractive_match_stderr": 0.0850962943396763
20 | },
21 | "all": {
22 | "extractive_match": 0.3,
23 | "extractive_match_stderr": 0.0850962943396763
24 | }
25 | },
26 | "versions": {
27 | "custom|aime24|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|aime24": {
31 | "name": "aime24",
32 | "prompt_function": "aime_prompt_fn",
33 | "hf_repo": "HuggingFaceH4/aime_2024",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "train"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "train"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 30,
64 | "effective_num_docs": 30,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|aime24|0": {
71 | "hashes": {
72 | "hash_examples": "ddec8fc79d0a014b",
73 | "hash_full_prompts": "253167becf0dfed7",
74 | "hash_input_tokens": "bf1cc75b5f12dfb8",
75 | "hash_cont_tokens": "5141f36c632674a8"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 30,
79 | "padded": 0,
80 | "non_padded": 30,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "c903e836a519cf98",
88 | "hash_full_prompts": "84ff409b6bbf7cc0",
89 | "hash_input_tokens": "9a8c7e54ce09af84",
90 | "hash_cont_tokens": "f8465f893370f47e"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 30,
94 | "padded": 0,
95 | "non_padded": 30,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/evals/Exp1_100/results/quyanh/OpenRS-GRPO/results_2025-03-15T14-13-14.418707.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 2605133.897685678,
9 | "end_time": 2606506.01809599,
10 | "total_evaluation_time_secondes": "1372.1204103119671",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|math_500|0": {
18 | "extractive_match": 0.838,
19 | "extractive_match_stderr": 0.016494123566423505
20 | },
21 | "all": {
22 | "extractive_match": 0.838,
23 | "extractive_match_stderr": 0.016494123566423505
24 | }
25 | },
26 | "versions": {
27 | "custom|math_500|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|math_500": {
31 | "name": "math_500",
32 | "prompt_function": "math_prompt_fn",
33 | "hf_repo": "HuggingFaceH4/MATH-500",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "test"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "test"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 500,
64 | "effective_num_docs": 500,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|math_500|0": {
71 | "hashes": {
72 | "hash_examples": "adf0cc8311011db2",
73 | "hash_full_prompts": "63c902dbdbaf1552",
74 | "hash_input_tokens": "2af397a095a31139",
75 | "hash_cont_tokens": "4e007e6ddae6bd47"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 500,
79 | "padded": 0,
80 | "non_padded": 500,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "bfaad1993ff37a60",
88 | "hash_full_prompts": "3ceaaade5cf43911",
89 | "hash_input_tokens": "c663dbac8a64d3e4",
90 | "hash_cont_tokens": "fa9f7bbd65512b23"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 500,
94 | "padded": 0,
95 | "non_padded": 500,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/evals/Exp1_100/results/quyanh/OpenRS-GRPO/results_2025-03-15T14-25-17.786800.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 2606610.004414024,
9 | "end_time": 2607229.474309994,
10 | "total_evaluation_time_secondes": "619.4698959700763",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|amc23|0": {
18 | "extractive_match": 0.7,
19 | "extractive_match_stderr": 0.07337993857053426
20 | },
21 | "all": {
22 | "extractive_match": 0.7,
23 | "extractive_match_stderr": 0.07337993857053426
24 | }
25 | },
26 | "versions": {
27 | "custom|amc23|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|amc23": {
31 | "name": "amc23",
32 | "prompt_function": "amc_prompt_fn",
33 | "hf_repo": "knoveleng/AMC-23",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "train"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "train"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 40,
64 | "effective_num_docs": 40,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|amc23|0": {
71 | "hashes": {
72 | "hash_examples": "57f3ead69f601b6a",
73 | "hash_full_prompts": "64c05856286ef8dc",
74 | "hash_input_tokens": "b64afe4485ef61c0",
75 | "hash_cont_tokens": "b589babd0b65f01d"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 40,
79 | "padded": 0,
80 | "non_padded": 40,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "fe1f1f4512256bec",
88 | "hash_full_prompts": "882107366cadc5ce",
89 | "hash_input_tokens": "638cd1079164f374",
90 | "hash_cont_tokens": "605820c49e2b0f60"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 40,
94 | "padded": 0,
95 | "non_padded": 40,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/evals/Exp1_200/results/quyanh/OpenRS-GRPO/results_2025-03-15T00-02-07.142924.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 2554717.479748211,
9 | "end_time": 2555438.820359841,
10 | "total_evaluation_time_secondes": "721.3406116301194",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|aime24|0": {
18 | "extractive_match": 0.3,
19 | "extractive_match_stderr": 0.0850962943396763
20 | },
21 | "all": {
22 | "extractive_match": 0.3,
23 | "extractive_match_stderr": 0.0850962943396763
24 | }
25 | },
26 | "versions": {
27 | "custom|aime24|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|aime24": {
31 | "name": "aime24",
32 | "prompt_function": "aime_prompt_fn",
33 | "hf_repo": "HuggingFaceH4/aime_2024",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "train"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "train"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 30,
64 | "effective_num_docs": 30,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|aime24|0": {
71 | "hashes": {
72 | "hash_examples": "ddec8fc79d0a014b",
73 | "hash_full_prompts": "253167becf0dfed7",
74 | "hash_input_tokens": "bf1cc75b5f12dfb8",
75 | "hash_cont_tokens": "b63238639d2fcf76"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 30,
79 | "padded": 0,
80 | "non_padded": 30,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "c903e836a519cf98",
88 | "hash_full_prompts": "84ff409b6bbf7cc0",
89 | "hash_input_tokens": "9a8c7e54ce09af84",
90 | "hash_cont_tokens": "ecb6fd6a17a1ef8e"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 30,
94 | "padded": 0,
95 | "non_padded": 30,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/evals/Exp1_200/results/quyanh/OpenRS-GRPO/results_2025-03-15T00-35-55.755190.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 2555529.780258504,
9 | "end_time": 2557467.354353926,
10 | "total_evaluation_time_secondes": "1937.5740954219364",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|math_500|0": {
18 | "extractive_match": 0.798,
19 | "extractive_match_stderr": 0.017973260031288248
20 | },
21 | "all": {
22 | "extractive_match": 0.798,
23 | "extractive_match_stderr": 0.017973260031288248
24 | }
25 | },
26 | "versions": {
27 | "custom|math_500|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|math_500": {
31 | "name": "math_500",
32 | "prompt_function": "math_prompt_fn",
33 | "hf_repo": "HuggingFaceH4/MATH-500",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "test"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "test"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 500,
64 | "effective_num_docs": 500,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|math_500|0": {
71 | "hashes": {
72 | "hash_examples": "adf0cc8311011db2",
73 | "hash_full_prompts": "63c902dbdbaf1552",
74 | "hash_input_tokens": "2af397a095a31139",
75 | "hash_cont_tokens": "eba9b0271e8c8b2f"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 500,
79 | "padded": 0,
80 | "non_padded": 500,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "bfaad1993ff37a60",
88 | "hash_full_prompts": "3ceaaade5cf43911",
89 | "hash_input_tokens": "c663dbac8a64d3e4",
90 | "hash_cont_tokens": "aafc6b7d92bee8a3"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 500,
94 | "padded": 0,
95 | "non_padded": 500,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/evals/Exp1_200/results/quyanh/OpenRS-GRPO/results_2025-03-15T00-48-41.771959.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 2557548.485465054,
9 | "end_time": 2558233.455578798,
10 | "total_evaluation_time_secondes": "684.9701137440279",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|amc23|0": {
18 | "extractive_match": 0.575,
19 | "extractive_match_stderr": 0.07915823166939519
20 | },
21 | "all": {
22 | "extractive_match": 0.575,
23 | "extractive_match_stderr": 0.07915823166939519
24 | }
25 | },
26 | "versions": {
27 | "custom|amc23|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|amc23": {
31 | "name": "amc23",
32 | "prompt_function": "amc_prompt_fn",
33 | "hf_repo": "knoveleng/AMC-23",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "train"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "train"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 40,
64 | "effective_num_docs": 40,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|amc23|0": {
71 | "hashes": {
72 | "hash_examples": "57f3ead69f601b6a",
73 | "hash_full_prompts": "64c05856286ef8dc",
74 | "hash_input_tokens": "b64afe4485ef61c0",
75 | "hash_cont_tokens": "ccaa56a9b9fe16b8"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 40,
79 | "padded": 0,
80 | "non_padded": 40,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "fe1f1f4512256bec",
88 | "hash_full_prompts": "882107366cadc5ce",
89 | "hash_input_tokens": "638cd1079164f374",
90 | "hash_cont_tokens": "0acfbefb98e82705"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 40,
94 | "padded": 0,
95 | "non_padded": 40,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/evals/Exp1_200/results/quyanh/OpenRS-GRPO/results_2025-03-15T14-37-59.982413.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 2607292.420535898,
9 | "end_time": 2607991.665164641,
10 | "total_evaluation_time_secondes": "699.2446287432685",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|aime24|0": {
18 | "extractive_match": 0.3,
19 | "extractive_match_stderr": 0.0850962943396763
20 | },
21 | "all": {
22 | "extractive_match": 0.3,
23 | "extractive_match_stderr": 0.0850962943396763
24 | }
25 | },
26 | "versions": {
27 | "custom|aime24|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|aime24": {
31 | "name": "aime24",
32 | "prompt_function": "aime_prompt_fn",
33 | "hf_repo": "HuggingFaceH4/aime_2024",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "train"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "train"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 30,
64 | "effective_num_docs": 30,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|aime24|0": {
71 | "hashes": {
72 | "hash_examples": "ddec8fc79d0a014b",
73 | "hash_full_prompts": "253167becf0dfed7",
74 | "hash_input_tokens": "bf1cc75b5f12dfb8",
75 | "hash_cont_tokens": "b63238639d2fcf76"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 30,
79 | "padded": 0,
80 | "non_padded": 30,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "c903e836a519cf98",
88 | "hash_full_prompts": "84ff409b6bbf7cc0",
89 | "hash_input_tokens": "9a8c7e54ce09af84",
90 | "hash_cont_tokens": "ecb6fd6a17a1ef8e"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 30,
94 | "padded": 0,
95 | "non_padded": 30,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/evals/Exp1_300/results/quyanh/OpenRS-GRPO/results_2025-03-15T01-02-29.088363.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 2558280.776523193,
9 | "end_time": 2559060.750369516,
10 | "total_evaluation_time_secondes": "779.9738463233225",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|aime24|0": {
18 | "extractive_match": 0.26666666666666666,
19 | "extractive_match_stderr": 0.0821175682735253
20 | },
21 | "all": {
22 | "extractive_match": 0.26666666666666666,
23 | "extractive_match_stderr": 0.0821175682735253
24 | }
25 | },
26 | "versions": {
27 | "custom|aime24|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|aime24": {
31 | "name": "aime24",
32 | "prompt_function": "aime_prompt_fn",
33 | "hf_repo": "HuggingFaceH4/aime_2024",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "train"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "train"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 30,
64 | "effective_num_docs": 30,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|aime24|0": {
71 | "hashes": {
72 | "hash_examples": "ddec8fc79d0a014b",
73 | "hash_full_prompts": "253167becf0dfed7",
74 | "hash_input_tokens": "bf1cc75b5f12dfb8",
75 | "hash_cont_tokens": "839a511efa4a6078"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 30,
79 | "padded": 0,
80 | "non_padded": 30,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "c903e836a519cf98",
88 | "hash_full_prompts": "84ff409b6bbf7cc0",
89 | "hash_input_tokens": "9a8c7e54ce09af84",
90 | "hash_cont_tokens": "680d67ec0912d741"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 30,
94 | "padded": 0,
95 | "non_padded": 30,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/evals/Exp1_300/results/quyanh/OpenRS-GRPO/results_2025-03-15T01-51-01.120880.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 2559129.286908899,
9 | "end_time": 2561972.673322505,
10 | "total_evaluation_time_secondes": "2843.3864136058837",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|math_500|0": {
18 | "extractive_match": 0.756,
19 | "extractive_match_stderr": 0.01922673489361459
20 | },
21 | "all": {
22 | "extractive_match": 0.756,
23 | "extractive_match_stderr": 0.01922673489361459
24 | }
25 | },
26 | "versions": {
27 | "custom|math_500|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|math_500": {
31 | "name": "math_500",
32 | "prompt_function": "math_prompt_fn",
33 | "hf_repo": "HuggingFaceH4/MATH-500",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "test"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "test"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 500,
64 | "effective_num_docs": 500,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|math_500|0": {
71 | "hashes": {
72 | "hash_examples": "adf0cc8311011db2",
73 | "hash_full_prompts": "63c902dbdbaf1552",
74 | "hash_input_tokens": "2af397a095a31139",
75 | "hash_cont_tokens": "f124f232f03e4b3c"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 500,
79 | "padded": 0,
80 | "non_padded": 500,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "bfaad1993ff37a60",
88 | "hash_full_prompts": "3ceaaade5cf43911",
89 | "hash_input_tokens": "c663dbac8a64d3e4",
90 | "hash_cont_tokens": "2afd387b6678b989"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 500,
94 | "padded": 0,
95 | "non_padded": 500,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/evals/Exp1_300/results/quyanh/OpenRS-GRPO/results_2025-03-15T02-04-41.905474.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 2562049.988997336,
9 | "end_time": 2562793.583693815,
10 | "total_evaluation_time_secondes": "743.5946964789182",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|amc23|0": {
18 | "extractive_match": 0.575,
19 | "extractive_match_stderr": 0.07915823166939519
20 | },
21 | "all": {
22 | "extractive_match": 0.575,
23 | "extractive_match_stderr": 0.07915823166939519
24 | }
25 | },
26 | "versions": {
27 | "custom|amc23|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|amc23": {
31 | "name": "amc23",
32 | "prompt_function": "amc_prompt_fn",
33 | "hf_repo": "knoveleng/AMC-23",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "train"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "train"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 40,
64 | "effective_num_docs": 40,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|amc23|0": {
71 | "hashes": {
72 | "hash_examples": "57f3ead69f601b6a",
73 | "hash_full_prompts": "64c05856286ef8dc",
74 | "hash_input_tokens": "b64afe4485ef61c0",
75 | "hash_cont_tokens": "9dd35889080f8eca"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 40,
79 | "padded": 0,
80 | "non_padded": 40,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "fe1f1f4512256bec",
88 | "hash_full_prompts": "882107366cadc5ce",
89 | "hash_input_tokens": "638cd1079164f374",
90 | "hash_cont_tokens": "812448b1bfa8fbc7"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 40,
94 | "padded": 0,
95 | "non_padded": 40,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/evals/Exp1_400/results/quyanh/OpenRS-GRPO/results_2025-03-15T02-14-11.184777.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 2562859.065263796,
9 | "end_time": 2563362.871206422,
10 | "total_evaluation_time_secondes": "503.8059426257387",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|aime24|0": {
18 | "extractive_match": 0.3333333333333333,
19 | "extractive_match_stderr": 0.08753762190648168
20 | },
21 | "all": {
22 | "extractive_match": 0.3333333333333333,
23 | "extractive_match_stderr": 0.08753762190648168
24 | }
25 | },
26 | "versions": {
27 | "custom|aime24|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|aime24": {
31 | "name": "aime24",
32 | "prompt_function": "aime_prompt_fn",
33 | "hf_repo": "HuggingFaceH4/aime_2024",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "train"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "train"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 30,
64 | "effective_num_docs": 30,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|aime24|0": {
71 | "hashes": {
72 | "hash_examples": "ddec8fc79d0a014b",
73 | "hash_full_prompts": "253167becf0dfed7",
74 | "hash_input_tokens": "bf1cc75b5f12dfb8",
75 | "hash_cont_tokens": "57cd2595e3fb4e15"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 30,
79 | "padded": 0,
80 | "non_padded": 30,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "c903e836a519cf98",
88 | "hash_full_prompts": "84ff409b6bbf7cc0",
89 | "hash_input_tokens": "9a8c7e54ce09af84",
90 | "hash_cont_tokens": "2ea2a7e8250f8005"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 30,
94 | "padded": 0,
95 | "non_padded": 30,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/evals/Exp1_400/results/quyanh/OpenRS-GRPO/results_2025-03-15T02-36-02.705517.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 2563429.852101547,
9 | "end_time": 2564674.316530785,
10 | "total_evaluation_time_secondes": "1244.4644292378798",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|math_500|0": {
18 | "extractive_match": 0.82,
19 | "extractive_match_stderr": 0.01719859247631427
20 | },
21 | "all": {
22 | "extractive_match": 0.82,
23 | "extractive_match_stderr": 0.01719859247631427
24 | }
25 | },
26 | "versions": {
27 | "custom|math_500|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|math_500": {
31 | "name": "math_500",
32 | "prompt_function": "math_prompt_fn",
33 | "hf_repo": "HuggingFaceH4/MATH-500",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "test"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "test"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 500,
64 | "effective_num_docs": 500,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|math_500|0": {
71 | "hashes": {
72 | "hash_examples": "adf0cc8311011db2",
73 | "hash_full_prompts": "63c902dbdbaf1552",
74 | "hash_input_tokens": "2af397a095a31139",
75 | "hash_cont_tokens": "dda7db86b5891548"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 500,
79 | "padded": 0,
80 | "non_padded": 500,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "bfaad1993ff37a60",
88 | "hash_full_prompts": "3ceaaade5cf43911",
89 | "hash_input_tokens": "c663dbac8a64d3e4",
90 | "hash_cont_tokens": "2ed97aefa363786c"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 500,
94 | "padded": 0,
95 | "non_padded": 500,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/evals/Exp1_400/results/quyanh/OpenRS-GRPO/results_2025-03-15T02-44-26.970544.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 2564788.62842531,
9 | "end_time": 2565178.658914464,
10 | "total_evaluation_time_secondes": "390.03048915416",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|amc23|0": {
18 | "extractive_match": 0.675,
19 | "extractive_match_stderr": 0.07499999999999998
20 | },
21 | "all": {
22 | "extractive_match": 0.675,
23 | "extractive_match_stderr": 0.07499999999999998
24 | }
25 | },
26 | "versions": {
27 | "custom|amc23|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|amc23": {
31 | "name": "amc23",
32 | "prompt_function": "amc_prompt_fn",
33 | "hf_repo": "knoveleng/AMC-23",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "train"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "train"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 40,
64 | "effective_num_docs": 40,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|amc23|0": {
71 | "hashes": {
72 | "hash_examples": "57f3ead69f601b6a",
73 | "hash_full_prompts": "64c05856286ef8dc",
74 | "hash_input_tokens": "b64afe4485ef61c0",
75 | "hash_cont_tokens": "fdaf5a9fa19c5c25"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 40,
79 | "padded": 0,
80 | "non_padded": 40,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "fe1f1f4512256bec",
88 | "hash_full_prompts": "882107366cadc5ce",
89 | "hash_input_tokens": "638cd1079164f374",
90 | "hash_cont_tokens": "a46bfd91934aff2c"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 40,
94 | "padded": 0,
95 | "non_padded": 40,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/evals/Exp1_500/results/quyanh/OpenRS-GRPO/results_2025-03-15T02-52-04.589810.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 2565227.071577089,
9 | "end_time": 2565636.278604793,
10 | "total_evaluation_time_secondes": "409.2070277039893",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|aime24|0": {
18 | "extractive_match": 0.23333333333333334,
19 | "extractive_match_stderr": 0.07854032324531728
20 | },
21 | "all": {
22 | "extractive_match": 0.23333333333333334,
23 | "extractive_match_stderr": 0.07854032324531728
24 | }
25 | },
26 | "versions": {
27 | "custom|aime24|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|aime24": {
31 | "name": "aime24",
32 | "prompt_function": "aime_prompt_fn",
33 | "hf_repo": "HuggingFaceH4/aime_2024",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "train"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "train"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 30,
64 | "effective_num_docs": 30,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|aime24|0": {
71 | "hashes": {
72 | "hash_examples": "ddec8fc79d0a014b",
73 | "hash_full_prompts": "253167becf0dfed7",
74 | "hash_input_tokens": "bf1cc75b5f12dfb8",
75 | "hash_cont_tokens": "17884cbea1d03333"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 30,
79 | "padded": 0,
80 | "non_padded": 30,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "c903e836a519cf98",
88 | "hash_full_prompts": "84ff409b6bbf7cc0",
89 | "hash_input_tokens": "9a8c7e54ce09af84",
90 | "hash_cont_tokens": "887aa0bacc8f8844"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 30,
94 | "padded": 0,
95 | "non_padded": 30,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/evals/Exp1_500/results/quyanh/OpenRS-GRPO/results_2025-03-15T03-11-26.478898.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 2565700.44115151,
9 | "end_time": 2566798.118010773,
10 | "total_evaluation_time_secondes": "1097.676859262865",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|math_500|0": {
18 | "extractive_match": 0.798,
19 | "extractive_match_stderr": 0.017973260031288265
20 | },
21 | "all": {
22 | "extractive_match": 0.798,
23 | "extractive_match_stderr": 0.017973260031288265
24 | }
25 | },
26 | "versions": {
27 | "custom|math_500|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|math_500": {
31 | "name": "math_500",
32 | "prompt_function": "math_prompt_fn",
33 | "hf_repo": "HuggingFaceH4/MATH-500",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "test"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "test"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 500,
64 | "effective_num_docs": 500,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|math_500|0": {
71 | "hashes": {
72 | "hash_examples": "adf0cc8311011db2",
73 | "hash_full_prompts": "63c902dbdbaf1552",
74 | "hash_input_tokens": "2af397a095a31139",
75 | "hash_cont_tokens": "6b327eeca1fea757"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 500,
79 | "padded": 0,
80 | "non_padded": 500,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "bfaad1993ff37a60",
88 | "hash_full_prompts": "3ceaaade5cf43911",
89 | "hash_input_tokens": "c663dbac8a64d3e4",
90 | "hash_cont_tokens": "6dbad7c9862d4052"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 500,
94 | "padded": 0,
95 | "non_padded": 500,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/evals/Exp1_500/results/quyanh/OpenRS-GRPO/results_2025-03-15T03-18-43.114223.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 2566869.350252712,
9 | "end_time": 2567234.803742991,
10 | "total_evaluation_time_secondes": "365.4534902786836",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|amc23|0": {
18 | "extractive_match": 0.675,
19 | "extractive_match_stderr": 0.07499999999999998
20 | },
21 | "all": {
22 | "extractive_match": 0.675,
23 | "extractive_match_stderr": 0.07499999999999998
24 | }
25 | },
26 | "versions": {
27 | "custom|amc23|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|amc23": {
31 | "name": "amc23",
32 | "prompt_function": "amc_prompt_fn",
33 | "hf_repo": "knoveleng/AMC-23",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "train"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "train"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 40,
64 | "effective_num_docs": 40,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|amc23|0": {
71 | "hashes": {
72 | "hash_examples": "57f3ead69f601b6a",
73 | "hash_full_prompts": "64c05856286ef8dc",
74 | "hash_input_tokens": "b64afe4485ef61c0",
75 | "hash_cont_tokens": "aeca4b45e767c732"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 40,
79 | "padded": 0,
80 | "non_padded": 40,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "fe1f1f4512256bec",
88 | "hash_full_prompts": "882107366cadc5ce",
89 | "hash_input_tokens": "638cd1079164f374",
90 | "hash_cont_tokens": "e916b987c51016d8"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 40,
94 | "padded": 0,
95 | "non_padded": 40,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/evals/Exp2_100/results/quyanh/OpenRS-GRPO/results_2025-03-15T04-13-19.506547.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 2569877.59925465,
9 | "end_time": 2570511.193301663,
10 | "total_evaluation_time_secondes": "633.5940470127389",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|aime24|0": {
18 | "extractive_match": 0.3333333333333333,
19 | "extractive_match_stderr": 0.08753762190648169
20 | },
21 | "all": {
22 | "extractive_match": 0.3333333333333333,
23 | "extractive_match_stderr": 0.08753762190648169
24 | }
25 | },
26 | "versions": {
27 | "custom|aime24|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|aime24": {
31 | "name": "aime24",
32 | "prompt_function": "aime_prompt_fn",
33 | "hf_repo": "HuggingFaceH4/aime_2024",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "train"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "train"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 30,
64 | "effective_num_docs": 30,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|aime24|0": {
71 | "hashes": {
72 | "hash_examples": "ddec8fc79d0a014b",
73 | "hash_full_prompts": "253167becf0dfed7",
74 | "hash_input_tokens": "bf1cc75b5f12dfb8",
75 | "hash_cont_tokens": "9d95073fa199fea1"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 30,
79 | "padded": 0,
80 | "non_padded": 30,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "c903e836a519cf98",
88 | "hash_full_prompts": "84ff409b6bbf7cc0",
89 | "hash_input_tokens": "9a8c7e54ce09af84",
90 | "hash_cont_tokens": "9581b1a436f91769"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 30,
94 | "padded": 0,
95 | "non_padded": 30,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/evals/Exp2_100/results/quyanh/OpenRS-GRPO/results_2025-03-15T04-35-14.751095.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 2570571.236023537,
9 | "end_time": 2571826.342473873,
10 | "total_evaluation_time_secondes": "1255.106450336054",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|math_500|0": {
18 | "extractive_match": 0.856,
19 | "extractive_match_stderr": 0.015716934945725767
20 | },
21 | "all": {
22 | "extractive_match": 0.856,
23 | "extractive_match_stderr": 0.015716934945725767
24 | }
25 | },
26 | "versions": {
27 | "custom|math_500|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|math_500": {
31 | "name": "math_500",
32 | "prompt_function": "math_prompt_fn",
33 | "hf_repo": "HuggingFaceH4/MATH-500",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "test"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "test"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 500,
64 | "effective_num_docs": 500,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|math_500|0": {
71 | "hashes": {
72 | "hash_examples": "adf0cc8311011db2",
73 | "hash_full_prompts": "63c902dbdbaf1552",
74 | "hash_input_tokens": "2af397a095a31139",
75 | "hash_cont_tokens": "ce035e0da82bf62c"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 500,
79 | "padded": 0,
80 | "non_padded": 500,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "bfaad1993ff37a60",
88 | "hash_full_prompts": "3ceaaade5cf43911",
89 | "hash_input_tokens": "c663dbac8a64d3e4",
90 | "hash_cont_tokens": "185044a2d01d14ca"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 500,
94 | "padded": 0,
95 | "non_padded": 500,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/evals/Exp2_100/results/quyanh/OpenRS-GRPO/results_2025-03-15T04-44-39.651229.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 2571904.690897501,
9 | "end_time": 2572391.342055246,
10 | "total_evaluation_time_secondes": "486.6511577446945",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|amc23|0": {
18 | "extractive_match": 0.7,
19 | "extractive_match_stderr": 0.07337993857053426
20 | },
21 | "all": {
22 | "extractive_match": 0.7,
23 | "extractive_match_stderr": 0.07337993857053426
24 | }
25 | },
26 | "versions": {
27 | "custom|amc23|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|amc23": {
31 | "name": "amc23",
32 | "prompt_function": "amc_prompt_fn",
33 | "hf_repo": "knoveleng/AMC-23",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "train"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "train"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 40,
64 | "effective_num_docs": 40,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|amc23|0": {
71 | "hashes": {
72 | "hash_examples": "57f3ead69f601b6a",
73 | "hash_full_prompts": "64c05856286ef8dc",
74 | "hash_input_tokens": "b64afe4485ef61c0",
75 | "hash_cont_tokens": "806ab92dedff279f"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 40,
79 | "padded": 0,
80 | "non_padded": 40,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "fe1f1f4512256bec",
88 | "hash_full_prompts": "882107366cadc5ce",
89 | "hash_input_tokens": "638cd1079164f374",
90 | "hash_cont_tokens": "a0e7c914c07ced33"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 40,
94 | "padded": 0,
95 | "non_padded": 40,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/evals/Exp2_150/results/quyanh/OpenRS-GRPO/results_2025-03-15T04-54-03.406254.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 2572478.041585189,
9 | "end_time": 2572955.095596673,
10 | "total_evaluation_time_secondes": "477.0540114841424",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|aime24|0": {
18 | "extractive_match": 0.36666666666666664,
19 | "extractive_match_stderr": 0.0894855453983996
20 | },
21 | "all": {
22 | "extractive_match": 0.36666666666666664,
23 | "extractive_match_stderr": 0.0894855453983996
24 | }
25 | },
26 | "versions": {
27 | "custom|aime24|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|aime24": {
31 | "name": "aime24",
32 | "prompt_function": "aime_prompt_fn",
33 | "hf_repo": "HuggingFaceH4/aime_2024",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "train"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "train"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 30,
64 | "effective_num_docs": 30,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|aime24|0": {
71 | "hashes": {
72 | "hash_examples": "ddec8fc79d0a014b",
73 | "hash_full_prompts": "253167becf0dfed7",
74 | "hash_input_tokens": "bf1cc75b5f12dfb8",
75 | "hash_cont_tokens": "39278bd920db3487"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 30,
79 | "padded": 0,
80 | "non_padded": 30,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "c903e836a519cf98",
88 | "hash_full_prompts": "84ff409b6bbf7cc0",
89 | "hash_input_tokens": "9a8c7e54ce09af84",
90 | "hash_cont_tokens": "b9f7ed706698f186"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 30,
94 | "padded": 0,
95 | "non_padded": 30,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/evals/Exp2_150/results/quyanh/OpenRS-GRPO/results_2025-03-15T05-13-57.844846.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 2573032.477990599,
9 | "end_time": 2574149.485158068,
10 | "total_evaluation_time_secondes": "1117.0071674692445",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|math_500|0": {
18 | "extractive_match": 0.854,
19 | "extractive_match_stderr": 0.01580720517583485
20 | },
21 | "all": {
22 | "extractive_match": 0.854,
23 | "extractive_match_stderr": 0.01580720517583485
24 | }
25 | },
26 | "versions": {
27 | "custom|math_500|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|math_500": {
31 | "name": "math_500",
32 | "prompt_function": "math_prompt_fn",
33 | "hf_repo": "HuggingFaceH4/MATH-500",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "test"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "test"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 500,
64 | "effective_num_docs": 500,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|math_500|0": {
71 | "hashes": {
72 | "hash_examples": "adf0cc8311011db2",
73 | "hash_full_prompts": "63c902dbdbaf1552",
74 | "hash_input_tokens": "2af397a095a31139",
75 | "hash_cont_tokens": "0e843829e85cfe94"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 500,
79 | "padded": 0,
80 | "non_padded": 500,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "bfaad1993ff37a60",
88 | "hash_full_prompts": "3ceaaade5cf43911",
89 | "hash_input_tokens": "c663dbac8a64d3e4",
90 | "hash_cont_tokens": "db7b20002a364074"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 500,
94 | "padded": 0,
95 | "non_padded": 500,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/evals/Exp2_150/results/quyanh/OpenRS-GRPO/results_2025-03-15T05-20-07.618719.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 2574199.530597124,
9 | "end_time": 2574519.310388266,
10 | "total_evaluation_time_secondes": "319.77979114232585",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|amc23|0": {
18 | "extractive_match": 0.75,
19 | "extractive_match_stderr": 0.06933752452815363
20 | },
21 | "all": {
22 | "extractive_match": 0.75,
23 | "extractive_match_stderr": 0.06933752452815363
24 | }
25 | },
26 | "versions": {
27 | "custom|amc23|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|amc23": {
31 | "name": "amc23",
32 | "prompt_function": "amc_prompt_fn",
33 | "hf_repo": "knoveleng/AMC-23",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "train"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "train"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 40,
64 | "effective_num_docs": 40,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|amc23|0": {
71 | "hashes": {
72 | "hash_examples": "57f3ead69f601b6a",
73 | "hash_full_prompts": "64c05856286ef8dc",
74 | "hash_input_tokens": "b64afe4485ef61c0",
75 | "hash_cont_tokens": "6a096939eeb8097d"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 40,
79 | "padded": 0,
80 | "non_padded": 40,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "fe1f1f4512256bec",
88 | "hash_full_prompts": "882107366cadc5ce",
89 | "hash_input_tokens": "638cd1079164f374",
90 | "hash_cont_tokens": "bdec73a60303c3fc"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 40,
94 | "padded": 0,
95 | "non_padded": 40,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/evals/Exp2_200/results/quyanh/OpenRS-GRPO/results_2025-03-15T05-31-02.871828.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 2574556.171576032,
9 | "end_time": 2575174.544545133,
10 | "total_evaluation_time_secondes": "618.3729691011831",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|aime24|0": {
18 | "extractive_match": 0.2,
19 | "extractive_match_stderr": 0.07427813527082075
20 | },
21 | "all": {
22 | "extractive_match": 0.2,
23 | "extractive_match_stderr": 0.07427813527082075
24 | }
25 | },
26 | "versions": {
27 | "custom|aime24|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|aime24": {
31 | "name": "aime24",
32 | "prompt_function": "aime_prompt_fn",
33 | "hf_repo": "HuggingFaceH4/aime_2024",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "train"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "train"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 30,
64 | "effective_num_docs": 30,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|aime24|0": {
71 | "hashes": {
72 | "hash_examples": "ddec8fc79d0a014b",
73 | "hash_full_prompts": "253167becf0dfed7",
74 | "hash_input_tokens": "bf1cc75b5f12dfb8",
75 | "hash_cont_tokens": "6077ed83d15d6889"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 30,
79 | "padded": 0,
80 | "non_padded": 30,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "c903e836a519cf98",
88 | "hash_full_prompts": "84ff409b6bbf7cc0",
89 | "hash_input_tokens": "9a8c7e54ce09af84",
90 | "hash_cont_tokens": "c1a9ed1d84a0d925"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 30,
94 | "padded": 0,
95 | "non_padded": 30,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/evals/Exp2_200/results/quyanh/OpenRS-GRPO/results_2025-03-15T05-51-04.119358.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 2575244.022829783,
9 | "end_time": 2576375.754693394,
10 | "total_evaluation_time_secondes": "1131.7318636109121",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|math_500|0": {
18 | "extractive_match": 0.8,
19 | "extractive_match_stderr": 0.017906459241433855
20 | },
21 | "all": {
22 | "extractive_match": 0.8,
23 | "extractive_match_stderr": 0.017906459241433855
24 | }
25 | },
26 | "versions": {
27 | "custom|math_500|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|math_500": {
31 | "name": "math_500",
32 | "prompt_function": "math_prompt_fn",
33 | "hf_repo": "HuggingFaceH4/MATH-500",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "test"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "test"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 500,
64 | "effective_num_docs": 500,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|math_500|0": {
71 | "hashes": {
72 | "hash_examples": "adf0cc8311011db2",
73 | "hash_full_prompts": "63c902dbdbaf1552",
74 | "hash_input_tokens": "2af397a095a31139",
75 | "hash_cont_tokens": "758890b1dfee93f1"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 500,
79 | "padded": 0,
80 | "non_padded": 500,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "bfaad1993ff37a60",
88 | "hash_full_prompts": "3ceaaade5cf43911",
89 | "hash_input_tokens": "c663dbac8a64d3e4",
90 | "hash_cont_tokens": "6da72493ad7e20fa"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 500,
94 | "padded": 0,
95 | "non_padded": 500,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/evals/Exp2_200/results/quyanh/OpenRS-GRPO/results_2025-03-15T06-00-23.069698.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 2576450.971500249,
9 | "end_time": 2576934.750199688,
10 | "total_evaluation_time_secondes": "483.77869943901896",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|amc23|0": {
18 | "extractive_match": 0.575,
19 | "extractive_match_stderr": 0.07915823166939519
20 | },
21 | "all": {
22 | "extractive_match": 0.575,
23 | "extractive_match_stderr": 0.07915823166939519
24 | }
25 | },
26 | "versions": {
27 | "custom|amc23|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|amc23": {
31 | "name": "amc23",
32 | "prompt_function": "amc_prompt_fn",
33 | "hf_repo": "knoveleng/AMC-23",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "train"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "train"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 40,
64 | "effective_num_docs": 40,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|amc23|0": {
71 | "hashes": {
72 | "hash_examples": "57f3ead69f601b6a",
73 | "hash_full_prompts": "64c05856286ef8dc",
74 | "hash_input_tokens": "b64afe4485ef61c0",
75 | "hash_cont_tokens": "56e8f52dea2b9ec6"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 40,
79 | "padded": 0,
80 | "non_padded": 40,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "fe1f1f4512256bec",
88 | "hash_full_prompts": "882107366cadc5ce",
89 | "hash_input_tokens": "638cd1079164f374",
90 | "hash_cont_tokens": "6f3eceda2cec8b33"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 40,
94 | "padded": 0,
95 | "non_padded": 40,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/evals/Exp2_250/results/quyanh/OpenRS-GRPO/results_2025-03-15T06-09-49.021006.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 2576972.829442079,
9 | "end_time": 2577500.710391771,
10 | "total_evaluation_time_secondes": "527.8809496918693",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|aime24|0": {
18 | "extractive_match": 0.3,
19 | "extractive_match_stderr": 0.0850962943396763
20 | },
21 | "all": {
22 | "extractive_match": 0.3,
23 | "extractive_match_stderr": 0.0850962943396763
24 | }
25 | },
26 | "versions": {
27 | "custom|aime24|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|aime24": {
31 | "name": "aime24",
32 | "prompt_function": "aime_prompt_fn",
33 | "hf_repo": "HuggingFaceH4/aime_2024",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "train"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "train"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 30,
64 | "effective_num_docs": 30,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|aime24|0": {
71 | "hashes": {
72 | "hash_examples": "ddec8fc79d0a014b",
73 | "hash_full_prompts": "253167becf0dfed7",
74 | "hash_input_tokens": "bf1cc75b5f12dfb8",
75 | "hash_cont_tokens": "9dea059f367c048a"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 30,
79 | "padded": 0,
80 | "non_padded": 30,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "c903e836a519cf98",
88 | "hash_full_prompts": "84ff409b6bbf7cc0",
89 | "hash_input_tokens": "9a8c7e54ce09af84",
90 | "hash_cont_tokens": "e716ced2d7d9fa84"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 30,
94 | "padded": 0,
95 | "non_padded": 30,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/evals/Exp2_250/results/quyanh/OpenRS-GRPO/results_2025-03-15T06-28-55.702973.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 2577587.191756647,
9 | "end_time": 2578647.345242229,
10 | "total_evaluation_time_secondes": "1060.1534855817445",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|math_500|0": {
18 | "extractive_match": 0.826,
19 | "extractive_match_stderr": 0.016971271257516147
20 | },
21 | "all": {
22 | "extractive_match": 0.826,
23 | "extractive_match_stderr": 0.016971271257516147
24 | }
25 | },
26 | "versions": {
27 | "custom|math_500|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|math_500": {
31 | "name": "math_500",
32 | "prompt_function": "math_prompt_fn",
33 | "hf_repo": "HuggingFaceH4/MATH-500",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "test"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "test"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 500,
64 | "effective_num_docs": 500,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|math_500|0": {
71 | "hashes": {
72 | "hash_examples": "adf0cc8311011db2",
73 | "hash_full_prompts": "63c902dbdbaf1552",
74 | "hash_input_tokens": "2af397a095a31139",
75 | "hash_cont_tokens": "7b5df8b0938240b1"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 500,
79 | "padded": 0,
80 | "non_padded": 500,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "bfaad1993ff37a60",
88 | "hash_full_prompts": "3ceaaade5cf43911",
89 | "hash_input_tokens": "c663dbac8a64d3e4",
90 | "hash_cont_tokens": "e53039268588162c"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 500,
94 | "padded": 0,
95 | "non_padded": 500,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/evals/Exp2_250/results/quyanh/OpenRS-GRPO/results_2025-03-15T06-34-45.013493.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 2578699.013531457,
9 | "end_time": 2578996.706382657,
10 | "total_evaluation_time_secondes": "297.6928511997685",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|amc23|0": {
18 | "extractive_match": 0.65,
19 | "extractive_match_stderr": 0.07637626158259733
20 | },
21 | "all": {
22 | "extractive_match": 0.65,
23 | "extractive_match_stderr": 0.07637626158259733
24 | }
25 | },
26 | "versions": {
27 | "custom|amc23|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|amc23": {
31 | "name": "amc23",
32 | "prompt_function": "amc_prompt_fn",
33 | "hf_repo": "knoveleng/AMC-23",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "train"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "train"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 40,
64 | "effective_num_docs": 40,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|amc23|0": {
71 | "hashes": {
72 | "hash_examples": "57f3ead69f601b6a",
73 | "hash_full_prompts": "64c05856286ef8dc",
74 | "hash_input_tokens": "b64afe4485ef61c0",
75 | "hash_cont_tokens": "0a77fd8b65d5693c"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 40,
79 | "padded": 0,
80 | "non_padded": 40,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "fe1f1f4512256bec",
88 | "hash_full_prompts": "882107366cadc5ce",
89 | "hash_input_tokens": "638cd1079164f374",
90 | "hash_cont_tokens": "a169c269d2eaaac4"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 40,
94 | "padded": 0,
95 | "non_padded": 40,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/evals/Exp2_300/results/quyanh/OpenRS-GRPO/results_2025-03-15T06-42-52.541003.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 2579092.107139461,
9 | "end_time": 2579484.220244874,
10 | "total_evaluation_time_secondes": "392.1131054125726",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|aime24|0": {
18 | "extractive_match": 0.2,
19 | "extractive_match_stderr": 0.07427813527082076
20 | },
21 | "all": {
22 | "extractive_match": 0.2,
23 | "extractive_match_stderr": 0.07427813527082076
24 | }
25 | },
26 | "versions": {
27 | "custom|aime24|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|aime24": {
31 | "name": "aime24",
32 | "prompt_function": "aime_prompt_fn",
33 | "hf_repo": "HuggingFaceH4/aime_2024",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "train"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "train"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 30,
64 | "effective_num_docs": 30,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|aime24|0": {
71 | "hashes": {
72 | "hash_examples": "ddec8fc79d0a014b",
73 | "hash_full_prompts": "253167becf0dfed7",
74 | "hash_input_tokens": "bf1cc75b5f12dfb8",
75 | "hash_cont_tokens": "8a299fcdd85d83a4"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 30,
79 | "padded": 0,
80 | "non_padded": 30,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "c903e836a519cf98",
88 | "hash_full_prompts": "84ff409b6bbf7cc0",
89 | "hash_input_tokens": "9a8c7e54ce09af84",
90 | "hash_cont_tokens": "d5533616a87b80ed"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 30,
94 | "padded": 0,
95 | "non_padded": 30,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/evals/Exp2_300/results/quyanh/OpenRS-GRPO/results_2025-03-15T06-56-52.023106.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 2579564.124064897,
9 | "end_time": 2580323.670382602,
10 | "total_evaluation_time_secondes": "759.5463177049533",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|math_500|0": {
18 | "extractive_match": 0.824,
19 | "extractive_match_stderr": 0.017047852020622263
20 | },
21 | "all": {
22 | "extractive_match": 0.824,
23 | "extractive_match_stderr": 0.017047852020622263
24 | }
25 | },
26 | "versions": {
27 | "custom|math_500|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|math_500": {
31 | "name": "math_500",
32 | "prompt_function": "math_prompt_fn",
33 | "hf_repo": "HuggingFaceH4/MATH-500",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "test"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "test"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 500,
64 | "effective_num_docs": 500,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|math_500|0": {
71 | "hashes": {
72 | "hash_examples": "adf0cc8311011db2",
73 | "hash_full_prompts": "63c902dbdbaf1552",
74 | "hash_input_tokens": "2af397a095a31139",
75 | "hash_cont_tokens": "b2d7391eac1f81f2"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 500,
79 | "padded": 0,
80 | "non_padded": 500,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "bfaad1993ff37a60",
88 | "hash_full_prompts": "3ceaaade5cf43911",
89 | "hash_input_tokens": "c663dbac8a64d3e4",
90 | "hash_cont_tokens": "8e0031f22a0aa723"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 500,
94 | "padded": 0,
95 | "non_padded": 500,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/evals/Exp2_300/results/quyanh/OpenRS-GRPO/results_2025-03-15T07-04-05.239254.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 2580424.328730572,
9 | "end_time": 2580756.931524183,
10 | "total_evaluation_time_secondes": "332.60279361112043",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|amc23|0": {
18 | "extractive_match": 0.675,
19 | "extractive_match_stderr": 0.07499999999999998
20 | },
21 | "all": {
22 | "extractive_match": 0.675,
23 | "extractive_match_stderr": 0.07499999999999998
24 | }
25 | },
26 | "versions": {
27 | "custom|amc23|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|amc23": {
31 | "name": "amc23",
32 | "prompt_function": "amc_prompt_fn",
33 | "hf_repo": "knoveleng/AMC-23",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "train"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "train"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 40,
64 | "effective_num_docs": 40,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|amc23|0": {
71 | "hashes": {
72 | "hash_examples": "57f3ead69f601b6a",
73 | "hash_full_prompts": "64c05856286ef8dc",
74 | "hash_input_tokens": "b64afe4485ef61c0",
75 | "hash_cont_tokens": "b38129b60c769bd6"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 40,
79 | "padded": 0,
80 | "non_padded": 40,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "fe1f1f4512256bec",
88 | "hash_full_prompts": "882107366cadc5ce",
89 | "hash_input_tokens": "638cd1079164f374",
90 | "hash_cont_tokens": "4a03618c47084d90"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 40,
94 | "padded": 0,
95 | "non_padded": 40,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/evals/Exp2_350/results/quyanh/OpenRS-GRPO/results_2025-03-15T07-10-27.028672.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 2580812.042311833,
9 | "end_time": 2581138.714750407,
10 | "total_evaluation_time_secondes": "326.67243857402354",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|aime24|0": {
18 | "extractive_match": 0.3333333333333333,
19 | "extractive_match_stderr": 0.0875376219064817
20 | },
21 | "all": {
22 | "extractive_match": 0.3333333333333333,
23 | "extractive_match_stderr": 0.0875376219064817
24 | }
25 | },
26 | "versions": {
27 | "custom|aime24|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|aime24": {
31 | "name": "aime24",
32 | "prompt_function": "aime_prompt_fn",
33 | "hf_repo": "HuggingFaceH4/aime_2024",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "train"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "train"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 30,
64 | "effective_num_docs": 30,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|aime24|0": {
71 | "hashes": {
72 | "hash_examples": "ddec8fc79d0a014b",
73 | "hash_full_prompts": "253167becf0dfed7",
74 | "hash_input_tokens": "bf1cc75b5f12dfb8",
75 | "hash_cont_tokens": "314d40cde0446e00"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 30,
79 | "padded": 0,
80 | "non_padded": 30,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "c903e836a519cf98",
88 | "hash_full_prompts": "84ff409b6bbf7cc0",
89 | "hash_input_tokens": "9a8c7e54ce09af84",
90 | "hash_cont_tokens": "83c2fe7147a45ed0"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 30,
94 | "padded": 0,
95 | "non_padded": 30,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/evals/Exp2_350/results/quyanh/OpenRS-GRPO/results_2025-03-15T07-24-32.807123.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 2581209.555826082,
9 | "end_time": 2581984.464070361,
10 | "total_evaluation_time_secondes": "774.9082442792132",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|math_500|0": {
18 | "extractive_match": 0.814,
19 | "extractive_match_stderr": 0.01741880678058394
20 | },
21 | "all": {
22 | "extractive_match": 0.814,
23 | "extractive_match_stderr": 0.01741880678058394
24 | }
25 | },
26 | "versions": {
27 | "custom|math_500|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|math_500": {
31 | "name": "math_500",
32 | "prompt_function": "math_prompt_fn",
33 | "hf_repo": "HuggingFaceH4/MATH-500",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "test"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "test"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 500,
64 | "effective_num_docs": 500,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|math_500|0": {
71 | "hashes": {
72 | "hash_examples": "adf0cc8311011db2",
73 | "hash_full_prompts": "63c902dbdbaf1552",
74 | "hash_input_tokens": "2af397a095a31139",
75 | "hash_cont_tokens": "a050975778a6be25"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 500,
79 | "padded": 0,
80 | "non_padded": 500,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "bfaad1993ff37a60",
88 | "hash_full_prompts": "3ceaaade5cf43911",
89 | "hash_input_tokens": "c663dbac8a64d3e4",
90 | "hash_cont_tokens": "20a4136b5748e4f1"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 500,
94 | "padded": 0,
95 | "non_padded": 500,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/evals/Exp2_350/results/quyanh/OpenRS-GRPO/results_2025-03-15T07-29-47.478300.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 2582057.657695042,
9 | "end_time": 2582299.170243019,
10 | "total_evaluation_time_secondes": "241.51254797680303",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|amc23|0": {
18 | "extractive_match": 0.625,
19 | "extractive_match_stderr": 0.07752170911825529
20 | },
21 | "all": {
22 | "extractive_match": 0.625,
23 | "extractive_match_stderr": 0.07752170911825529
24 | }
25 | },
26 | "versions": {
27 | "custom|amc23|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|amc23": {
31 | "name": "amc23",
32 | "prompt_function": "amc_prompt_fn",
33 | "hf_repo": "knoveleng/AMC-23",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "train"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "train"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 40,
64 | "effective_num_docs": 40,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|amc23|0": {
71 | "hashes": {
72 | "hash_examples": "57f3ead69f601b6a",
73 | "hash_full_prompts": "64c05856286ef8dc",
74 | "hash_input_tokens": "b64afe4485ef61c0",
75 | "hash_cont_tokens": "02bf8270f672d019"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 40,
79 | "padded": 0,
80 | "non_padded": 40,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "fe1f1f4512256bec",
88 | "hash_full_prompts": "882107366cadc5ce",
89 | "hash_input_tokens": "638cd1079164f374",
90 | "hash_cont_tokens": "9bcd38a007e3c1f7"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 40,
94 | "padded": 0,
95 | "non_padded": 40,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/evals/Exp2_400/results/quyanh/OpenRS-GRPO/results_2025-03-15T07-36-30.994685.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 2582347.378834666,
9 | "end_time": 2582702.685794419,
10 | "total_evaluation_time_secondes": "355.30695975292474",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|aime24|0": {
18 | "extractive_match": 0.26666666666666666,
19 | "extractive_match_stderr": 0.0821175682735253
20 | },
21 | "all": {
22 | "extractive_match": 0.26666666666666666,
23 | "extractive_match_stderr": 0.0821175682735253
24 | }
25 | },
26 | "versions": {
27 | "custom|aime24|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|aime24": {
31 | "name": "aime24",
32 | "prompt_function": "aime_prompt_fn",
33 | "hf_repo": "HuggingFaceH4/aime_2024",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "train"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "train"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 30,
64 | "effective_num_docs": 30,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|aime24|0": {
71 | "hashes": {
72 | "hash_examples": "ddec8fc79d0a014b",
73 | "hash_full_prompts": "253167becf0dfed7",
74 | "hash_input_tokens": "bf1cc75b5f12dfb8",
75 | "hash_cont_tokens": "77b972bf9c1cfb26"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 30,
79 | "padded": 0,
80 | "non_padded": 30,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "c903e836a519cf98",
88 | "hash_full_prompts": "84ff409b6bbf7cc0",
89 | "hash_input_tokens": "9a8c7e54ce09af84",
90 | "hash_cont_tokens": "585ab41a6bd09268"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 30,
94 | "padded": 0,
95 | "non_padded": 30,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/evals/Exp2_400/results/quyanh/OpenRS-GRPO/results_2025-03-15T07-49-32.062451.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 2582781.263596611,
9 | "end_time": 2583483.690574673,
10 | "total_evaluation_time_secondes": "702.426978061907",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|math_500|0": {
18 | "extractive_match": 0.822,
19 | "extractive_match_stderr": 0.017123622189062257
20 | },
21 | "all": {
22 | "extractive_match": 0.822,
23 | "extractive_match_stderr": 0.017123622189062257
24 | }
25 | },
26 | "versions": {
27 | "custom|math_500|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|math_500": {
31 | "name": "math_500",
32 | "prompt_function": "math_prompt_fn",
33 | "hf_repo": "HuggingFaceH4/MATH-500",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "test"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "test"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 500,
64 | "effective_num_docs": 500,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|math_500|0": {
71 | "hashes": {
72 | "hash_examples": "adf0cc8311011db2",
73 | "hash_full_prompts": "63c902dbdbaf1552",
74 | "hash_input_tokens": "2af397a095a31139",
75 | "hash_cont_tokens": "36524c3d10393f7f"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 500,
79 | "padded": 0,
80 | "non_padded": 500,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "bfaad1993ff37a60",
88 | "hash_full_prompts": "3ceaaade5cf43911",
89 | "hash_input_tokens": "c663dbac8a64d3e4",
90 | "hash_cont_tokens": "e4920254817e7bfc"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 500,
94 | "padded": 0,
95 | "non_padded": 500,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/evals/Exp2_400/results/quyanh/OpenRS-GRPO/results_2025-03-15T07-55-51.416032.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 2583567.839449735,
9 | "end_time": 2583863.106798447,
10 | "total_evaluation_time_secondes": "295.2673487123102",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|amc23|0": {
18 | "extractive_match": 0.7,
19 | "extractive_match_stderr": 0.07337993857053426
20 | },
21 | "all": {
22 | "extractive_match": 0.7,
23 | "extractive_match_stderr": 0.07337993857053426
24 | }
25 | },
26 | "versions": {
27 | "custom|amc23|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|amc23": {
31 | "name": "amc23",
32 | "prompt_function": "amc_prompt_fn",
33 | "hf_repo": "knoveleng/AMC-23",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "train"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "train"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 40,
64 | "effective_num_docs": 40,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|amc23|0": {
71 | "hashes": {
72 | "hash_examples": "57f3ead69f601b6a",
73 | "hash_full_prompts": "64c05856286ef8dc",
74 | "hash_input_tokens": "b64afe4485ef61c0",
75 | "hash_cont_tokens": "f52cd39075e5792e"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 40,
79 | "padded": 0,
80 | "non_padded": 40,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "fe1f1f4512256bec",
88 | "hash_full_prompts": "882107366cadc5ce",
89 | "hash_input_tokens": "638cd1079164f374",
90 | "hash_cont_tokens": "17e169df3b2c15f2"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 40,
94 | "padded": 0,
95 | "non_padded": 40,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/evals/Exp2_50/results/quyanh/OpenRS-GRPO/results_2025-03-15T03-30-59.655917.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 2567298.375852412,
9 | "end_time": 2567971.3367268,
10 | "total_evaluation_time_secondes": "672.9608743879944",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|aime24|0": {
18 | "extractive_match": 0.3,
19 | "extractive_match_stderr": 0.0850962943396763
20 | },
21 | "all": {
22 | "extractive_match": 0.3,
23 | "extractive_match_stderr": 0.0850962943396763
24 | }
25 | },
26 | "versions": {
27 | "custom|aime24|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|aime24": {
31 | "name": "aime24",
32 | "prompt_function": "aime_prompt_fn",
33 | "hf_repo": "HuggingFaceH4/aime_2024",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "train"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "train"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 30,
64 | "effective_num_docs": 30,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|aime24|0": {
71 | "hashes": {
72 | "hash_examples": "ddec8fc79d0a014b",
73 | "hash_full_prompts": "253167becf0dfed7",
74 | "hash_input_tokens": "bf1cc75b5f12dfb8",
75 | "hash_cont_tokens": "76473421bbe93410"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 30,
79 | "padded": 0,
80 | "non_padded": 30,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "c903e836a519cf98",
88 | "hash_full_prompts": "84ff409b6bbf7cc0",
89 | "hash_input_tokens": "9a8c7e54ce09af84",
90 | "hash_cont_tokens": "a2125e595bab4673"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 30,
94 | "padded": 0,
95 | "non_padded": 30,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/evals/Exp2_50/results/quyanh/OpenRS-GRPO/results_2025-03-15T03-53-53.011460.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 2568037.999388426,
9 | "end_time": 2569344.647350468,
10 | "total_evaluation_time_secondes": "1306.6479620421305",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|math_500|0": {
18 | "extractive_match": 0.854,
19 | "extractive_match_stderr": 0.01580720517583485
20 | },
21 | "all": {
22 | "extractive_match": 0.854,
23 | "extractive_match_stderr": 0.01580720517583485
24 | }
25 | },
26 | "versions": {
27 | "custom|math_500|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|math_500": {
31 | "name": "math_500",
32 | "prompt_function": "math_prompt_fn",
33 | "hf_repo": "HuggingFaceH4/MATH-500",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "test"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "test"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 500,
64 | "effective_num_docs": 500,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|math_500|0": {
71 | "hashes": {
72 | "hash_examples": "adf0cc8311011db2",
73 | "hash_full_prompts": "63c902dbdbaf1552",
74 | "hash_input_tokens": "2af397a095a31139",
75 | "hash_cont_tokens": "961808827104c739"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 500,
79 | "padded": 0,
80 | "non_padded": 500,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "bfaad1993ff37a60",
88 | "hash_full_prompts": "3ceaaade5cf43911",
89 | "hash_input_tokens": "c663dbac8a64d3e4",
90 | "hash_cont_tokens": "0a4348ce590f191d"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 500,
94 | "padded": 0,
95 | "non_padded": 500,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/evals/Exp2_50/results/quyanh/OpenRS-GRPO/results_2025-03-15T04-01-54.627135.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 2569429.209718564,
9 | "end_time": 2569826.319747058,
10 | "total_evaluation_time_secondes": "397.11002849414945",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|amc23|0": {
18 | "extractive_match": 0.8,
19 | "extractive_match_stderr": 0.06405126152203487
20 | },
21 | "all": {
22 | "extractive_match": 0.8,
23 | "extractive_match_stderr": 0.06405126152203487
24 | }
25 | },
26 | "versions": {
27 | "custom|amc23|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|amc23": {
31 | "name": "amc23",
32 | "prompt_function": "amc_prompt_fn",
33 | "hf_repo": "knoveleng/AMC-23",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "train"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "train"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 40,
64 | "effective_num_docs": 40,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|amc23|0": {
71 | "hashes": {
72 | "hash_examples": "57f3ead69f601b6a",
73 | "hash_full_prompts": "64c05856286ef8dc",
74 | "hash_input_tokens": "b64afe4485ef61c0",
75 | "hash_cont_tokens": "d1d854bc50c70679"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 40,
79 | "padded": 0,
80 | "non_padded": 40,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "fe1f1f4512256bec",
88 | "hash_full_prompts": "882107366cadc5ce",
89 | "hash_input_tokens": "638cd1079164f374",
90 | "hash_cont_tokens": "198d7b3d60a17a97"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 40,
94 | "padded": 0,
95 | "non_padded": 40,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/evals/Exp3_100/results/quyanh/OpenRS-GRPO/results_2025-03-15T08-52-22.004802.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 2586637.696856398,
9 | "end_time": 2587253.686011266,
10 | "total_evaluation_time_secondes": "615.9891548678279",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|aime24|0": {
18 | "extractive_match": 0.3,
19 | "extractive_match_stderr": 0.08509629433967632
20 | },
21 | "all": {
22 | "extractive_match": 0.3,
23 | "extractive_match_stderr": 0.08509629433967632
24 | }
25 | },
26 | "versions": {
27 | "custom|aime24|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|aime24": {
31 | "name": "aime24",
32 | "prompt_function": "aime_prompt_fn",
33 | "hf_repo": "HuggingFaceH4/aime_2024",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "train"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "train"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 30,
64 | "effective_num_docs": 30,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|aime24|0": {
71 | "hashes": {
72 | "hash_examples": "ddec8fc79d0a014b",
73 | "hash_full_prompts": "253167becf0dfed7",
74 | "hash_input_tokens": "bf1cc75b5f12dfb8",
75 | "hash_cont_tokens": "f899e2531ad5b6e1"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 30,
79 | "padded": 0,
80 | "non_padded": 30,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "c903e836a519cf98",
88 | "hash_full_prompts": "84ff409b6bbf7cc0",
89 | "hash_input_tokens": "9a8c7e54ce09af84",
90 | "hash_cont_tokens": "a424417871d83f65"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 30,
94 | "padded": 0,
95 | "non_padded": 30,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/evals/Exp3_100/results/quyanh/OpenRS-GRPO/results_2025-03-15T09-16-52.530386.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 2587310.198941251,
9 | "end_time": 2588724.15183972,
10 | "total_evaluation_time_secondes": "1413.952898469288",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|math_500|0": {
18 | "extractive_match": 0.844,
19 | "extractive_match_stderr": 0.016243636028391104
20 | },
21 | "all": {
22 | "extractive_match": 0.844,
23 | "extractive_match_stderr": 0.016243636028391104
24 | }
25 | },
26 | "versions": {
27 | "custom|math_500|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|math_500": {
31 | "name": "math_500",
32 | "prompt_function": "math_prompt_fn",
33 | "hf_repo": "HuggingFaceH4/MATH-500",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "test"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "test"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 500,
64 | "effective_num_docs": 500,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|math_500|0": {
71 | "hashes": {
72 | "hash_examples": "adf0cc8311011db2",
73 | "hash_full_prompts": "63c902dbdbaf1552",
74 | "hash_input_tokens": "2af397a095a31139",
75 | "hash_cont_tokens": "0ff22df8a2e32ff6"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 500,
79 | "padded": 0,
80 | "non_padded": 500,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "bfaad1993ff37a60",
88 | "hash_full_prompts": "3ceaaade5cf43911",
89 | "hash_input_tokens": "c663dbac8a64d3e4",
90 | "hash_cont_tokens": "13d8e0072ee283ae"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 500,
94 | "padded": 0,
95 | "non_padded": 500,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/evals/Exp3_100/results/quyanh/OpenRS-GRPO/results_2025-03-15T09-26-36.766492.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 2588822.652426989,
9 | "end_time": 2589308.453780031,
10 | "total_evaluation_time_secondes": "485.80135304201394",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|amc23|0": {
18 | "extractive_match": 0.7,
19 | "extractive_match_stderr": 0.07337993857053426
20 | },
21 | "all": {
22 | "extractive_match": 0.7,
23 | "extractive_match_stderr": 0.07337993857053426
24 | }
25 | },
26 | "versions": {
27 | "custom|amc23|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|amc23": {
31 | "name": "amc23",
32 | "prompt_function": "amc_prompt_fn",
33 | "hf_repo": "knoveleng/AMC-23",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "train"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "train"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 40,
64 | "effective_num_docs": 40,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|amc23|0": {
71 | "hashes": {
72 | "hash_examples": "57f3ead69f601b6a",
73 | "hash_full_prompts": "64c05856286ef8dc",
74 | "hash_input_tokens": "b64afe4485ef61c0",
75 | "hash_cont_tokens": "cff1c1f58a10c3ec"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 40,
79 | "padded": 0,
80 | "non_padded": 40,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "fe1f1f4512256bec",
88 | "hash_full_prompts": "882107366cadc5ce",
89 | "hash_input_tokens": "638cd1079164f374",
90 | "hash_cont_tokens": "9f14f44a8006a380"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 40,
94 | "padded": 0,
95 | "non_padded": 40,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/evals/Exp3_150/results/quyanh/OpenRS-GRPO/results_2025-03-15T09-36-59.825027.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 2589368.024663066,
9 | "end_time": 2589931.508379323,
10 | "total_evaluation_time_secondes": "563.4837162569165",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|aime24|0": {
18 | "extractive_match": 0.43333333333333335,
19 | "extractive_match_stderr": 0.0920186554465537
20 | },
21 | "all": {
22 | "extractive_match": 0.43333333333333335,
23 | "extractive_match_stderr": 0.0920186554465537
24 | }
25 | },
26 | "versions": {
27 | "custom|aime24|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|aime24": {
31 | "name": "aime24",
32 | "prompt_function": "aime_prompt_fn",
33 | "hf_repo": "HuggingFaceH4/aime_2024",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "train"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "train"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 30,
64 | "effective_num_docs": 30,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|aime24|0": {
71 | "hashes": {
72 | "hash_examples": "ddec8fc79d0a014b",
73 | "hash_full_prompts": "253167becf0dfed7",
74 | "hash_input_tokens": "bf1cc75b5f12dfb8",
75 | "hash_cont_tokens": "5f98c2f013b48f03"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 30,
79 | "padded": 0,
80 | "non_padded": 30,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "c903e836a519cf98",
88 | "hash_full_prompts": "84ff409b6bbf7cc0",
89 | "hash_input_tokens": "9a8c7e54ce09af84",
90 | "hash_cont_tokens": "ed47f783736df3bf"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 30,
94 | "padded": 0,
95 | "non_padded": 30,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/evals/Exp3_150/results/quyanh/OpenRS-GRPO/results_2025-03-15T09-55-14.957560.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 2589983.778860533,
9 | "end_time": 2591026.655002043,
10 | "total_evaluation_time_secondes": "1042.8761415099725",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|math_500|0": {
18 | "extractive_match": 0.856,
19 | "extractive_match_stderr": 0.015716934945725767
20 | },
21 | "all": {
22 | "extractive_match": 0.856,
23 | "extractive_match_stderr": 0.015716934945725767
24 | }
25 | },
26 | "versions": {
27 | "custom|math_500|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|math_500": {
31 | "name": "math_500",
32 | "prompt_function": "math_prompt_fn",
33 | "hf_repo": "HuggingFaceH4/MATH-500",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "test"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "test"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 500,
64 | "effective_num_docs": 500,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|math_500|0": {
71 | "hashes": {
72 | "hash_examples": "adf0cc8311011db2",
73 | "hash_full_prompts": "63c902dbdbaf1552",
74 | "hash_input_tokens": "2af397a095a31139",
75 | "hash_cont_tokens": "0fab9c626e9e9390"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 500,
79 | "padded": 0,
80 | "non_padded": 500,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "bfaad1993ff37a60",
88 | "hash_full_prompts": "3ceaaade5cf43911",
89 | "hash_input_tokens": "c663dbac8a64d3e4",
90 | "hash_cont_tokens": "a22503828d7d43e3"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 500,
94 | "padded": 0,
95 | "non_padded": 500,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/evals/Exp3_150/results/quyanh/OpenRS-GRPO/results_2025-03-15T10-04-33.528650.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 2591099.695949811,
9 | "end_time": 2591585.220585029,
10 | "total_evaluation_time_secondes": "485.52463521808386",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|amc23|0": {
18 | "extractive_match": 0.75,
19 | "extractive_match_stderr": 0.06933752452815363
20 | },
21 | "all": {
22 | "extractive_match": 0.75,
23 | "extractive_match_stderr": 0.06933752452815363
24 | }
25 | },
26 | "versions": {
27 | "custom|amc23|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|amc23": {
31 | "name": "amc23",
32 | "prompt_function": "amc_prompt_fn",
33 | "hf_repo": "knoveleng/AMC-23",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "train"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "train"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 40,
64 | "effective_num_docs": 40,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|amc23|0": {
71 | "hashes": {
72 | "hash_examples": "57f3ead69f601b6a",
73 | "hash_full_prompts": "64c05856286ef8dc",
74 | "hash_input_tokens": "b64afe4485ef61c0",
75 | "hash_cont_tokens": "1b6a1b4002eec6c6"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 40,
79 | "padded": 0,
80 | "non_padded": 40,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "fe1f1f4512256bec",
88 | "hash_full_prompts": "882107366cadc5ce",
89 | "hash_input_tokens": "638cd1079164f374",
90 | "hash_cont_tokens": "21b4cbba6be064a5"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 40,
94 | "padded": 0,
95 | "non_padded": 40,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/evals/Exp3_200/results/quyanh/OpenRS-GRPO/results_2025-03-15T10-33-42.826955.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 2592299.508424689,
9 | "end_time": 2593334.475003944,
10 | "total_evaluation_time_secondes": "1034.9665792547166",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|math_500|0": {
18 | "extractive_match": 0.852,
19 | "extractive_match_stderr": 0.015896458561251246
20 | },
21 | "all": {
22 | "extractive_match": 0.852,
23 | "extractive_match_stderr": 0.015896458561251246
24 | }
25 | },
26 | "versions": {
27 | "custom|math_500|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|math_500": {
31 | "name": "math_500",
32 | "prompt_function": "math_prompt_fn",
33 | "hf_repo": "HuggingFaceH4/MATH-500",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "test"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "test"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 500,
64 | "effective_num_docs": 500,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|math_500|0": {
71 | "hashes": {
72 | "hash_examples": "adf0cc8311011db2",
73 | "hash_full_prompts": "63c902dbdbaf1552",
74 | "hash_input_tokens": "2af397a095a31139",
75 | "hash_cont_tokens": "e3e867c652ccb694"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 500,
79 | "padded": 0,
80 | "non_padded": 500,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "bfaad1993ff37a60",
88 | "hash_full_prompts": "3ceaaade5cf43911",
89 | "hash_input_tokens": "c663dbac8a64d3e4",
90 | "hash_cont_tokens": "d6ee582ea943bd6e"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 500,
94 | "padded": 0,
95 | "non_padded": 500,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/evals/Exp3_200/results/quyanh/OpenRS-GRPO/results_2025-03-15T10-39-54.446838.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 2593401.704100977,
9 | "end_time": 2593706.140676645,
10 | "total_evaluation_time_secondes": "304.43657566793263",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|amc23|0": {
18 | "extractive_match": 0.775,
19 | "extractive_match_stderr": 0.06686668711812967
20 | },
21 | "all": {
22 | "extractive_match": 0.775,
23 | "extractive_match_stderr": 0.06686668711812967
24 | }
25 | },
26 | "versions": {
27 | "custom|amc23|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|amc23": {
31 | "name": "amc23",
32 | "prompt_function": "amc_prompt_fn",
33 | "hf_repo": "knoveleng/AMC-23",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "train"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "train"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 40,
64 | "effective_num_docs": 40,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|amc23|0": {
71 | "hashes": {
72 | "hash_examples": "57f3ead69f601b6a",
73 | "hash_full_prompts": "64c05856286ef8dc",
74 | "hash_input_tokens": "b64afe4485ef61c0",
75 | "hash_cont_tokens": "e1ea54b6a1bf95b4"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 40,
79 | "padded": 0,
80 | "non_padded": 40,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "fe1f1f4512256bec",
88 | "hash_full_prompts": "882107366cadc5ce",
89 | "hash_input_tokens": "638cd1079164f374",
90 | "hash_cont_tokens": "a71856567a33efac"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 40,
94 | "padded": 0,
95 | "non_padded": 40,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/evals/Exp3_250/results/quyanh/OpenRS-GRPO/results_2025-03-15T10-50-49.693635.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 2593753.251117188,
9 | "end_time": 2594361.376800723,
10 | "total_evaluation_time_secondes": "608.1256835348904",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|aime24|0": {
18 | "extractive_match": 0.3333333333333333,
19 | "extractive_match_stderr": 0.08753762190648169
20 | },
21 | "all": {
22 | "extractive_match": 0.3333333333333333,
23 | "extractive_match_stderr": 0.08753762190648169
24 | }
25 | },
26 | "versions": {
27 | "custom|aime24|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|aime24": {
31 | "name": "aime24",
32 | "prompt_function": "aime_prompt_fn",
33 | "hf_repo": "HuggingFaceH4/aime_2024",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "train"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "train"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 30,
64 | "effective_num_docs": 30,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|aime24|0": {
71 | "hashes": {
72 | "hash_examples": "ddec8fc79d0a014b",
73 | "hash_full_prompts": "253167becf0dfed7",
74 | "hash_input_tokens": "bf1cc75b5f12dfb8",
75 | "hash_cont_tokens": "490dc46f67c29942"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 30,
79 | "padded": 0,
80 | "non_padded": 30,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "c903e836a519cf98",
88 | "hash_full_prompts": "84ff409b6bbf7cc0",
89 | "hash_input_tokens": "9a8c7e54ce09af84",
90 | "hash_cont_tokens": "18c8551d8a1015a6"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 30,
94 | "padded": 0,
95 | "non_padded": 30,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/evals/Exp3_250/results/quyanh/OpenRS-GRPO/results_2025-03-15T11-11-55.345666.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 2594416.381958949,
9 | "end_time": 2595626.985790251,
10 | "total_evaluation_time_secondes": "1210.603831301909",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|math_500|0": {
18 | "extractive_match": 0.804,
19 | "extractive_match_stderr": 0.017770751227744862
20 | },
21 | "all": {
22 | "extractive_match": 0.804,
23 | "extractive_match_stderr": 0.017770751227744862
24 | }
25 | },
26 | "versions": {
27 | "custom|math_500|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|math_500": {
31 | "name": "math_500",
32 | "prompt_function": "math_prompt_fn",
33 | "hf_repo": "HuggingFaceH4/MATH-500",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "test"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "test"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 500,
64 | "effective_num_docs": 500,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|math_500|0": {
71 | "hashes": {
72 | "hash_examples": "adf0cc8311011db2",
73 | "hash_full_prompts": "63c902dbdbaf1552",
74 | "hash_input_tokens": "2af397a095a31139",
75 | "hash_cont_tokens": "1678101c78f1f285"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 500,
79 | "padded": 0,
80 | "non_padded": 500,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "bfaad1993ff37a60",
88 | "hash_full_prompts": "3ceaaade5cf43911",
89 | "hash_input_tokens": "c663dbac8a64d3e4",
90 | "hash_cont_tokens": "9f08e5d1c86fbf9c"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 500,
94 | "padded": 0,
95 | "non_padded": 500,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/evals/Exp3_250/results/quyanh/OpenRS-GRPO/results_2025-03-15T11-22-45.088452.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 2595694.940795575,
9 | "end_time": 2596276.773000479,
10 | "total_evaluation_time_secondes": "581.8322049044073",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|amc23|0": {
18 | "extractive_match": 0.6,
19 | "extractive_match_stderr": 0.07844645405527362
20 | },
21 | "all": {
22 | "extractive_match": 0.6,
23 | "extractive_match_stderr": 0.07844645405527362
24 | }
25 | },
26 | "versions": {
27 | "custom|amc23|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|amc23": {
31 | "name": "amc23",
32 | "prompt_function": "amc_prompt_fn",
33 | "hf_repo": "knoveleng/AMC-23",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "train"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "train"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 40,
64 | "effective_num_docs": 40,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|amc23|0": {
71 | "hashes": {
72 | "hash_examples": "57f3ead69f601b6a",
73 | "hash_full_prompts": "64c05856286ef8dc",
74 | "hash_input_tokens": "b64afe4485ef61c0",
75 | "hash_cont_tokens": "597b0bb4fc1adfd9"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 40,
79 | "padded": 0,
80 | "non_padded": 40,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "fe1f1f4512256bec",
88 | "hash_full_prompts": "882107366cadc5ce",
89 | "hash_input_tokens": "638cd1079164f374",
90 | "hash_cont_tokens": "66cde9f29d4c2f5c"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 40,
94 | "padded": 0,
95 | "non_padded": 40,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/evals/Exp3_300/results/quyanh/OpenRS-GRPO/results_2025-03-15T11-33-37.815028.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 2596337.815541219,
9 | "end_time": 2596929.497024281,
10 | "total_evaluation_time_secondes": "591.6814830619842",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|aime24|0": {
18 | "extractive_match": 0.26666666666666666,
19 | "extractive_match_stderr": 0.0821175682735253
20 | },
21 | "all": {
22 | "extractive_match": 0.26666666666666666,
23 | "extractive_match_stderr": 0.0821175682735253
24 | }
25 | },
26 | "versions": {
27 | "custom|aime24|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|aime24": {
31 | "name": "aime24",
32 | "prompt_function": "aime_prompt_fn",
33 | "hf_repo": "HuggingFaceH4/aime_2024",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "train"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "train"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 30,
64 | "effective_num_docs": 30,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|aime24|0": {
71 | "hashes": {
72 | "hash_examples": "ddec8fc79d0a014b",
73 | "hash_full_prompts": "253167becf0dfed7",
74 | "hash_input_tokens": "bf1cc75b5f12dfb8",
75 | "hash_cont_tokens": "d893bd40128f3f01"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 30,
79 | "padded": 0,
80 | "non_padded": 30,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "c903e836a519cf98",
88 | "hash_full_prompts": "84ff409b6bbf7cc0",
89 | "hash_input_tokens": "9a8c7e54ce09af84",
90 | "hash_cont_tokens": "304e7ec495522d6e"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 30,
94 | "padded": 0,
95 | "non_padded": 30,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/evals/Exp3_300/results/quyanh/OpenRS-GRPO/results_2025-03-15T11-56-26.434504.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 2596993.780300828,
9 | "end_time": 2598298.12918594,
10 | "total_evaluation_time_secondes": "1304.3488851119764",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|math_500|0": {
18 | "extractive_match": 0.83,
19 | "extractive_match_stderr": 0.016815633531393415
20 | },
21 | "all": {
22 | "extractive_match": 0.83,
23 | "extractive_match_stderr": 0.016815633531393415
24 | }
25 | },
26 | "versions": {
27 | "custom|math_500|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|math_500": {
31 | "name": "math_500",
32 | "prompt_function": "math_prompt_fn",
33 | "hf_repo": "HuggingFaceH4/MATH-500",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "test"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "test"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 500,
64 | "effective_num_docs": 500,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|math_500|0": {
71 | "hashes": {
72 | "hash_examples": "adf0cc8311011db2",
73 | "hash_full_prompts": "63c902dbdbaf1552",
74 | "hash_input_tokens": "2af397a095a31139",
75 | "hash_cont_tokens": "2d8a8b96257009fe"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 500,
79 | "padded": 0,
80 | "non_padded": 500,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "bfaad1993ff37a60",
88 | "hash_full_prompts": "3ceaaade5cf43911",
89 | "hash_input_tokens": "c663dbac8a64d3e4",
90 | "hash_cont_tokens": "69e17b0bd9a55619"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 500,
94 | "padded": 0,
95 | "non_padded": 500,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/evals/Exp3_300/results/quyanh/OpenRS-GRPO/results_2025-03-15T12-07-33.637126.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 2598411.082974548,
9 | "end_time": 2598965.327884916,
10 | "total_evaluation_time_secondes": "554.2449103682302",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|amc23|0": {
18 | "extractive_match": 0.7,
19 | "extractive_match_stderr": 0.07337993857053426
20 | },
21 | "all": {
22 | "extractive_match": 0.7,
23 | "extractive_match_stderr": 0.07337993857053426
24 | }
25 | },
26 | "versions": {
27 | "custom|amc23|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|amc23": {
31 | "name": "amc23",
32 | "prompt_function": "amc_prompt_fn",
33 | "hf_repo": "knoveleng/AMC-23",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "train"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "train"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 40,
64 | "effective_num_docs": 40,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|amc23|0": {
71 | "hashes": {
72 | "hash_examples": "57f3ead69f601b6a",
73 | "hash_full_prompts": "64c05856286ef8dc",
74 | "hash_input_tokens": "b64afe4485ef61c0",
75 | "hash_cont_tokens": "40d9bb73f755c8ae"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 40,
79 | "padded": 0,
80 | "non_padded": 40,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "fe1f1f4512256bec",
88 | "hash_full_prompts": "882107366cadc5ce",
89 | "hash_input_tokens": "638cd1079164f374",
90 | "hash_cont_tokens": "0f0dc5923fa3d806"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 40,
94 | "padded": 0,
95 | "non_padded": 40,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/evals/Exp3_350/results/quyanh/OpenRS-GRPO/results_2025-03-15T12-45-32.618178.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 2599724.580276648,
9 | "end_time": 2601244.233675632,
10 | "total_evaluation_time_secondes": "1519.6533989841118",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|math_500|0": {
18 | "extractive_match": 0.814,
19 | "extractive_match_stderr": 0.017418806780583957
20 | },
21 | "all": {
22 | "extractive_match": 0.814,
23 | "extractive_match_stderr": 0.017418806780583957
24 | }
25 | },
26 | "versions": {
27 | "custom|math_500|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|math_500": {
31 | "name": "math_500",
32 | "prompt_function": "math_prompt_fn",
33 | "hf_repo": "HuggingFaceH4/MATH-500",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "test"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "test"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 500,
64 | "effective_num_docs": 500,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|math_500|0": {
71 | "hashes": {
72 | "hash_examples": "adf0cc8311011db2",
73 | "hash_full_prompts": "63c902dbdbaf1552",
74 | "hash_input_tokens": "2af397a095a31139",
75 | "hash_cont_tokens": "966cfae3386821e2"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 500,
79 | "padded": 0,
80 | "non_padded": 500,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "bfaad1993ff37a60",
88 | "hash_full_prompts": "3ceaaade5cf43911",
89 | "hash_input_tokens": "c663dbac8a64d3e4",
90 | "hash_cont_tokens": "06e0fffe6fd58548"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 500,
94 | "padded": 0,
95 | "non_padded": 500,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/evals/Exp3_350/results/quyanh/OpenRS-GRPO/results_2025-03-15T12-54-08.288591.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 2601346.196358103,
9 | "end_time": 2601759.980020389,
10 | "total_evaluation_time_secondes": "413.7836622861214",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|amc23|0": {
18 | "extractive_match": 0.7,
19 | "extractive_match_stderr": 0.07337993857053426
20 | },
21 | "all": {
22 | "extractive_match": 0.7,
23 | "extractive_match_stderr": 0.07337993857053426
24 | }
25 | },
26 | "versions": {
27 | "custom|amc23|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|amc23": {
31 | "name": "amc23",
32 | "prompt_function": "amc_prompt_fn",
33 | "hf_repo": "knoveleng/AMC-23",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "train"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "train"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 40,
64 | "effective_num_docs": 40,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|amc23|0": {
71 | "hashes": {
72 | "hash_examples": "57f3ead69f601b6a",
73 | "hash_full_prompts": "64c05856286ef8dc",
74 | "hash_input_tokens": "b64afe4485ef61c0",
75 | "hash_cont_tokens": "3d0089b1cb6faa2d"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 40,
79 | "padded": 0,
80 | "non_padded": 40,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "fe1f1f4512256bec",
88 | "hash_full_prompts": "882107366cadc5ce",
89 | "hash_input_tokens": "638cd1079164f374",
90 | "hash_cont_tokens": "adf30c7654ed9c16"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 40,
94 | "padded": 0,
95 | "non_padded": 40,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/evals/Exp3_400/results/quyanh/OpenRS-GRPO/results_2025-03-15T13-02-26.672622.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 2601826.367747913,
9 | "end_time": 2602258.347004526,
10 | "total_evaluation_time_secondes": "431.9792566127144",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|aime24|0": {
18 | "extractive_match": 0.3,
19 | "extractive_match_stderr": 0.0850962943396763
20 | },
21 | "all": {
22 | "extractive_match": 0.3,
23 | "extractive_match_stderr": 0.0850962943396763
24 | }
25 | },
26 | "versions": {
27 | "custom|aime24|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|aime24": {
31 | "name": "aime24",
32 | "prompt_function": "aime_prompt_fn",
33 | "hf_repo": "HuggingFaceH4/aime_2024",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "train"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "train"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 30,
64 | "effective_num_docs": 30,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|aime24|0": {
71 | "hashes": {
72 | "hash_examples": "ddec8fc79d0a014b",
73 | "hash_full_prompts": "253167becf0dfed7",
74 | "hash_input_tokens": "bf1cc75b5f12dfb8",
75 | "hash_cont_tokens": "699ac3b1b5301ac1"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 30,
79 | "padded": 0,
80 | "non_padded": 30,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "c903e836a519cf98",
88 | "hash_full_prompts": "84ff409b6bbf7cc0",
89 | "hash_input_tokens": "9a8c7e54ce09af84",
90 | "hash_cont_tokens": "0245497f2bebbcc5"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 30,
94 | "padded": 0,
95 | "non_padded": 30,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/evals/Exp3_400/results/quyanh/OpenRS-GRPO/results_2025-03-15T13-24-00.237541.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 2602298.434195373,
9 | "end_time": 2603551.813672935,
10 | "total_evaluation_time_secondes": "1253.3794775619172",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|math_500|0": {
18 | "extractive_match": 0.824,
19 | "extractive_match_stderr": 0.01704785202062228
20 | },
21 | "all": {
22 | "extractive_match": 0.824,
23 | "extractive_match_stderr": 0.01704785202062228
24 | }
25 | },
26 | "versions": {
27 | "custom|math_500|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|math_500": {
31 | "name": "math_500",
32 | "prompt_function": "math_prompt_fn",
33 | "hf_repo": "HuggingFaceH4/MATH-500",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "test"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "test"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 500,
64 | "effective_num_docs": 500,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|math_500|0": {
71 | "hashes": {
72 | "hash_examples": "adf0cc8311011db2",
73 | "hash_full_prompts": "63c902dbdbaf1552",
74 | "hash_input_tokens": "2af397a095a31139",
75 | "hash_cont_tokens": "17b24f3c520b6310"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 500,
79 | "padded": 0,
80 | "non_padded": 500,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "bfaad1993ff37a60",
88 | "hash_full_prompts": "3ceaaade5cf43911",
89 | "hash_input_tokens": "c663dbac8a64d3e4",
90 | "hash_cont_tokens": "fcb10a4340d55cf4"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 500,
94 | "padded": 0,
95 | "non_padded": 500,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/evals/Exp3_400/results/quyanh/OpenRS-GRPO/results_2025-03-15T13-32-38.296980.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 2603652.439867107,
9 | "end_time": 2604069.984818537,
10 | "total_evaluation_time_secondes": "417.5449514295906",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|amc23|0": {
18 | "extractive_match": 0.675,
19 | "extractive_match_stderr": 0.07499999999999998
20 | },
21 | "all": {
22 | "extractive_match": 0.675,
23 | "extractive_match_stderr": 0.07499999999999998
24 | }
25 | },
26 | "versions": {
27 | "custom|amc23|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|amc23": {
31 | "name": "amc23",
32 | "prompt_function": "amc_prompt_fn",
33 | "hf_repo": "knoveleng/AMC-23",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "train"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "train"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 40,
64 | "effective_num_docs": 40,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|amc23|0": {
71 | "hashes": {
72 | "hash_examples": "57f3ead69f601b6a",
73 | "hash_full_prompts": "64c05856286ef8dc",
74 | "hash_input_tokens": "b64afe4485ef61c0",
75 | "hash_cont_tokens": "3e0fd04e750b02c0"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 40,
79 | "padded": 0,
80 | "non_padded": 40,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "fe1f1f4512256bec",
88 | "hash_full_prompts": "882107366cadc5ce",
89 | "hash_input_tokens": "638cd1079164f374",
90 | "hash_cont_tokens": "02c3a8d49a2e3ee3"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 40,
94 | "padded": 0,
95 | "non_padded": 40,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/evals/Exp3_50/results/quyanh/OpenRS-GRPO/results_2025-03-15T08-07-05.491769.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 2583924.187252476,
9 | "end_time": 2584537.177876145,
10 | "total_evaluation_time_secondes": "612.9906236692332",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|aime24|0": {
18 | "extractive_match": 0.4666666666666667,
19 | "extractive_match_stderr": 0.09264111117062017
20 | },
21 | "all": {
22 | "extractive_match": 0.4666666666666667,
23 | "extractive_match_stderr": 0.09264111117062017
24 | }
25 | },
26 | "versions": {
27 | "custom|aime24|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|aime24": {
31 | "name": "aime24",
32 | "prompt_function": "aime_prompt_fn",
33 | "hf_repo": "HuggingFaceH4/aime_2024",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "train"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "train"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 30,
64 | "effective_num_docs": 30,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|aime24|0": {
71 | "hashes": {
72 | "hash_examples": "ddec8fc79d0a014b",
73 | "hash_full_prompts": "253167becf0dfed7",
74 | "hash_input_tokens": "bf1cc75b5f12dfb8",
75 | "hash_cont_tokens": "fd81113ccd505a4e"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 30,
79 | "padded": 0,
80 | "non_padded": 30,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "c903e836a519cf98",
88 | "hash_full_prompts": "84ff409b6bbf7cc0",
89 | "hash_input_tokens": "9a8c7e54ce09af84",
90 | "hash_cont_tokens": "2a36a200d6cd2247"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 30,
94 | "padded": 0,
95 | "non_padded": 30,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/evals/Exp3_50/results/quyanh/OpenRS-GRPO/results_2025-03-15T08-31-16.022306.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 2584606.174923473,
9 | "end_time": 2585987.719419564,
10 | "total_evaluation_time_secondes": "1381.5444960910827",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|math_500|0": {
18 | "extractive_match": 0.844,
19 | "extractive_match_stderr": 0.016243636028391097
20 | },
21 | "all": {
22 | "extractive_match": 0.844,
23 | "extractive_match_stderr": 0.016243636028391097
24 | }
25 | },
26 | "versions": {
27 | "custom|math_500|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|math_500": {
31 | "name": "math_500",
32 | "prompt_function": "math_prompt_fn",
33 | "hf_repo": "HuggingFaceH4/MATH-500",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "test"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "test"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 500,
64 | "effective_num_docs": 500,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|math_500|0": {
71 | "hashes": {
72 | "hash_examples": "adf0cc8311011db2",
73 | "hash_full_prompts": "63c902dbdbaf1552",
74 | "hash_input_tokens": "2af397a095a31139",
75 | "hash_cont_tokens": "6f3bc11d908efbf4"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 500,
79 | "padded": 0,
80 | "non_padded": 500,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "bfaad1993ff37a60",
88 | "hash_full_prompts": "3ceaaade5cf43911",
89 | "hash_input_tokens": "c663dbac8a64d3e4",
90 | "hash_cont_tokens": "c2619f7c35dda61a"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 500,
94 | "padded": 0,
95 | "non_padded": 500,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/evals/Exp3_50/results/quyanh/OpenRS-GRPO/results_2025-03-15T08-41-29.644364.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 2586061.970029914,
9 | "end_time": 2586601.331146573,
10 | "total_evaluation_time_secondes": "539.3611166593619",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|amc23|0": {
18 | "extractive_match": 0.725,
19 | "extractive_match_stderr": 0.0714995069016527
20 | },
21 | "all": {
22 | "extractive_match": 0.725,
23 | "extractive_match_stderr": 0.0714995069016527
24 | }
25 | },
26 | "versions": {
27 | "custom|amc23|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|amc23": {
31 | "name": "amc23",
32 | "prompt_function": "amc_prompt_fn",
33 | "hf_repo": "knoveleng/AMC-23",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "train"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "train"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 40,
64 | "effective_num_docs": 40,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|amc23|0": {
71 | "hashes": {
72 | "hash_examples": "57f3ead69f601b6a",
73 | "hash_full_prompts": "64c05856286ef8dc",
74 | "hash_input_tokens": "b64afe4485ef61c0",
75 | "hash_cont_tokens": "c0838ad8f27af064"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 40,
79 | "padded": 0,
80 | "non_padded": 40,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "fe1f1f4512256bec",
88 | "hash_full_prompts": "882107366cadc5ce",
89 | "hash_input_tokens": "638cd1079164f374",
90 | "hash_cont_tokens": "68bf0a6b43ec34ae"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 40,
94 | "padded": 0,
95 | "non_padded": 40,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/overall/Exp1_100/results/quyanh/OpenRS-GRPO/results_2025-03-15T18-04-10.796422.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 9795607.890673576,
9 | "end_time": 9796266.243738612,
10 | "total_evaluation_time_secondes": "658.3530650362372",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|aime24|0": {
18 | "extractive_match": 0.3,
19 | "extractive_match_stderr": 0.0850962943396763
20 | },
21 | "all": {
22 | "extractive_match": 0.3,
23 | "extractive_match_stderr": 0.0850962943396763
24 | }
25 | },
26 | "versions": {
27 | "custom|aime24|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|aime24": {
31 | "name": "aime24",
32 | "prompt_function": "aime_prompt_fn",
33 | "hf_repo": "HuggingFaceH4/aime_2024",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "train"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "train"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 30,
64 | "effective_num_docs": 30,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|aime24|0": {
71 | "hashes": {
72 | "hash_examples": "ddec8fc79d0a014b",
73 | "hash_full_prompts": "253167becf0dfed7",
74 | "hash_input_tokens": "bf1cc75b5f12dfb8",
75 | "hash_cont_tokens": "5141f36c632674a8"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 30,
79 | "padded": 0,
80 | "non_padded": 30,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "c903e836a519cf98",
88 | "hash_full_prompts": "84ff409b6bbf7cc0",
89 | "hash_input_tokens": "9a8c7e54ce09af84",
90 | "hash_cont_tokens": "f8465f893370f47e"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 30,
94 | "padded": 0,
95 | "non_padded": 30,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/overall/Exp1_100/results/quyanh/OpenRS-GRPO/results_2025-03-15T18-27-09.526559.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 9796309.327618744,
9 | "end_time": 9797644.914942032,
10 | "total_evaluation_time_secondes": "1335.587323287502",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|math_500|0": {
18 | "extractive_match": 0.838,
19 | "extractive_match_stderr": 0.016494123566423505
20 | },
21 | "all": {
22 | "extractive_match": 0.838,
23 | "extractive_match_stderr": 0.016494123566423505
24 | }
25 | },
26 | "versions": {
27 | "custom|math_500|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|math_500": {
31 | "name": "math_500",
32 | "prompt_function": "math_prompt_fn",
33 | "hf_repo": "HuggingFaceH4/MATH-500",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "test"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "test"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 500,
64 | "effective_num_docs": 500,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|math_500|0": {
71 | "hashes": {
72 | "hash_examples": "adf0cc8311011db2",
73 | "hash_full_prompts": "63c902dbdbaf1552",
74 | "hash_input_tokens": "2af397a095a31139",
75 | "hash_cont_tokens": "4e007e6ddae6bd47"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 500,
79 | "padded": 0,
80 | "non_padded": 500,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "bfaad1993ff37a60",
88 | "hash_full_prompts": "3ceaaade5cf43911",
89 | "hash_input_tokens": "c663dbac8a64d3e4",
90 | "hash_cont_tokens": "fa9f7bbd65512b23"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 500,
94 | "padded": 0,
95 | "non_padded": 500,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/overall/Exp1_100/results/quyanh/OpenRS-GRPO/results_2025-03-15T18-37-46.790845.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 9797696.616540272,
9 | "end_time": 9798282.237784388,
10 | "total_evaluation_time_secondes": "585.621244115755",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|amc23|0": {
18 | "extractive_match": 0.7,
19 | "extractive_match_stderr": 0.07337993857053426
20 | },
21 | "all": {
22 | "extractive_match": 0.7,
23 | "extractive_match_stderr": 0.07337993857053426
24 | }
25 | },
26 | "versions": {
27 | "custom|amc23|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|amc23": {
31 | "name": "amc23",
32 | "prompt_function": "amc_prompt_fn",
33 | "hf_repo": "knoveleng/AMC-23",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "train"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "train"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 40,
64 | "effective_num_docs": 40,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|amc23|0": {
71 | "hashes": {
72 | "hash_examples": "57f3ead69f601b6a",
73 | "hash_full_prompts": "64c05856286ef8dc",
74 | "hash_input_tokens": "b64afe4485ef61c0",
75 | "hash_cont_tokens": "b589babd0b65f01d"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 40,
79 | "padded": 0,
80 | "non_padded": 40,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "fe1f1f4512256bec",
88 | "hash_full_prompts": "882107366cadc5ce",
89 | "hash_input_tokens": "638cd1079164f374",
90 | "hash_cont_tokens": "605820c49e2b0f60"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 40,
94 | "padded": 0,
95 | "non_padded": 40,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/overall/Exp2_50/results/quyanh/OpenRS-GRPO/results_2025-03-15T20-26-02.445125.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 9804127.746607196,
9 | "end_time": 9804777.889406092,
10 | "total_evaluation_time_secondes": "650.142798896879",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|aime24|0": {
18 | "extractive_match": 0.3,
19 | "extractive_match_stderr": 0.0850962943396763
20 | },
21 | "all": {
22 | "extractive_match": 0.3,
23 | "extractive_match_stderr": 0.0850962943396763
24 | }
25 | },
26 | "versions": {
27 | "custom|aime24|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|aime24": {
31 | "name": "aime24",
32 | "prompt_function": "aime_prompt_fn",
33 | "hf_repo": "HuggingFaceH4/aime_2024",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "train"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "train"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 30,
64 | "effective_num_docs": 30,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|aime24|0": {
71 | "hashes": {
72 | "hash_examples": "ddec8fc79d0a014b",
73 | "hash_full_prompts": "253167becf0dfed7",
74 | "hash_input_tokens": "bf1cc75b5f12dfb8",
75 | "hash_cont_tokens": "76473421bbe93410"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 30,
79 | "padded": 0,
80 | "non_padded": 30,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "c903e836a519cf98",
88 | "hash_full_prompts": "84ff409b6bbf7cc0",
89 | "hash_input_tokens": "9a8c7e54ce09af84",
90 | "hash_cont_tokens": "a2125e595bab4673"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 30,
94 | "padded": 0,
95 | "non_padded": 30,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/overall/Exp2_50/results/quyanh/OpenRS-GRPO/results_2025-03-15T20-48-43.218853.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 9804830.717740912,
9 | "end_time": 9806138.61057854,
10 | "total_evaluation_time_secondes": "1307.8928376287222",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|math_500|0": {
18 | "extractive_match": 0.854,
19 | "extractive_match_stderr": 0.01580720517583485
20 | },
21 | "all": {
22 | "extractive_match": 0.854,
23 | "extractive_match_stderr": 0.01580720517583485
24 | }
25 | },
26 | "versions": {
27 | "custom|math_500|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|math_500": {
31 | "name": "math_500",
32 | "prompt_function": "math_prompt_fn",
33 | "hf_repo": "HuggingFaceH4/MATH-500",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "test"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "test"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 500,
64 | "effective_num_docs": 500,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|math_500|0": {
71 | "hashes": {
72 | "hash_examples": "adf0cc8311011db2",
73 | "hash_full_prompts": "63c902dbdbaf1552",
74 | "hash_input_tokens": "2af397a095a31139",
75 | "hash_cont_tokens": "961808827104c739"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 500,
79 | "padded": 0,
80 | "non_padded": 500,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "bfaad1993ff37a60",
88 | "hash_full_prompts": "3ceaaade5cf43911",
89 | "hash_input_tokens": "c663dbac8a64d3e4",
90 | "hash_cont_tokens": "0a4348ce590f191d"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 500,
94 | "padded": 0,
95 | "non_padded": 500,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/overall/Exp2_50/results/quyanh/OpenRS-GRPO/results_2025-03-15T20-56-24.803840.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 9806195.692178983,
9 | "end_time": 9806600.257096833,
10 | "total_evaluation_time_secondes": "404.5649178493768",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|amc23|0": {
18 | "extractive_match": 0.8,
19 | "extractive_match_stderr": 0.06405126152203487
20 | },
21 | "all": {
22 | "extractive_match": 0.8,
23 | "extractive_match_stderr": 0.06405126152203487
24 | }
25 | },
26 | "versions": {
27 | "custom|amc23|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|amc23": {
31 | "name": "amc23",
32 | "prompt_function": "amc_prompt_fn",
33 | "hf_repo": "knoveleng/AMC-23",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "train"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "train"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 40,
64 | "effective_num_docs": 40,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|amc23|0": {
71 | "hashes": {
72 | "hash_examples": "57f3ead69f601b6a",
73 | "hash_full_prompts": "64c05856286ef8dc",
74 | "hash_input_tokens": "b64afe4485ef61c0",
75 | "hash_cont_tokens": "d1d854bc50c70679"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 40,
79 | "padded": 0,
80 | "non_padded": 40,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "fe1f1f4512256bec",
88 | "hash_full_prompts": "882107366cadc5ce",
89 | "hash_input_tokens": "638cd1079164f374",
90 | "hash_cont_tokens": "198d7b3d60a17a97"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 40,
94 | "padded": 0,
95 | "non_padded": 40,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/overall/Exp3_50/results/quyanh/OpenRS-GRPO/results_2025-03-15T23-02-19.308882.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 9812781.11633004,
9 | "end_time": 9814154.770996448,
10 | "total_evaluation_time_secondes": "1373.6546664070338",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|math_500|0": {
18 | "extractive_match": 0.844,
19 | "extractive_match_stderr": 0.016243636028391097
20 | },
21 | "all": {
22 | "extractive_match": 0.844,
23 | "extractive_match_stderr": 0.016243636028391097
24 | }
25 | },
26 | "versions": {
27 | "custom|math_500|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|math_500": {
31 | "name": "math_500",
32 | "prompt_function": "math_prompt_fn",
33 | "hf_repo": "HuggingFaceH4/MATH-500",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "test"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "test"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 500,
64 | "effective_num_docs": 500,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|math_500|0": {
71 | "hashes": {
72 | "hash_examples": "adf0cc8311011db2",
73 | "hash_full_prompts": "63c902dbdbaf1552",
74 | "hash_input_tokens": "2af397a095a31139",
75 | "hash_cont_tokens": "6f3bc11d908efbf4"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 500,
79 | "padded": 0,
80 | "non_padded": 500,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "bfaad1993ff37a60",
88 | "hash_full_prompts": "3ceaaade5cf43911",
89 | "hash_input_tokens": "c663dbac8a64d3e4",
90 | "hash_cont_tokens": "c2619f7c35dda61a"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 500,
94 | "padded": 0,
95 | "non_padded": 500,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/logs/overall/Exp3_50/results/quyanh/OpenRS-GRPO/results_2025-03-15T23-12-28.069489.json:
--------------------------------------------------------------------------------
1 | {
2 | "config_general": {
3 | "lighteval_sha": "?",
4 | "num_fewshot_seeds": 1,
5 | "override_batch_size": -1,
6 | "max_samples": null,
7 | "job_id": 0,
8 | "start_time": 9814218.648552556,
9 | "end_time": 9814763.480200425,
10 | "total_evaluation_time_secondes": "544.8316478691995",
11 | "model_name": "quyanh/OpenRS-GRPO",
12 | "model_sha": "",
13 | "model_dtype": null,
14 | "model_size": null
15 | },
16 | "results": {
17 | "custom|amc23|0": {
18 | "extractive_match": 0.725,
19 | "extractive_match_stderr": 0.0714995069016527
20 | },
21 | "all": {
22 | "extractive_match": 0.725,
23 | "extractive_match_stderr": 0.0714995069016527
24 | }
25 | },
26 | "versions": {
27 | "custom|amc23|0": 1
28 | },
29 | "config_tasks": {
30 | "custom|amc23": {
31 | "name": "amc23",
32 | "prompt_function": "amc_prompt_fn",
33 | "hf_repo": "knoveleng/AMC-23",
34 | "hf_subset": "default",
35 | "metric": [
36 | {
37 | "metric_name": "extractive_match",
38 | "higher_is_better": true,
39 | "category": "3",
40 | "use_case": "1",
41 | "sample_level_fn": "sample_level_fn",
42 | "corpus_level_fn": "mean"
43 | }
44 | ],
45 | "hf_revision": null,
46 | "hf_filter": null,
47 | "hf_avail_splits": [
48 | "train"
49 | ],
50 | "trust_dataset": false,
51 | "evaluation_splits": [
52 | "train"
53 | ],
54 | "few_shots_split": null,
55 | "few_shots_select": null,
56 | "generation_size": 32768,
57 | "generation_grammar": null,
58 | "stop_sequence": [],
59 | "num_samples": null,
60 | "suite": [
61 | "custom"
62 | ],
63 | "original_num_docs": 40,
64 | "effective_num_docs": 40,
65 | "must_remove_duplicate_docs": false,
66 | "version": 1
67 | }
68 | },
69 | "summary_tasks": {
70 | "custom|amc23|0": {
71 | "hashes": {
72 | "hash_examples": "57f3ead69f601b6a",
73 | "hash_full_prompts": "64c05856286ef8dc",
74 | "hash_input_tokens": "b64afe4485ef61c0",
75 | "hash_cont_tokens": "c0838ad8f27af064"
76 | },
77 | "truncated": 0,
78 | "non_truncated": 40,
79 | "padded": 0,
80 | "non_padded": 40,
81 | "effective_few_shots": 0.0,
82 | "num_truncated_few_shots": 0
83 | }
84 | },
85 | "summary_general": {
86 | "hashes": {
87 | "hash_examples": "fe1f1f4512256bec",
88 | "hash_full_prompts": "882107366cadc5ce",
89 | "hash_input_tokens": "638cd1079164f374",
90 | "hash_cont_tokens": "68bf0a6b43ec34ae"
91 | },
92 | "truncated": 0,
93 | "non_truncated": 40,
94 | "padded": 0,
95 | "non_padded": 40,
96 | "num_truncated_few_shots": 0
97 | }
98 | }
--------------------------------------------------------------------------------
/recipes/accelerate_configs/ddp.yaml:
--------------------------------------------------------------------------------
1 | compute_environment: LOCAL_MACHINE
2 | debug: false
3 | distributed_type: MULTI_GPU
4 | downcast_bf16: 'no'
5 | gpu_ids: all
6 | machine_rank: 0
7 | main_training_function: main
8 | mixed_precision: bf16
9 | num_machines: 1
10 | num_processes: 8
11 | rdzv_backend: static
12 | same_network: true
13 | tpu_env: []
14 | tpu_use_cluster: false
15 | tpu_use_sudo: false
16 | use_cpu: false
17 |
--------------------------------------------------------------------------------
/recipes/accelerate_configs/fsdp.yaml:
--------------------------------------------------------------------------------
1 | compute_environment: LOCAL_MACHINE
2 | debug: false
3 | distributed_type: FSDP
4 | downcast_bf16: 'no'
5 | enable_cpu_affinity: false
6 | fsdp_config:
7 | fsdp_activation_checkpointing: false # Need fix from: https://github.com/huggingface/transformers/pull/36610
8 | fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
9 | fsdp_backward_prefetch: BACKWARD_PRE
10 | fsdp_cpu_ram_efficient_loading: true
11 | fsdp_forward_prefetch: true
12 | fsdp_offload_params: false
13 | fsdp_sharding_strategy: FULL_SHARD
14 | fsdp_state_dict_type: FULL_STATE_DICT
15 | fsdp_sync_module_states: true
16 | fsdp_use_orig_params: true
17 | machine_rank: 0
18 | main_training_function: main
19 | mixed_precision: bf16
20 | num_machines: 1
21 | num_processes: 8
22 | rdzv_backend: static
23 | same_network: true
24 | tpu_env: []
25 | tpu_use_cluster: false
26 | tpu_use_sudo: false
27 | use_cpu: false
--------------------------------------------------------------------------------
/recipes/accelerate_configs/zero2.yaml:
--------------------------------------------------------------------------------
1 | compute_environment: LOCAL_MACHINE
2 | debug: false
3 | deepspeed_config:
4 | deepspeed_multinode_launcher: standard
5 | offload_optimizer_device: none
6 | offload_param_device: none
7 | zero3_init_flag: false
8 | zero_stage: 2
9 | distributed_type: DEEPSPEED
10 | downcast_bf16: 'no'
11 | machine_rank: 0
12 | main_training_function: main
13 | mixed_precision: bf16
14 | num_machines: 1
15 | num_processes: 8
16 | rdzv_backend: static
17 | same_network: true
18 | tpu_env: []
19 | tpu_use_cluster: false
20 | tpu_use_sudo: false
21 | use_cpu: false
--------------------------------------------------------------------------------
/recipes/accelerate_configs/zero3.yaml:
--------------------------------------------------------------------------------
1 | compute_environment: LOCAL_MACHINE
2 | debug: false
3 | deepspeed_config:
4 | deepspeed_multinode_launcher: standard
5 | offload_optimizer_device: none
6 | offload_param_device: none
7 | zero3_init_flag: true
8 | zero3_save_16bit_model: true
9 | zero_stage: 3
10 | distributed_type: DEEPSPEED
11 | downcast_bf16: 'no'
12 | machine_rank: 0
13 | main_training_function: main
14 | mixed_precision: bf16
15 | num_machines: 1
16 | num_processes: 8
17 | rdzv_backend: static
18 | same_network: true
19 | tpu_env: []
20 | tpu_use_cluster: false
21 | tpu_use_sudo: false
22 | use_cpu: false
23 |
--------------------------------------------------------------------------------
/recipes/data_cleaner.yaml:
--------------------------------------------------------------------------------
1 | model_kwargs:
2 | model: Qwen/Qwen2.5-Math-7B-Instruct
3 | trust_remote_code: true
4 | max_model_len: 4096
5 | gpu_memory_utilization: 0.9
6 | enforce_eager: true
7 | tensor_parallel_size: 4
8 |
9 | sampling_params:
10 | temperature: 0.7
11 | top_p: 0.9
12 | max_tokens: 4096
13 |
--------------------------------------------------------------------------------
/recipes/grpo.yaml:
--------------------------------------------------------------------------------
1 | # Model arguments
2 | model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
3 | model_revision: main
4 | torch_dtype: bfloat16
5 | attn_implementation: flash_attention_2
6 |
7 | # Data training arguments
8 | dataset_name: knoveleng/open-rs
9 | system_prompt: "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer, and put your final answer within \\boxed{{}} . The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think><answer> answer here </answer>. Note that respond by English, NOT use other languages."
10 |
11 | # GRPO trainer config
12 | bf16: true
13 | use_vllm: true
14 | vllm_device: auto
15 | vllm_enforce_eager: true
16 | vllm_gpu_memory_utilization: 0.7
17 | vllm_max_model_len: 4608
18 | do_eval: false
19 | gradient_accumulation_steps: 4
20 | gradient_checkpointing: true
21 | gradient_checkpointing_kwargs:
22 | use_reentrant: false
23 | hub_model_id: OpenRS-GRPO
24 | hub_strategy: every_save
25 | learning_rate: 1.0e-06
26 | log_completions: true
27 | log_level: info
28 | logging_first_step: true
29 | logging_steps: 1
30 | logging_strategy: steps
31 | lr_scheduler_type: cosine_with_min_lr
32 | lr_scheduler_kwargs:
33 | min_lr_rate: 0.1
34 | max_prompt_length: 512
35 | max_completion_length: 3584
36 | max_steps: 500
37 | num_generations: 6
38 | num_train_epochs: 1
39 | output_dir: data/OpenRS-GRPO
40 | overwrite_output_dir: true
41 | per_device_eval_batch_size: 6
42 | per_device_train_batch_size: 6
43 | push_to_hub: true
44 | report_to:
45 | - wandb
46 | reward_funcs:
47 | - format
48 | - cosine
49 | reward_weights:
50 | - 1.0
51 | - 2.0
52 | save_strategy: "steps"
53 | save_steps: 50
54 | seed: 42
55 | temperature: 0.7
56 | warmup_ratio: 0.1
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [isort]
2 | default_section = FIRSTPARTY
3 | ensure_newline_before_comments = True
4 | force_grid_wrap = 0
5 | include_trailing_comma = True
6 | known_first_party = open_r1
7 | known_third_party =
8 | transformers
9 | datasets
10 | fugashi
11 | git
12 | h5py
13 | matplotlib
14 | nltk
15 | numpy
16 | packaging
17 | pandas
18 | psutil
19 | pytest
20 | rouge_score
21 | sacrebleu
22 | seqeval
23 | sklearn
24 | streamlit
25 | torch
26 | tqdm
27 |
28 | line_length = 119
29 | lines_after_imports = 2
30 | multi_line_output = 3
31 | use_parentheses = True
32 |
33 | [flake8]
34 | ignore = E203, E501, E741, W503, W605
35 | max-line-length = 119
36 | per-file-ignores =
37 | # imported but unused
38 | __init__.py: F401
39 |
40 | [tool:pytest]
41 | doctest_optionflags=NUMBER NORMALIZE_WHITESPACE ELLIPSIS
--------------------------------------------------------------------------------
/src/open_r1/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2025 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
--------------------------------------------------------------------------------
/src/open_r1/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .import_utils import is_e2b_available
2 | from .model_utils import get_tokenizer
3 |
4 |
5 | __all__ = ["get_tokenizer", "is_e2b_available"]
6 |
--------------------------------------------------------------------------------
/src/open_r1/utils/import_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright 2025 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from transformers.utils.import_utils import _is_package_available
16 |
17 |
# Probe for the `e2b` package once, when this module is imported, reusing the
# private availability helper from transformers.utils.import_utils.
_e2b_available = _is_package_available("e2b")
20 |
21 |
def is_e2b_available() -> bool:
    """Return the module-level `_e2b_available` flag computed when this module was imported."""
    return _e2b_available
24 |
--------------------------------------------------------------------------------
/src/open_r1/utils/model_utils.py:
--------------------------------------------------------------------------------
1 | from transformers import AutoTokenizer, PreTrainedTokenizer
2 |
3 | from trl import ModelConfig
4 |
5 | from ..configs import GRPOConfig, SFTConfig
6 |
7 |
# Fallback Jinja-style chat template. Each message is rendered as a role tag
# (`<|user|>`, `<|system|>` or `<|assistant|>`), a newline, the message content
# and `eos_token`; a bare `<|assistant|>` tag is appended after the last
# message when `add_generation_prompt` is set.
DEFAULT_CHAT_TEMPLATE = "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
9 |
10 |
def get_tokenizer(
    model_args: ModelConfig, training_args: SFTConfig | GRPOConfig, auto_set_chat_template: bool = True
) -> PreTrainedTokenizer:
    """Load the tokenizer for the configured model and resolve its chat template.

    Args:
        model_args: Supplies `model_name_or_path`, `model_revision` and
            `trust_remote_code` for `AutoTokenizer.from_pretrained`.
        training_args: When its `chat_template` is not None, that template
            replaces whatever template the tokenizer was loaded with.
        auto_set_chat_template: When True and the loaded tokenizer has no chat
            template, `DEFAULT_CHAT_TEMPLATE` is installed.

    Returns:
        The loaded tokenizer, with `chat_template` possibly replaced.
    """
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.model_name_or_path,
        revision=model_args.model_revision,
        trust_remote_code=model_args.trust_remote_code,
    )

    # Inspect the attribute directly: `tokenizer.get_chat_template()` raises
    # ValueError when no template is set instead of returning None, so using it
    # in the fallback check would crash for exactly the tokenizers that need
    # the default template.
    if training_args.chat_template is not None:
        tokenizer.chat_template = training_args.chat_template
    elif auto_set_chat_template and getattr(tokenizer, "chat_template", None) is None:
        tokenizer.chat_template = DEFAULT_CHAT_TEMPLATE

    return tokenizer
27 |
--------------------------------------------------------------------------------
/src/open_r1/utils/wandb_logging.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 |
def init_wandb_training(training_args):
    """
    Export the Weights & Biases settings carried by `training_args` to the environment.

    `WANDB_ENTITY` is set from `training_args.wandb_entity` and `WANDB_PROJECT`
    from `training_args.wandb_project`; a value of None leaves the corresponding
    environment variable untouched.
    """
    env_to_attr = (
        ("WANDB_ENTITY", "wandb_entity"),
        ("WANDB_PROJECT", "wandb_project"),
    )
    for env_name, attr_name in env_to_attr:
        setting = getattr(training_args, attr_name)
        if setting is None:
            continue
        os.environ[env_name] = setting
12 |
--------------------------------------------------------------------------------