├── .gitignore ├── LICENSE ├── README.md ├── assets ├── costs-1.5b.png ├── costs-7b.png ├── overall.png └── performances.png ├── eval.sh ├── logs ├── evals │ ├── Exp1_100 │ │ └── results │ │ │ └── quyanh │ │ │ └── OpenRS-GRPO │ │ │ ├── results_2025-03-14T23-12-53.542208.json │ │ │ ├── results_2025-03-14T23-36-50.272791.json │ │ │ ├── results_2025-03-14T23-48-54.788605.json │ │ │ ├── results_2025-03-15T13-49-09.435323.json │ │ │ ├── results_2025-03-15T14-13-14.418707.json │ │ │ └── results_2025-03-15T14-25-17.786800.json │ ├── Exp1_200 │ │ └── results │ │ │ └── quyanh │ │ │ └── OpenRS-GRPO │ │ │ ├── results_2025-03-15T00-02-07.142924.json │ │ │ ├── results_2025-03-15T00-35-55.755190.json │ │ │ ├── results_2025-03-15T00-48-41.771959.json │ │ │ └── results_2025-03-15T14-37-59.982413.json │ ├── Exp1_300 │ │ └── results │ │ │ └── quyanh │ │ │ └── OpenRS-GRPO │ │ │ ├── results_2025-03-15T01-02-29.088363.json │ │ │ ├── results_2025-03-15T01-51-01.120880.json │ │ │ └── results_2025-03-15T02-04-41.905474.json │ ├── Exp1_400 │ │ └── results │ │ │ └── quyanh │ │ │ └── OpenRS-GRPO │ │ │ ├── results_2025-03-15T02-14-11.184777.json │ │ │ ├── results_2025-03-15T02-36-02.705517.json │ │ │ └── results_2025-03-15T02-44-26.970544.json │ ├── Exp1_500 │ │ └── results │ │ │ └── quyanh │ │ │ └── OpenRS-GRPO │ │ │ ├── results_2025-03-15T02-52-04.589810.json │ │ │ ├── results_2025-03-15T03-11-26.478898.json │ │ │ └── results_2025-03-15T03-18-43.114223.json │ ├── Exp2_100 │ │ └── results │ │ │ └── quyanh │ │ │ └── OpenRS-GRPO │ │ │ ├── results_2025-03-15T04-13-19.506547.json │ │ │ ├── results_2025-03-15T04-35-14.751095.json │ │ │ └── results_2025-03-15T04-44-39.651229.json │ ├── Exp2_150 │ │ └── results │ │ │ └── quyanh │ │ │ └── OpenRS-GRPO │ │ │ ├── results_2025-03-15T04-54-03.406254.json │ │ │ ├── results_2025-03-15T05-13-57.844846.json │ │ │ └── results_2025-03-15T05-20-07.618719.json │ ├── Exp2_200 │ │ └── results │ │ │ └── quyanh │ │ │ └── OpenRS-GRPO │ │ │ ├── 
results_2025-03-15T05-31-02.871828.json │ │ │ ├── results_2025-03-15T05-51-04.119358.json │ │ │ └── results_2025-03-15T06-00-23.069698.json │ ├── Exp2_250 │ │ └── results │ │ │ └── quyanh │ │ │ └── OpenRS-GRPO │ │ │ ├── results_2025-03-15T06-09-49.021006.json │ │ │ ├── results_2025-03-15T06-28-55.702973.json │ │ │ └── results_2025-03-15T06-34-45.013493.json │ ├── Exp2_300 │ │ └── results │ │ │ └── quyanh │ │ │ └── OpenRS-GRPO │ │ │ ├── results_2025-03-15T06-42-52.541003.json │ │ │ ├── results_2025-03-15T06-56-52.023106.json │ │ │ └── results_2025-03-15T07-04-05.239254.json │ ├── Exp2_350 │ │ └── results │ │ │ └── quyanh │ │ │ └── OpenRS-GRPO │ │ │ ├── results_2025-03-15T07-10-27.028672.json │ │ │ ├── results_2025-03-15T07-24-32.807123.json │ │ │ └── results_2025-03-15T07-29-47.478300.json │ ├── Exp2_400 │ │ └── results │ │ │ └── quyanh │ │ │ └── OpenRS-GRPO │ │ │ ├── results_2025-03-15T07-36-30.994685.json │ │ │ ├── results_2025-03-15T07-49-32.062451.json │ │ │ └── results_2025-03-15T07-55-51.416032.json │ ├── Exp2_50 │ │ └── results │ │ │ └── quyanh │ │ │ └── OpenRS-GRPO │ │ │ ├── results_2025-03-15T03-30-59.655917.json │ │ │ ├── results_2025-03-15T03-53-53.011460.json │ │ │ └── results_2025-03-15T04-01-54.627135.json │ ├── Exp3_100 │ │ └── results │ │ │ └── quyanh │ │ │ └── OpenRS-GRPO │ │ │ ├── results_2025-03-15T08-52-22.004802.json │ │ │ ├── results_2025-03-15T09-16-52.530386.json │ │ │ └── results_2025-03-15T09-26-36.766492.json │ ├── Exp3_150 │ │ └── results │ │ │ └── quyanh │ │ │ └── OpenRS-GRPO │ │ │ ├── results_2025-03-15T09-36-59.825027.json │ │ │ ├── results_2025-03-15T09-55-14.957560.json │ │ │ └── results_2025-03-15T10-04-33.528650.json │ ├── Exp3_200 │ │ └── results │ │ │ └── quyanh │ │ │ └── OpenRS-GRPO │ │ │ ├── results_2025-03-15T10-15-17.421583.json │ │ │ ├── results_2025-03-15T10-33-42.826955.json │ │ │ └── results_2025-03-15T10-39-54.446838.json │ ├── Exp3_250 │ │ └── results │ │ │ └── quyanh │ │ │ └── OpenRS-GRPO │ │ │ ├── 
results_2025-03-15T10-50-49.693635.json │ │ │ ├── results_2025-03-15T11-11-55.345666.json │ │ │ └── results_2025-03-15T11-22-45.088452.json │ ├── Exp3_300 │ │ └── results │ │ │ └── quyanh │ │ │ └── OpenRS-GRPO │ │ │ ├── results_2025-03-15T11-33-37.815028.json │ │ │ ├── results_2025-03-15T11-56-26.434504.json │ │ │ └── results_2025-03-15T12-07-33.637126.json │ ├── Exp3_350 │ │ └── results │ │ │ └── quyanh │ │ │ └── OpenRS-GRPO │ │ │ ├── results_2025-03-15T12-19-09.426338.json │ │ │ ├── results_2025-03-15T12-45-32.618178.json │ │ │ └── results_2025-03-15T12-54-08.288591.json │ ├── Exp3_400 │ │ └── results │ │ │ └── quyanh │ │ │ └── OpenRS-GRPO │ │ │ ├── results_2025-03-15T13-02-26.672622.json │ │ │ ├── results_2025-03-15T13-24-00.237541.json │ │ │ └── results_2025-03-15T13-32-38.296980.json │ └── Exp3_50 │ │ └── results │ │ └── quyanh │ │ └── OpenRS-GRPO │ │ ├── results_2025-03-15T08-07-05.491769.json │ │ ├── results_2025-03-15T08-31-16.022306.json │ │ └── results_2025-03-15T08-41-29.644364.json ├── logs.txt └── overall │ ├── Exp1_100 │ └── results │ │ └── quyanh │ │ └── OpenRS-GRPO │ │ ├── results_2025-03-15T18-04-10.796422.json │ │ ├── results_2025-03-15T18-27-09.526559.json │ │ ├── results_2025-03-15T18-37-46.790845.json │ │ ├── results_2025-03-15T18-56-57.793761.json │ │ └── results_2025-03-15T20-14-03.383896.json │ ├── Exp2_50 │ └── results │ │ └── quyanh │ │ └── OpenRS-GRPO │ │ ├── results_2025-03-15T20-26-02.445125.json │ │ ├── results_2025-03-15T20-48-43.218853.json │ │ ├── results_2025-03-15T20-56-24.803840.json │ │ ├── results_2025-03-15T21-15-57.124823.json │ │ └── results_2025-03-15T22-27-30.404581.json │ └── Exp3_50 │ └── results │ └── quyanh │ └── OpenRS-GRPO │ ├── results_2025-03-15T22-38-52.294521.json │ ├── results_2025-03-15T23-02-19.308882.json │ ├── results_2025-03-15T23-12-28.069489.json │ ├── results_2025-03-15T23-32-02.283638.json │ └── results_2025-03-16T00-44-02.504832.json ├── recipes ├── accelerate_configs │ ├── ddp.yaml │ ├── fsdp.yaml │ 
├── zero2.yaml │ └── zero3.yaml ├── data_cleaner.yaml └── grpo.yaml ├── setup.cfg ├── setup.py └── src └── open_r1 ├── __init__.py ├── configs.py ├── evaluate.py ├── generate.py ├── grpo.py ├── rewards.py ├── sft.py └── utils ├── __init__.py ├── callbacks.py ├── evaluation.py ├── hub.py ├── import_utils.py ├── model_utils.py └── wandb_logging.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Knovel Engineering 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /assets/costs-1.5b.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/knoveleng/open-rs/887a66e4572ceb9f3d41bc49139a7f90832025c8/assets/costs-1.5b.png -------------------------------------------------------------------------------- /assets/costs-7b.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/knoveleng/open-rs/887a66e4572ceb9f3d41bc49139a7f90832025c8/assets/costs-7b.png -------------------------------------------------------------------------------- /assets/overall.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/knoveleng/open-rs/887a66e4572ceb9f3d41bc49139a7f90832025c8/assets/overall.png -------------------------------------------------------------------------------- /assets/performances.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/knoveleng/open-rs/887a66e4572ceb9f3d41bc49139a7f90832025c8/assets/performances.png -------------------------------------------------------------------------------- /logs/evals/Exp1_100/results/quyanh/OpenRS-GRPO/results_2025-03-14T23-12-53.542208.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 2551784.221348991, 9 | "end_time": 2552485.226336305, 10 | "total_evaluation_time_secondes": "701.0049873138778", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|aime24|0": { 18 | "extractive_match": 0.3, 19 | "extractive_match_stderr": 0.0850962943396763 20 | }, 21 
| "all": { 22 | "extractive_match": 0.3, 23 | "extractive_match_stderr": 0.0850962943396763 24 | } 25 | }, 26 | "versions": { 27 | "custom|aime24|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|aime24": { 31 | "name": "aime24", 32 | "prompt_function": "aime_prompt_fn", 33 | "hf_repo": "HuggingFaceH4/aime_2024", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "train" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "train" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 30, 64 | "effective_num_docs": 30, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|aime24|0": { 71 | "hashes": { 72 | "hash_examples": "ddec8fc79d0a014b", 73 | "hash_full_prompts": "253167becf0dfed7", 74 | "hash_input_tokens": "bf1cc75b5f12dfb8", 75 | "hash_cont_tokens": "5141f36c632674a8" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 30, 79 | "padded": 0, 80 | "non_padded": 30, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "c903e836a519cf98", 88 | "hash_full_prompts": "84ff409b6bbf7cc0", 89 | "hash_input_tokens": "9a8c7e54ce09af84", 90 | "hash_cont_tokens": "f8465f893370f47e" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 30, 94 | "padded": 0, 95 | "non_padded": 30, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- 
/logs/evals/Exp1_100/results/quyanh/OpenRS-GRPO/results_2025-03-14T23-36-50.272791.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 2552567.750112632, 9 | "end_time": 2553921.892538854, 10 | "total_evaluation_time_secondes": "1354.1424262220971", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|math_500|0": { 18 | "extractive_match": 0.838, 19 | "extractive_match_stderr": 0.016494123566423505 20 | }, 21 | "all": { 22 | "extractive_match": 0.838, 23 | "extractive_match_stderr": 0.016494123566423505 24 | } 25 | }, 26 | "versions": { 27 | "custom|math_500|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|math_500": { 31 | "name": "math_500", 32 | "prompt_function": "math_prompt_fn", 33 | "hf_repo": "HuggingFaceH4/MATH-500", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "test" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "test" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 500, 64 | "effective_num_docs": 500, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|math_500|0": { 71 | "hashes": { 72 | "hash_examples": "adf0cc8311011db2", 73 | "hash_full_prompts": "63c902dbdbaf1552", 74 | 
"hash_input_tokens": "2af397a095a31139", 75 | "hash_cont_tokens": "4e007e6ddae6bd47" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 500, 79 | "padded": 0, 80 | "non_padded": 500, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "bfaad1993ff37a60", 88 | "hash_full_prompts": "3ceaaade5cf43911", 89 | "hash_input_tokens": "c663dbac8a64d3e4", 90 | "hash_cont_tokens": "fa9f7bbd65512b23" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 500, 94 | "padded": 0, 95 | "non_padded": 500, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /logs/evals/Exp1_100/results/quyanh/OpenRS-GRPO/results_2025-03-14T23-48-54.788605.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 2554015.285077489, 9 | "end_time": 2554646.476080566, 10 | "total_evaluation_time_secondes": "631.1910030771978", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|amc23|0": { 18 | "extractive_match": 0.7, 19 | "extractive_match_stderr": 0.07337993857053426 20 | }, 21 | "all": { 22 | "extractive_match": 0.7, 23 | "extractive_match_stderr": 0.07337993857053426 24 | } 25 | }, 26 | "versions": { 27 | "custom|amc23|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|amc23": { 31 | "name": "amc23", 32 | "prompt_function": "amc_prompt_fn", 33 | "hf_repo": "knoveleng/AMC-23", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | 
"hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "train" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "train" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 40, 64 | "effective_num_docs": 40, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|amc23|0": { 71 | "hashes": { 72 | "hash_examples": "57f3ead69f601b6a", 73 | "hash_full_prompts": "64c05856286ef8dc", 74 | "hash_input_tokens": "b64afe4485ef61c0", 75 | "hash_cont_tokens": "b589babd0b65f01d" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 40, 79 | "padded": 0, 80 | "non_padded": 40, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "fe1f1f4512256bec", 88 | "hash_full_prompts": "882107366cadc5ce", 89 | "hash_input_tokens": "638cd1079164f374", 90 | "hash_cont_tokens": "605820c49e2b0f60" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 40, 94 | "padded": 0, 95 | "non_padded": 40, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /logs/evals/Exp1_100/results/quyanh/OpenRS-GRPO/results_2025-03-15T13-49-09.435323.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 2604391.1116118, 9 | "end_time": 2605061.115226257, 10 | "total_evaluation_time_secondes": "670.0036144573241", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | 
"custom|aime24|0": { 18 | "extractive_match": 0.3, 19 | "extractive_match_stderr": 0.0850962943396763 20 | }, 21 | "all": { 22 | "extractive_match": 0.3, 23 | "extractive_match_stderr": 0.0850962943396763 24 | } 25 | }, 26 | "versions": { 27 | "custom|aime24|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|aime24": { 31 | "name": "aime24", 32 | "prompt_function": "aime_prompt_fn", 33 | "hf_repo": "HuggingFaceH4/aime_2024", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "train" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "train" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 30, 64 | "effective_num_docs": 30, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|aime24|0": { 71 | "hashes": { 72 | "hash_examples": "ddec8fc79d0a014b", 73 | "hash_full_prompts": "253167becf0dfed7", 74 | "hash_input_tokens": "bf1cc75b5f12dfb8", 75 | "hash_cont_tokens": "5141f36c632674a8" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 30, 79 | "padded": 0, 80 | "non_padded": 30, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "c903e836a519cf98", 88 | "hash_full_prompts": "84ff409b6bbf7cc0", 89 | "hash_input_tokens": "9a8c7e54ce09af84", 90 | "hash_cont_tokens": "f8465f893370f47e" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 30, 94 | "padded": 0, 95 | "non_padded": 30, 96 | "num_truncated_few_shots": 0 97 | } 98 | } 
-------------------------------------------------------------------------------- /logs/evals/Exp1_100/results/quyanh/OpenRS-GRPO/results_2025-03-15T14-13-14.418707.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 2605133.897685678, 9 | "end_time": 2606506.01809599, 10 | "total_evaluation_time_secondes": "1372.1204103119671", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|math_500|0": { 18 | "extractive_match": 0.838, 19 | "extractive_match_stderr": 0.016494123566423505 20 | }, 21 | "all": { 22 | "extractive_match": 0.838, 23 | "extractive_match_stderr": 0.016494123566423505 24 | } 25 | }, 26 | "versions": { 27 | "custom|math_500|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|math_500": { 31 | "name": "math_500", 32 | "prompt_function": "math_prompt_fn", 33 | "hf_repo": "HuggingFaceH4/MATH-500", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "test" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "test" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 500, 64 | "effective_num_docs": 500, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|math_500|0": { 71 | "hashes": { 72 | 
"hash_examples": "adf0cc8311011db2", 73 | "hash_full_prompts": "63c902dbdbaf1552", 74 | "hash_input_tokens": "2af397a095a31139", 75 | "hash_cont_tokens": "4e007e6ddae6bd47" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 500, 79 | "padded": 0, 80 | "non_padded": 500, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "bfaad1993ff37a60", 88 | "hash_full_prompts": "3ceaaade5cf43911", 89 | "hash_input_tokens": "c663dbac8a64d3e4", 90 | "hash_cont_tokens": "fa9f7bbd65512b23" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 500, 94 | "padded": 0, 95 | "non_padded": 500, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /logs/evals/Exp1_100/results/quyanh/OpenRS-GRPO/results_2025-03-15T14-25-17.786800.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 2606610.004414024, 9 | "end_time": 2607229.474309994, 10 | "total_evaluation_time_secondes": "619.4698959700763", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|amc23|0": { 18 | "extractive_match": 0.7, 19 | "extractive_match_stderr": 0.07337993857053426 20 | }, 21 | "all": { 22 | "extractive_match": 0.7, 23 | "extractive_match_stderr": 0.07337993857053426 24 | } 25 | }, 26 | "versions": { 27 | "custom|amc23|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|amc23": { 31 | "name": "amc23", 32 | "prompt_function": "amc_prompt_fn", 33 | "hf_repo": "knoveleng/AMC-23", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | 
"sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "train" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "train" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 40, 64 | "effective_num_docs": 40, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|amc23|0": { 71 | "hashes": { 72 | "hash_examples": "57f3ead69f601b6a", 73 | "hash_full_prompts": "64c05856286ef8dc", 74 | "hash_input_tokens": "b64afe4485ef61c0", 75 | "hash_cont_tokens": "b589babd0b65f01d" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 40, 79 | "padded": 0, 80 | "non_padded": 40, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "fe1f1f4512256bec", 88 | "hash_full_prompts": "882107366cadc5ce", 89 | "hash_input_tokens": "638cd1079164f374", 90 | "hash_cont_tokens": "605820c49e2b0f60" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 40, 94 | "padded": 0, 95 | "non_padded": 40, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /logs/evals/Exp1_200/results/quyanh/OpenRS-GRPO/results_2025-03-15T00-02-07.142924.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 2554717.479748211, 9 | "end_time": 2555438.820359841, 10 | "total_evaluation_time_secondes": "721.3406116301194", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 
13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|aime24|0": { 18 | "extractive_match": 0.3, 19 | "extractive_match_stderr": 0.0850962943396763 20 | }, 21 | "all": { 22 | "extractive_match": 0.3, 23 | "extractive_match_stderr": 0.0850962943396763 24 | } 25 | }, 26 | "versions": { 27 | "custom|aime24|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|aime24": { 31 | "name": "aime24", 32 | "prompt_function": "aime_prompt_fn", 33 | "hf_repo": "HuggingFaceH4/aime_2024", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "train" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "train" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 30, 64 | "effective_num_docs": 30, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|aime24|0": { 71 | "hashes": { 72 | "hash_examples": "ddec8fc79d0a014b", 73 | "hash_full_prompts": "253167becf0dfed7", 74 | "hash_input_tokens": "bf1cc75b5f12dfb8", 75 | "hash_cont_tokens": "b63238639d2fcf76" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 30, 79 | "padded": 0, 80 | "non_padded": 30, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "c903e836a519cf98", 88 | "hash_full_prompts": "84ff409b6bbf7cc0", 89 | "hash_input_tokens": "9a8c7e54ce09af84", 90 | "hash_cont_tokens": "ecb6fd6a17a1ef8e" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 30, 94 | 
"padded": 0, 95 | "non_padded": 30, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /logs/evals/Exp1_200/results/quyanh/OpenRS-GRPO/results_2025-03-15T00-35-55.755190.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 2555529.780258504, 9 | "end_time": 2557467.354353926, 10 | "total_evaluation_time_secondes": "1937.5740954219364", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|math_500|0": { 18 | "extractive_match": 0.798, 19 | "extractive_match_stderr": 0.017973260031288248 20 | }, 21 | "all": { 22 | "extractive_match": 0.798, 23 | "extractive_match_stderr": 0.017973260031288248 24 | } 25 | }, 26 | "versions": { 27 | "custom|math_500|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|math_500": { 31 | "name": "math_500", 32 | "prompt_function": "math_prompt_fn", 33 | "hf_repo": "HuggingFaceH4/MATH-500", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "test" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "test" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 500, 64 | "effective_num_docs": 500, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | 
"summary_tasks": { 70 | "custom|math_500|0": { 71 | "hashes": { 72 | "hash_examples": "adf0cc8311011db2", 73 | "hash_full_prompts": "63c902dbdbaf1552", 74 | "hash_input_tokens": "2af397a095a31139", 75 | "hash_cont_tokens": "eba9b0271e8c8b2f" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 500, 79 | "padded": 0, 80 | "non_padded": 500, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "bfaad1993ff37a60", 88 | "hash_full_prompts": "3ceaaade5cf43911", 89 | "hash_input_tokens": "c663dbac8a64d3e4", 90 | "hash_cont_tokens": "aafc6b7d92bee8a3" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 500, 94 | "padded": 0, 95 | "non_padded": 500, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /logs/evals/Exp1_200/results/quyanh/OpenRS-GRPO/results_2025-03-15T00-48-41.771959.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 2557548.485465054, 9 | "end_time": 2558233.455578798, 10 | "total_evaluation_time_secondes": "684.9701137440279", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|amc23|0": { 18 | "extractive_match": 0.575, 19 | "extractive_match_stderr": 0.07915823166939519 20 | }, 21 | "all": { 22 | "extractive_match": 0.575, 23 | "extractive_match_stderr": 0.07915823166939519 24 | } 25 | }, 26 | "versions": { 27 | "custom|amc23|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|amc23": { 31 | "name": "amc23", 32 | "prompt_function": "amc_prompt_fn", 33 | "hf_repo": "knoveleng/AMC-23", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | 
"higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "train" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "train" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 40, 64 | "effective_num_docs": 40, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|amc23|0": { 71 | "hashes": { 72 | "hash_examples": "57f3ead69f601b6a", 73 | "hash_full_prompts": "64c05856286ef8dc", 74 | "hash_input_tokens": "b64afe4485ef61c0", 75 | "hash_cont_tokens": "ccaa56a9b9fe16b8" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 40, 79 | "padded": 0, 80 | "non_padded": 40, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "fe1f1f4512256bec", 88 | "hash_full_prompts": "882107366cadc5ce", 89 | "hash_input_tokens": "638cd1079164f374", 90 | "hash_cont_tokens": "0acfbefb98e82705" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 40, 94 | "padded": 0, 95 | "non_padded": 40, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /logs/evals/Exp1_200/results/quyanh/OpenRS-GRPO/results_2025-03-15T14-37-59.982413.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 2607292.420535898, 9 | "end_time": 2607991.665164641, 10 | "total_evaluation_time_secondes": 
"699.2446287432685", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|aime24|0": { 18 | "extractive_match": 0.3, 19 | "extractive_match_stderr": 0.0850962943396763 20 | }, 21 | "all": { 22 | "extractive_match": 0.3, 23 | "extractive_match_stderr": 0.0850962943396763 24 | } 25 | }, 26 | "versions": { 27 | "custom|aime24|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|aime24": { 31 | "name": "aime24", 32 | "prompt_function": "aime_prompt_fn", 33 | "hf_repo": "HuggingFaceH4/aime_2024", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "train" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "train" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 30, 64 | "effective_num_docs": 30, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|aime24|0": { 71 | "hashes": { 72 | "hash_examples": "ddec8fc79d0a014b", 73 | "hash_full_prompts": "253167becf0dfed7", 74 | "hash_input_tokens": "bf1cc75b5f12dfb8", 75 | "hash_cont_tokens": "b63238639d2fcf76" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 30, 79 | "padded": 0, 80 | "non_padded": 30, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "c903e836a519cf98", 88 | "hash_full_prompts": "84ff409b6bbf7cc0", 89 | "hash_input_tokens": "9a8c7e54ce09af84", 90 | 
"hash_cont_tokens": "ecb6fd6a17a1ef8e" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 30, 94 | "padded": 0, 95 | "non_padded": 30, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /logs/evals/Exp1_300/results/quyanh/OpenRS-GRPO/results_2025-03-15T01-02-29.088363.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 2558280.776523193, 9 | "end_time": 2559060.750369516, 10 | "total_evaluation_time_secondes": "779.9738463233225", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|aime24|0": { 18 | "extractive_match": 0.26666666666666666, 19 | "extractive_match_stderr": 0.0821175682735253 20 | }, 21 | "all": { 22 | "extractive_match": 0.26666666666666666, 23 | "extractive_match_stderr": 0.0821175682735253 24 | } 25 | }, 26 | "versions": { 27 | "custom|aime24|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|aime24": { 31 | "name": "aime24", 32 | "prompt_function": "aime_prompt_fn", 33 | "hf_repo": "HuggingFaceH4/aime_2024", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "train" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "train" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 
30, 64 | "effective_num_docs": 30, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|aime24|0": { 71 | "hashes": { 72 | "hash_examples": "ddec8fc79d0a014b", 73 | "hash_full_prompts": "253167becf0dfed7", 74 | "hash_input_tokens": "bf1cc75b5f12dfb8", 75 | "hash_cont_tokens": "839a511efa4a6078" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 30, 79 | "padded": 0, 80 | "non_padded": 30, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "c903e836a519cf98", 88 | "hash_full_prompts": "84ff409b6bbf7cc0", 89 | "hash_input_tokens": "9a8c7e54ce09af84", 90 | "hash_cont_tokens": "680d67ec0912d741" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 30, 94 | "padded": 0, 95 | "non_padded": 30, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /logs/evals/Exp1_300/results/quyanh/OpenRS-GRPO/results_2025-03-15T01-51-01.120880.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 2559129.286908899, 9 | "end_time": 2561972.673322505, 10 | "total_evaluation_time_secondes": "2843.3864136058837", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|math_500|0": { 18 | "extractive_match": 0.756, 19 | "extractive_match_stderr": 0.01922673489361459 20 | }, 21 | "all": { 22 | "extractive_match": 0.756, 23 | "extractive_match_stderr": 0.01922673489361459 24 | } 25 | }, 26 | "versions": { 27 | "custom|math_500|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|math_500": { 31 | "name": "math_500", 32 | "prompt_function": "math_prompt_fn", 33 | "hf_repo": 
"HuggingFaceH4/MATH-500", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "test" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "test" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 500, 64 | "effective_num_docs": 500, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|math_500|0": { 71 | "hashes": { 72 | "hash_examples": "adf0cc8311011db2", 73 | "hash_full_prompts": "63c902dbdbaf1552", 74 | "hash_input_tokens": "2af397a095a31139", 75 | "hash_cont_tokens": "f124f232f03e4b3c" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 500, 79 | "padded": 0, 80 | "non_padded": 500, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "bfaad1993ff37a60", 88 | "hash_full_prompts": "3ceaaade5cf43911", 89 | "hash_input_tokens": "c663dbac8a64d3e4", 90 | "hash_cont_tokens": "2afd387b6678b989" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 500, 94 | "padded": 0, 95 | "non_padded": 500, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /logs/evals/Exp1_300/results/quyanh/OpenRS-GRPO/results_2025-03-15T02-04-41.905474.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | 
"job_id": 0, 8 | "start_time": 2562049.988997336, 9 | "end_time": 2562793.583693815, 10 | "total_evaluation_time_secondes": "743.5946964789182", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|amc23|0": { 18 | "extractive_match": 0.575, 19 | "extractive_match_stderr": 0.07915823166939519 20 | }, 21 | "all": { 22 | "extractive_match": 0.575, 23 | "extractive_match_stderr": 0.07915823166939519 24 | } 25 | }, 26 | "versions": { 27 | "custom|amc23|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|amc23": { 31 | "name": "amc23", 32 | "prompt_function": "amc_prompt_fn", 33 | "hf_repo": "knoveleng/AMC-23", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "train" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "train" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 40, 64 | "effective_num_docs": 40, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|amc23|0": { 71 | "hashes": { 72 | "hash_examples": "57f3ead69f601b6a", 73 | "hash_full_prompts": "64c05856286ef8dc", 74 | "hash_input_tokens": "b64afe4485ef61c0", 75 | "hash_cont_tokens": "9dd35889080f8eca" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 40, 79 | "padded": 0, 80 | "non_padded": 40, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": 
"fe1f1f4512256bec", 88 | "hash_full_prompts": "882107366cadc5ce", 89 | "hash_input_tokens": "638cd1079164f374", 90 | "hash_cont_tokens": "812448b1bfa8fbc7" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 40, 94 | "padded": 0, 95 | "non_padded": 40, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /logs/evals/Exp1_400/results/quyanh/OpenRS-GRPO/results_2025-03-15T02-14-11.184777.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 2562859.065263796, 9 | "end_time": 2563362.871206422, 10 | "total_evaluation_time_secondes": "503.8059426257387", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|aime24|0": { 18 | "extractive_match": 0.3333333333333333, 19 | "extractive_match_stderr": 0.08753762190648168 20 | }, 21 | "all": { 22 | "extractive_match": 0.3333333333333333, 23 | "extractive_match_stderr": 0.08753762190648168 24 | } 25 | }, 26 | "versions": { 27 | "custom|aime24|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|aime24": { 31 | "name": "aime24", 32 | "prompt_function": "aime_prompt_fn", 33 | "hf_repo": "HuggingFaceH4/aime_2024", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "train" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "train" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": 
null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 30, 64 | "effective_num_docs": 30, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|aime24|0": { 71 | "hashes": { 72 | "hash_examples": "ddec8fc79d0a014b", 73 | "hash_full_prompts": "253167becf0dfed7", 74 | "hash_input_tokens": "bf1cc75b5f12dfb8", 75 | "hash_cont_tokens": "57cd2595e3fb4e15" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 30, 79 | "padded": 0, 80 | "non_padded": 30, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "c903e836a519cf98", 88 | "hash_full_prompts": "84ff409b6bbf7cc0", 89 | "hash_input_tokens": "9a8c7e54ce09af84", 90 | "hash_cont_tokens": "2ea2a7e8250f8005" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 30, 94 | "padded": 0, 95 | "non_padded": 30, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /logs/evals/Exp1_400/results/quyanh/OpenRS-GRPO/results_2025-03-15T02-36-02.705517.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 2563429.852101547, 9 | "end_time": 2564674.316530785, 10 | "total_evaluation_time_secondes": "1244.4644292378798", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|math_500|0": { 18 | "extractive_match": 0.82, 19 | "extractive_match_stderr": 0.01719859247631427 20 | }, 21 | "all": { 22 | "extractive_match": 0.82, 23 | "extractive_match_stderr": 0.01719859247631427 24 | } 25 | }, 26 | "versions": { 27 | "custom|math_500|0": 1 28 | }, 29 | 
"config_tasks": { 30 | "custom|math_500": { 31 | "name": "math_500", 32 | "prompt_function": "math_prompt_fn", 33 | "hf_repo": "HuggingFaceH4/MATH-500", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "test" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "test" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 500, 64 | "effective_num_docs": 500, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|math_500|0": { 71 | "hashes": { 72 | "hash_examples": "adf0cc8311011db2", 73 | "hash_full_prompts": "63c902dbdbaf1552", 74 | "hash_input_tokens": "2af397a095a31139", 75 | "hash_cont_tokens": "dda7db86b5891548" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 500, 79 | "padded": 0, 80 | "non_padded": 500, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "bfaad1993ff37a60", 88 | "hash_full_prompts": "3ceaaade5cf43911", 89 | "hash_input_tokens": "c663dbac8a64d3e4", 90 | "hash_cont_tokens": "2ed97aefa363786c" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 500, 94 | "padded": 0, 95 | "non_padded": 500, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /logs/evals/Exp1_400/results/quyanh/OpenRS-GRPO/results_2025-03-15T02-44-26.970544.json: -------------------------------------------------------------------------------- 1 | { 2 | 
"config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 2564788.62842531, 9 | "end_time": 2565178.658914464, 10 | "total_evaluation_time_secondes": "390.03048915416", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|amc23|0": { 18 | "extractive_match": 0.675, 19 | "extractive_match_stderr": 0.07499999999999998 20 | }, 21 | "all": { 22 | "extractive_match": 0.675, 23 | "extractive_match_stderr": 0.07499999999999998 24 | } 25 | }, 26 | "versions": { 27 | "custom|amc23|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|amc23": { 31 | "name": "amc23", 32 | "prompt_function": "amc_prompt_fn", 33 | "hf_repo": "knoveleng/AMC-23", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "train" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "train" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 40, 64 | "effective_num_docs": 40, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|amc23|0": { 71 | "hashes": { 72 | "hash_examples": "57f3ead69f601b6a", 73 | "hash_full_prompts": "64c05856286ef8dc", 74 | "hash_input_tokens": "b64afe4485ef61c0", 75 | "hash_cont_tokens": "fdaf5a9fa19c5c25" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 40, 79 | "padded": 0, 80 | "non_padded": 40, 81 | "effective_few_shots": 0.0, 
82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "fe1f1f4512256bec", 88 | "hash_full_prompts": "882107366cadc5ce", 89 | "hash_input_tokens": "638cd1079164f374", 90 | "hash_cont_tokens": "a46bfd91934aff2c" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 40, 94 | "padded": 0, 95 | "non_padded": 40, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /logs/evals/Exp1_500/results/quyanh/OpenRS-GRPO/results_2025-03-15T02-52-04.589810.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 2565227.071577089, 9 | "end_time": 2565636.278604793, 10 | "total_evaluation_time_secondes": "409.2070277039893", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|aime24|0": { 18 | "extractive_match": 0.23333333333333334, 19 | "extractive_match_stderr": 0.07854032324531728 20 | }, 21 | "all": { 22 | "extractive_match": 0.23333333333333334, 23 | "extractive_match_stderr": 0.07854032324531728 24 | } 25 | }, 26 | "versions": { 27 | "custom|aime24|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|aime24": { 31 | "name": "aime24", 32 | "prompt_function": "aime_prompt_fn", 33 | "hf_repo": "HuggingFaceH4/aime_2024", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "train" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "train" 53 | ], 54 | 
"few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 30, 64 | "effective_num_docs": 30, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|aime24|0": { 71 | "hashes": { 72 | "hash_examples": "ddec8fc79d0a014b", 73 | "hash_full_prompts": "253167becf0dfed7", 74 | "hash_input_tokens": "bf1cc75b5f12dfb8", 75 | "hash_cont_tokens": "17884cbea1d03333" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 30, 79 | "padded": 0, 80 | "non_padded": 30, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "c903e836a519cf98", 88 | "hash_full_prompts": "84ff409b6bbf7cc0", 89 | "hash_input_tokens": "9a8c7e54ce09af84", 90 | "hash_cont_tokens": "887aa0bacc8f8844" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 30, 94 | "padded": 0, 95 | "non_padded": 30, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /logs/evals/Exp1_500/results/quyanh/OpenRS-GRPO/results_2025-03-15T03-11-26.478898.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 2565700.44115151, 9 | "end_time": 2566798.118010773, 10 | "total_evaluation_time_secondes": "1097.676859262865", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|math_500|0": { 18 | "extractive_match": 0.798, 19 | "extractive_match_stderr": 0.017973260031288265 20 | }, 21 | "all": { 22 | "extractive_match": 0.798, 23 | 
"extractive_match_stderr": 0.017973260031288265 24 | } 25 | }, 26 | "versions": { 27 | "custom|math_500|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|math_500": { 31 | "name": "math_500", 32 | "prompt_function": "math_prompt_fn", 33 | "hf_repo": "HuggingFaceH4/MATH-500", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "test" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "test" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 500, 64 | "effective_num_docs": 500, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|math_500|0": { 71 | "hashes": { 72 | "hash_examples": "adf0cc8311011db2", 73 | "hash_full_prompts": "63c902dbdbaf1552", 74 | "hash_input_tokens": "2af397a095a31139", 75 | "hash_cont_tokens": "6b327eeca1fea757" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 500, 79 | "padded": 0, 80 | "non_padded": 500, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "bfaad1993ff37a60", 88 | "hash_full_prompts": "3ceaaade5cf43911", 89 | "hash_input_tokens": "c663dbac8a64d3e4", 90 | "hash_cont_tokens": "6dbad7c9862d4052" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 500, 94 | "padded": 0, 95 | "non_padded": 500, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- 
/logs/evals/Exp1_500/results/quyanh/OpenRS-GRPO/results_2025-03-15T03-18-43.114223.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 2566869.350252712, 9 | "end_time": 2567234.803742991, 10 | "total_evaluation_time_secondes": "365.4534902786836", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|amc23|0": { 18 | "extractive_match": 0.675, 19 | "extractive_match_stderr": 0.07499999999999998 20 | }, 21 | "all": { 22 | "extractive_match": 0.675, 23 | "extractive_match_stderr": 0.07499999999999998 24 | } 25 | }, 26 | "versions": { 27 | "custom|amc23|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|amc23": { 31 | "name": "amc23", 32 | "prompt_function": "amc_prompt_fn", 33 | "hf_repo": "knoveleng/AMC-23", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "train" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "train" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 40, 64 | "effective_num_docs": 40, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|amc23|0": { 71 | "hashes": { 72 | "hash_examples": "57f3ead69f601b6a", 73 | "hash_full_prompts": "64c05856286ef8dc", 74 | "hash_input_tokens": 
"b64afe4485ef61c0", 75 | "hash_cont_tokens": "aeca4b45e767c732" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 40, 79 | "padded": 0, 80 | "non_padded": 40, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "fe1f1f4512256bec", 88 | "hash_full_prompts": "882107366cadc5ce", 89 | "hash_input_tokens": "638cd1079164f374", 90 | "hash_cont_tokens": "e916b987c51016d8" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 40, 94 | "padded": 0, 95 | "non_padded": 40, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /logs/evals/Exp2_100/results/quyanh/OpenRS-GRPO/results_2025-03-15T04-13-19.506547.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 2569877.59925465, 9 | "end_time": 2570511.193301663, 10 | "total_evaluation_time_secondes": "633.5940470127389", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|aime24|0": { 18 | "extractive_match": 0.3333333333333333, 19 | "extractive_match_stderr": 0.08753762190648169 20 | }, 21 | "all": { 22 | "extractive_match": 0.3333333333333333, 23 | "extractive_match_stderr": 0.08753762190648169 24 | } 25 | }, 26 | "versions": { 27 | "custom|aime24|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|aime24": { 31 | "name": "aime24", 32 | "prompt_function": "aime_prompt_fn", 33 | "hf_repo": "HuggingFaceH4/aime_2024", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 
44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "train" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "train" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 30, 64 | "effective_num_docs": 30, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|aime24|0": { 71 | "hashes": { 72 | "hash_examples": "ddec8fc79d0a014b", 73 | "hash_full_prompts": "253167becf0dfed7", 74 | "hash_input_tokens": "bf1cc75b5f12dfb8", 75 | "hash_cont_tokens": "9d95073fa199fea1" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 30, 79 | "padded": 0, 80 | "non_padded": 30, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "c903e836a519cf98", 88 | "hash_full_prompts": "84ff409b6bbf7cc0", 89 | "hash_input_tokens": "9a8c7e54ce09af84", 90 | "hash_cont_tokens": "9581b1a436f91769" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 30, 94 | "padded": 0, 95 | "non_padded": 30, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /logs/evals/Exp2_100/results/quyanh/OpenRS-GRPO/results_2025-03-15T04-35-14.751095.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 2570571.236023537, 9 | "end_time": 2571826.342473873, 10 | "total_evaluation_time_secondes": "1255.106450336054", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": 
{ 17 | "custom|math_500|0": { 18 | "extractive_match": 0.856, 19 | "extractive_match_stderr": 0.015716934945725767 20 | }, 21 | "all": { 22 | "extractive_match": 0.856, 23 | "extractive_match_stderr": 0.015716934945725767 24 | } 25 | }, 26 | "versions": { 27 | "custom|math_500|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|math_500": { 31 | "name": "math_500", 32 | "prompt_function": "math_prompt_fn", 33 | "hf_repo": "HuggingFaceH4/MATH-500", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "test" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "test" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 500, 64 | "effective_num_docs": 500, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|math_500|0": { 71 | "hashes": { 72 | "hash_examples": "adf0cc8311011db2", 73 | "hash_full_prompts": "63c902dbdbaf1552", 74 | "hash_input_tokens": "2af397a095a31139", 75 | "hash_cont_tokens": "ce035e0da82bf62c" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 500, 79 | "padded": 0, 80 | "non_padded": 500, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "bfaad1993ff37a60", 88 | "hash_full_prompts": "3ceaaade5cf43911", 89 | "hash_input_tokens": "c663dbac8a64d3e4", 90 | "hash_cont_tokens": "185044a2d01d14ca" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 500, 94 | "padded": 0, 95 | "non_padded": 500, 96 | 
"num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /logs/evals/Exp2_100/results/quyanh/OpenRS-GRPO/results_2025-03-15T04-44-39.651229.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 2571904.690897501, 9 | "end_time": 2572391.342055246, 10 | "total_evaluation_time_secondes": "486.6511577446945", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|amc23|0": { 18 | "extractive_match": 0.7, 19 | "extractive_match_stderr": 0.07337993857053426 20 | }, 21 | "all": { 22 | "extractive_match": 0.7, 23 | "extractive_match_stderr": 0.07337993857053426 24 | } 25 | }, 26 | "versions": { 27 | "custom|amc23|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|amc23": { 31 | "name": "amc23", 32 | "prompt_function": "amc_prompt_fn", 33 | "hf_repo": "knoveleng/AMC-23", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "train" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "train" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 40, 64 | "effective_num_docs": 40, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|amc23|0": { 71 | "hashes": { 72 | 
"hash_examples": "57f3ead69f601b6a", 73 | "hash_full_prompts": "64c05856286ef8dc", 74 | "hash_input_tokens": "b64afe4485ef61c0", 75 | "hash_cont_tokens": "806ab92dedff279f" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 40, 79 | "padded": 0, 80 | "non_padded": 40, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "fe1f1f4512256bec", 88 | "hash_full_prompts": "882107366cadc5ce", 89 | "hash_input_tokens": "638cd1079164f374", 90 | "hash_cont_tokens": "a0e7c914c07ced33" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 40, 94 | "padded": 0, 95 | "non_padded": 40, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /logs/evals/Exp2_150/results/quyanh/OpenRS-GRPO/results_2025-03-15T04-54-03.406254.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 2572478.041585189, 9 | "end_time": 2572955.095596673, 10 | "total_evaluation_time_secondes": "477.0540114841424", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|aime24|0": { 18 | "extractive_match": 0.36666666666666664, 19 | "extractive_match_stderr": 0.0894855453983996 20 | }, 21 | "all": { 22 | "extractive_match": 0.36666666666666664, 23 | "extractive_match_stderr": 0.0894855453983996 24 | } 25 | }, 26 | "versions": { 27 | "custom|aime24|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|aime24": { 31 | "name": "aime24", 32 | "prompt_function": "aime_prompt_fn", 33 | "hf_repo": "HuggingFaceH4/aime_2024", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | 
"category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "train" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "train" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 30, 64 | "effective_num_docs": 30, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|aime24|0": { 71 | "hashes": { 72 | "hash_examples": "ddec8fc79d0a014b", 73 | "hash_full_prompts": "253167becf0dfed7", 74 | "hash_input_tokens": "bf1cc75b5f12dfb8", 75 | "hash_cont_tokens": "39278bd920db3487" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 30, 79 | "padded": 0, 80 | "non_padded": 30, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "c903e836a519cf98", 88 | "hash_full_prompts": "84ff409b6bbf7cc0", 89 | "hash_input_tokens": "9a8c7e54ce09af84", 90 | "hash_cont_tokens": "b9f7ed706698f186" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 30, 94 | "padded": 0, 95 | "non_padded": 30, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /logs/evals/Exp2_150/results/quyanh/OpenRS-GRPO/results_2025-03-15T05-13-57.844846.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 2573032.477990599, 9 | "end_time": 2574149.485158068, 10 | "total_evaluation_time_secondes": "1117.0071674692445", 11 | 
"model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|math_500|0": { 18 | "extractive_match": 0.854, 19 | "extractive_match_stderr": 0.01580720517583485 20 | }, 21 | "all": { 22 | "extractive_match": 0.854, 23 | "extractive_match_stderr": 0.01580720517583485 24 | } 25 | }, 26 | "versions": { 27 | "custom|math_500|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|math_500": { 31 | "name": "math_500", 32 | "prompt_function": "math_prompt_fn", 33 | "hf_repo": "HuggingFaceH4/MATH-500", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "test" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "test" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 500, 64 | "effective_num_docs": 500, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|math_500|0": { 71 | "hashes": { 72 | "hash_examples": "adf0cc8311011db2", 73 | "hash_full_prompts": "63c902dbdbaf1552", 74 | "hash_input_tokens": "2af397a095a31139", 75 | "hash_cont_tokens": "0e843829e85cfe94" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 500, 79 | "padded": 0, 80 | "non_padded": 500, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "bfaad1993ff37a60", 88 | "hash_full_prompts": "3ceaaade5cf43911", 89 | "hash_input_tokens": "c663dbac8a64d3e4", 90 | "hash_cont_tokens": 
"db7b20002a364074" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 500, 94 | "padded": 0, 95 | "non_padded": 500, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /logs/evals/Exp2_150/results/quyanh/OpenRS-GRPO/results_2025-03-15T05-20-07.618719.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 2574199.530597124, 9 | "end_time": 2574519.310388266, 10 | "total_evaluation_time_secondes": "319.77979114232585", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|amc23|0": { 18 | "extractive_match": 0.75, 19 | "extractive_match_stderr": 0.06933752452815363 20 | }, 21 | "all": { 22 | "extractive_match": 0.75, 23 | "extractive_match_stderr": 0.06933752452815363 24 | } 25 | }, 26 | "versions": { 27 | "custom|amc23|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|amc23": { 31 | "name": "amc23", 32 | "prompt_function": "amc_prompt_fn", 33 | "hf_repo": "knoveleng/AMC-23", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "train" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "train" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 40, 64 | "effective_num_docs": 40, 65 | 
"must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|amc23|0": { 71 | "hashes": { 72 | "hash_examples": "57f3ead69f601b6a", 73 | "hash_full_prompts": "64c05856286ef8dc", 74 | "hash_input_tokens": "b64afe4485ef61c0", 75 | "hash_cont_tokens": "6a096939eeb8097d" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 40, 79 | "padded": 0, 80 | "non_padded": 40, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "fe1f1f4512256bec", 88 | "hash_full_prompts": "882107366cadc5ce", 89 | "hash_input_tokens": "638cd1079164f374", 90 | "hash_cont_tokens": "bdec73a60303c3fc" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 40, 94 | "padded": 0, 95 | "non_padded": 40, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /logs/evals/Exp2_200/results/quyanh/OpenRS-GRPO/results_2025-03-15T05-31-02.871828.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 2574556.171576032, 9 | "end_time": 2575174.544545133, 10 | "total_evaluation_time_secondes": "618.3729691011831", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|aime24|0": { 18 | "extractive_match": 0.2, 19 | "extractive_match_stderr": 0.07427813527082075 20 | }, 21 | "all": { 22 | "extractive_match": 0.2, 23 | "extractive_match_stderr": 0.07427813527082075 24 | } 25 | }, 26 | "versions": { 27 | "custom|aime24|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|aime24": { 31 | "name": "aime24", 32 | "prompt_function": "aime_prompt_fn", 33 | "hf_repo": "HuggingFaceH4/aime_2024", 34 | "hf_subset": "default", 
35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "train" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "train" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 30, 64 | "effective_num_docs": 30, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|aime24|0": { 71 | "hashes": { 72 | "hash_examples": "ddec8fc79d0a014b", 73 | "hash_full_prompts": "253167becf0dfed7", 74 | "hash_input_tokens": "bf1cc75b5f12dfb8", 75 | "hash_cont_tokens": "6077ed83d15d6889" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 30, 79 | "padded": 0, 80 | "non_padded": 30, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "c903e836a519cf98", 88 | "hash_full_prompts": "84ff409b6bbf7cc0", 89 | "hash_input_tokens": "9a8c7e54ce09af84", 90 | "hash_cont_tokens": "c1a9ed1d84a0d925" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 30, 94 | "padded": 0, 95 | "non_padded": 30, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /logs/evals/Exp2_200/results/quyanh/OpenRS-GRPO/results_2025-03-15T05-51-04.119358.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 2575244.022829783, 9 | 
"end_time": 2576375.754693394, 10 | "total_evaluation_time_secondes": "1131.7318636109121", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|math_500|0": { 18 | "extractive_match": 0.8, 19 | "extractive_match_stderr": 0.017906459241433855 20 | }, 21 | "all": { 22 | "extractive_match": 0.8, 23 | "extractive_match_stderr": 0.017906459241433855 24 | } 25 | }, 26 | "versions": { 27 | "custom|math_500|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|math_500": { 31 | "name": "math_500", 32 | "prompt_function": "math_prompt_fn", 33 | "hf_repo": "HuggingFaceH4/MATH-500", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "test" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "test" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 500, 64 | "effective_num_docs": 500, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|math_500|0": { 71 | "hashes": { 72 | "hash_examples": "adf0cc8311011db2", 73 | "hash_full_prompts": "63c902dbdbaf1552", 74 | "hash_input_tokens": "2af397a095a31139", 75 | "hash_cont_tokens": "758890b1dfee93f1" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 500, 79 | "padded": 0, 80 | "non_padded": 500, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "bfaad1993ff37a60", 88 | "hash_full_prompts": 
"3ceaaade5cf43911", 89 | "hash_input_tokens": "c663dbac8a64d3e4", 90 | "hash_cont_tokens": "6da72493ad7e20fa" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 500, 94 | "padded": 0, 95 | "non_padded": 500, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /logs/evals/Exp2_200/results/quyanh/OpenRS-GRPO/results_2025-03-15T06-00-23.069698.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 2576450.971500249, 9 | "end_time": 2576934.750199688, 10 | "total_evaluation_time_secondes": "483.77869943901896", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|amc23|0": { 18 | "extractive_match": 0.575, 19 | "extractive_match_stderr": 0.07915823166939519 20 | }, 21 | "all": { 22 | "extractive_match": 0.575, 23 | "extractive_match_stderr": 0.07915823166939519 24 | } 25 | }, 26 | "versions": { 27 | "custom|amc23|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|amc23": { 31 | "name": "amc23", 32 | "prompt_function": "amc_prompt_fn", 33 | "hf_repo": "knoveleng/AMC-23", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "train" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "train" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | 
"custom" 62 | ], 63 | "original_num_docs": 40, 64 | "effective_num_docs": 40, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|amc23|0": { 71 | "hashes": { 72 | "hash_examples": "57f3ead69f601b6a", 73 | "hash_full_prompts": "64c05856286ef8dc", 74 | "hash_input_tokens": "b64afe4485ef61c0", 75 | "hash_cont_tokens": "56e8f52dea2b9ec6" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 40, 79 | "padded": 0, 80 | "non_padded": 40, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "fe1f1f4512256bec", 88 | "hash_full_prompts": "882107366cadc5ce", 89 | "hash_input_tokens": "638cd1079164f374", 90 | "hash_cont_tokens": "6f3eceda2cec8b33" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 40, 94 | "padded": 0, 95 | "non_padded": 40, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /logs/evals/Exp2_250/results/quyanh/OpenRS-GRPO/results_2025-03-15T06-09-49.021006.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 2576972.829442079, 9 | "end_time": 2577500.710391771, 10 | "total_evaluation_time_secondes": "527.8809496918693", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|aime24|0": { 18 | "extractive_match": 0.3, 19 | "extractive_match_stderr": 0.0850962943396763 20 | }, 21 | "all": { 22 | "extractive_match": 0.3, 23 | "extractive_match_stderr": 0.0850962943396763 24 | } 25 | }, 26 | "versions": { 27 | "custom|aime24|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|aime24": { 31 | "name": "aime24", 32 | "prompt_function": 
"aime_prompt_fn", 33 | "hf_repo": "HuggingFaceH4/aime_2024", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "train" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "train" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 30, 64 | "effective_num_docs": 30, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|aime24|0": { 71 | "hashes": { 72 | "hash_examples": "ddec8fc79d0a014b", 73 | "hash_full_prompts": "253167becf0dfed7", 74 | "hash_input_tokens": "bf1cc75b5f12dfb8", 75 | "hash_cont_tokens": "9dea059f367c048a" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 30, 79 | "padded": 0, 80 | "non_padded": 30, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "c903e836a519cf98", 88 | "hash_full_prompts": "84ff409b6bbf7cc0", 89 | "hash_input_tokens": "9a8c7e54ce09af84", 90 | "hash_cont_tokens": "e716ced2d7d9fa84" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 30, 94 | "padded": 0, 95 | "non_padded": 30, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /logs/evals/Exp2_250/results/quyanh/OpenRS-GRPO/results_2025-03-15T06-28-55.702973.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 
| "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 2577587.191756647, 9 | "end_time": 2578647.345242229, 10 | "total_evaluation_time_secondes": "1060.1534855817445", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|math_500|0": { 18 | "extractive_match": 0.826, 19 | "extractive_match_stderr": 0.016971271257516147 20 | }, 21 | "all": { 22 | "extractive_match": 0.826, 23 | "extractive_match_stderr": 0.016971271257516147 24 | } 25 | }, 26 | "versions": { 27 | "custom|math_500|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|math_500": { 31 | "name": "math_500", 32 | "prompt_function": "math_prompt_fn", 33 | "hf_repo": "HuggingFaceH4/MATH-500", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "test" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "test" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 500, 64 | "effective_num_docs": 500, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|math_500|0": { 71 | "hashes": { 72 | "hash_examples": "adf0cc8311011db2", 73 | "hash_full_prompts": "63c902dbdbaf1552", 74 | "hash_input_tokens": "2af397a095a31139", 75 | "hash_cont_tokens": "7b5df8b0938240b1" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 500, 79 | "padded": 0, 80 | "non_padded": 500, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 
| "hashes": { 87 | "hash_examples": "bfaad1993ff37a60", 88 | "hash_full_prompts": "3ceaaade5cf43911", 89 | "hash_input_tokens": "c663dbac8a64d3e4", 90 | "hash_cont_tokens": "e53039268588162c" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 500, 94 | "padded": 0, 95 | "non_padded": 500, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /logs/evals/Exp2_250/results/quyanh/OpenRS-GRPO/results_2025-03-15T06-34-45.013493.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 2578699.013531457, 9 | "end_time": 2578996.706382657, 10 | "total_evaluation_time_secondes": "297.6928511997685", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|amc23|0": { 18 | "extractive_match": 0.65, 19 | "extractive_match_stderr": 0.07637626158259733 20 | }, 21 | "all": { 22 | "extractive_match": 0.65, 23 | "extractive_match_stderr": 0.07637626158259733 24 | } 25 | }, 26 | "versions": { 27 | "custom|amc23|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|amc23": { 31 | "name": "amc23", 32 | "prompt_function": "amc_prompt_fn", 33 | "hf_repo": "knoveleng/AMC-23", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "train" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "train" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 
58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 40, 64 | "effective_num_docs": 40, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|amc23|0": { 71 | "hashes": { 72 | "hash_examples": "57f3ead69f601b6a", 73 | "hash_full_prompts": "64c05856286ef8dc", 74 | "hash_input_tokens": "b64afe4485ef61c0", 75 | "hash_cont_tokens": "0a77fd8b65d5693c" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 40, 79 | "padded": 0, 80 | "non_padded": 40, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "fe1f1f4512256bec", 88 | "hash_full_prompts": "882107366cadc5ce", 89 | "hash_input_tokens": "638cd1079164f374", 90 | "hash_cont_tokens": "a169c269d2eaaac4" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 40, 94 | "padded": 0, 95 | "non_padded": 40, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /logs/evals/Exp2_300/results/quyanh/OpenRS-GRPO/results_2025-03-15T06-42-52.541003.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 2579092.107139461, 9 | "end_time": 2579484.220244874, 10 | "total_evaluation_time_secondes": "392.1131054125726", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|aime24|0": { 18 | "extractive_match": 0.2, 19 | "extractive_match_stderr": 0.07427813527082076 20 | }, 21 | "all": { 22 | "extractive_match": 0.2, 23 | "extractive_match_stderr": 0.07427813527082076 24 | } 25 | }, 26 | "versions": { 27 | "custom|aime24|0": 1 28 | }, 29 | "config_tasks": { 30 | 
"custom|aime24": { 31 | "name": "aime24", 32 | "prompt_function": "aime_prompt_fn", 33 | "hf_repo": "HuggingFaceH4/aime_2024", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "train" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "train" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 30, 64 | "effective_num_docs": 30, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|aime24|0": { 71 | "hashes": { 72 | "hash_examples": "ddec8fc79d0a014b", 73 | "hash_full_prompts": "253167becf0dfed7", 74 | "hash_input_tokens": "bf1cc75b5f12dfb8", 75 | "hash_cont_tokens": "8a299fcdd85d83a4" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 30, 79 | "padded": 0, 80 | "non_padded": 30, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "c903e836a519cf98", 88 | "hash_full_prompts": "84ff409b6bbf7cc0", 89 | "hash_input_tokens": "9a8c7e54ce09af84", 90 | "hash_cont_tokens": "d5533616a87b80ed" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 30, 94 | "padded": 0, 95 | "non_padded": 30, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /logs/evals/Exp2_300/results/quyanh/OpenRS-GRPO/results_2025-03-15T06-56-52.023106.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": 
"?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 2579564.124064897, 9 | "end_time": 2580323.670382602, 10 | "total_evaluation_time_secondes": "759.5463177049533", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|math_500|0": { 18 | "extractive_match": 0.824, 19 | "extractive_match_stderr": 0.017047852020622263 20 | }, 21 | "all": { 22 | "extractive_match": 0.824, 23 | "extractive_match_stderr": 0.017047852020622263 24 | } 25 | }, 26 | "versions": { 27 | "custom|math_500|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|math_500": { 31 | "name": "math_500", 32 | "prompt_function": "math_prompt_fn", 33 | "hf_repo": "HuggingFaceH4/MATH-500", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "test" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "test" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 500, 64 | "effective_num_docs": 500, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|math_500|0": { 71 | "hashes": { 72 | "hash_examples": "adf0cc8311011db2", 73 | "hash_full_prompts": "63c902dbdbaf1552", 74 | "hash_input_tokens": "2af397a095a31139", 75 | "hash_cont_tokens": "b2d7391eac1f81f2" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 500, 79 | "padded": 0, 80 | "non_padded": 500, 81 | "effective_few_shots": 0.0, 82 | 
"num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "bfaad1993ff37a60", 88 | "hash_full_prompts": "3ceaaade5cf43911", 89 | "hash_input_tokens": "c663dbac8a64d3e4", 90 | "hash_cont_tokens": "8e0031f22a0aa723" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 500, 94 | "padded": 0, 95 | "non_padded": 500, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /logs/evals/Exp2_300/results/quyanh/OpenRS-GRPO/results_2025-03-15T07-04-05.239254.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 2580424.328730572, 9 | "end_time": 2580756.931524183, 10 | "total_evaluation_time_secondes": "332.60279361112043", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|amc23|0": { 18 | "extractive_match": 0.675, 19 | "extractive_match_stderr": 0.07499999999999998 20 | }, 21 | "all": { 22 | "extractive_match": 0.675, 23 | "extractive_match_stderr": 0.07499999999999998 24 | } 25 | }, 26 | "versions": { 27 | "custom|amc23|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|amc23": { 31 | "name": "amc23", 32 | "prompt_function": "amc_prompt_fn", 33 | "hf_repo": "knoveleng/AMC-23", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "train" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "train" 53 | ], 54 | "few_shots_split": null, 55 | 
"few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 40, 64 | "effective_num_docs": 40, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|amc23|0": { 71 | "hashes": { 72 | "hash_examples": "57f3ead69f601b6a", 73 | "hash_full_prompts": "64c05856286ef8dc", 74 | "hash_input_tokens": "b64afe4485ef61c0", 75 | "hash_cont_tokens": "b38129b60c769bd6" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 40, 79 | "padded": 0, 80 | "non_padded": 40, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "fe1f1f4512256bec", 88 | "hash_full_prompts": "882107366cadc5ce", 89 | "hash_input_tokens": "638cd1079164f374", 90 | "hash_cont_tokens": "4a03618c47084d90" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 40, 94 | "padded": 0, 95 | "non_padded": 40, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /logs/evals/Exp2_350/results/quyanh/OpenRS-GRPO/results_2025-03-15T07-10-27.028672.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 2580812.042311833, 9 | "end_time": 2581138.714750407, 10 | "total_evaluation_time_secondes": "326.67243857402354", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|aime24|0": { 18 | "extractive_match": 0.3333333333333333, 19 | "extractive_match_stderr": 0.0875376219064817 20 | }, 21 | "all": { 22 | "extractive_match": 0.3333333333333333, 23 | 
"extractive_match_stderr": 0.0875376219064817 24 | } 25 | }, 26 | "versions": { 27 | "custom|aime24|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|aime24": { 31 | "name": "aime24", 32 | "prompt_function": "aime_prompt_fn", 33 | "hf_repo": "HuggingFaceH4/aime_2024", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "train" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "train" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 30, 64 | "effective_num_docs": 30, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|aime24|0": { 71 | "hashes": { 72 | "hash_examples": "ddec8fc79d0a014b", 73 | "hash_full_prompts": "253167becf0dfed7", 74 | "hash_input_tokens": "bf1cc75b5f12dfb8", 75 | "hash_cont_tokens": "314d40cde0446e00" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 30, 79 | "padded": 0, 80 | "non_padded": 30, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "c903e836a519cf98", 88 | "hash_full_prompts": "84ff409b6bbf7cc0", 89 | "hash_input_tokens": "9a8c7e54ce09af84", 90 | "hash_cont_tokens": "83c2fe7147a45ed0" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 30, 94 | "padded": 0, 95 | "non_padded": 30, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- 
/logs/evals/Exp2_350/results/quyanh/OpenRS-GRPO/results_2025-03-15T07-24-32.807123.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 2581209.555826082, 9 | "end_time": 2581984.464070361, 10 | "total_evaluation_time_secondes": "774.9082442792132", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|math_500|0": { 18 | "extractive_match": 0.814, 19 | "extractive_match_stderr": 0.01741880678058394 20 | }, 21 | "all": { 22 | "extractive_match": 0.814, 23 | "extractive_match_stderr": 0.01741880678058394 24 | } 25 | }, 26 | "versions": { 27 | "custom|math_500|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|math_500": { 31 | "name": "math_500", 32 | "prompt_function": "math_prompt_fn", 33 | "hf_repo": "HuggingFaceH4/MATH-500", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "test" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "test" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 500, 64 | "effective_num_docs": 500, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|math_500|0": { 71 | "hashes": { 72 | "hash_examples": "adf0cc8311011db2", 73 | "hash_full_prompts": "63c902dbdbaf1552", 74 | 
"hash_input_tokens": "2af397a095a31139", 75 | "hash_cont_tokens": "a050975778a6be25" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 500, 79 | "padded": 0, 80 | "non_padded": 500, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "bfaad1993ff37a60", 88 | "hash_full_prompts": "3ceaaade5cf43911", 89 | "hash_input_tokens": "c663dbac8a64d3e4", 90 | "hash_cont_tokens": "20a4136b5748e4f1" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 500, 94 | "padded": 0, 95 | "non_padded": 500, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /logs/evals/Exp2_350/results/quyanh/OpenRS-GRPO/results_2025-03-15T07-29-47.478300.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 2582057.657695042, 9 | "end_time": 2582299.170243019, 10 | "total_evaluation_time_secondes": "241.51254797680303", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|amc23|0": { 18 | "extractive_match": 0.625, 19 | "extractive_match_stderr": 0.07752170911825529 20 | }, 21 | "all": { 22 | "extractive_match": 0.625, 23 | "extractive_match_stderr": 0.07752170911825529 24 | } 25 | }, 26 | "versions": { 27 | "custom|amc23|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|amc23": { 31 | "name": "amc23", 32 | "prompt_function": "amc_prompt_fn", 33 | "hf_repo": "knoveleng/AMC-23", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 
| "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "train" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "train" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 40, 64 | "effective_num_docs": 40, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|amc23|0": { 71 | "hashes": { 72 | "hash_examples": "57f3ead69f601b6a", 73 | "hash_full_prompts": "64c05856286ef8dc", 74 | "hash_input_tokens": "b64afe4485ef61c0", 75 | "hash_cont_tokens": "02bf8270f672d019" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 40, 79 | "padded": 0, 80 | "non_padded": 40, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "fe1f1f4512256bec", 88 | "hash_full_prompts": "882107366cadc5ce", 89 | "hash_input_tokens": "638cd1079164f374", 90 | "hash_cont_tokens": "9bcd38a007e3c1f7" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 40, 94 | "padded": 0, 95 | "non_padded": 40, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /logs/evals/Exp2_400/results/quyanh/OpenRS-GRPO/results_2025-03-15T07-36-30.994685.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 2582347.378834666, 9 | "end_time": 2582702.685794419, 10 | "total_evaluation_time_secondes": "355.30695975292474", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | 
"custom|aime24|0": { 18 | "extractive_match": 0.26666666666666666, 19 | "extractive_match_stderr": 0.0821175682735253 20 | }, 21 | "all": { 22 | "extractive_match": 0.26666666666666666, 23 | "extractive_match_stderr": 0.0821175682735253 24 | } 25 | }, 26 | "versions": { 27 | "custom|aime24|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|aime24": { 31 | "name": "aime24", 32 | "prompt_function": "aime_prompt_fn", 33 | "hf_repo": "HuggingFaceH4/aime_2024", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "train" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "train" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 30, 64 | "effective_num_docs": 30, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|aime24|0": { 71 | "hashes": { 72 | "hash_examples": "ddec8fc79d0a014b", 73 | "hash_full_prompts": "253167becf0dfed7", 74 | "hash_input_tokens": "bf1cc75b5f12dfb8", 75 | "hash_cont_tokens": "77b972bf9c1cfb26" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 30, 79 | "padded": 0, 80 | "non_padded": 30, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "c903e836a519cf98", 88 | "hash_full_prompts": "84ff409b6bbf7cc0", 89 | "hash_input_tokens": "9a8c7e54ce09af84", 90 | "hash_cont_tokens": "585ab41a6bd09268" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 30, 94 | "padded": 0, 95 | "non_padded": 30, 96 | 
"num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /logs/evals/Exp2_400/results/quyanh/OpenRS-GRPO/results_2025-03-15T07-49-32.062451.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 2582781.263596611, 9 | "end_time": 2583483.690574673, 10 | "total_evaluation_time_secondes": "702.426978061907", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|math_500|0": { 18 | "extractive_match": 0.822, 19 | "extractive_match_stderr": 0.017123622189062257 20 | }, 21 | "all": { 22 | "extractive_match": 0.822, 23 | "extractive_match_stderr": 0.017123622189062257 24 | } 25 | }, 26 | "versions": { 27 | "custom|math_500|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|math_500": { 31 | "name": "math_500", 32 | "prompt_function": "math_prompt_fn", 33 | "hf_repo": "HuggingFaceH4/MATH-500", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "test" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "test" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 500, 64 | "effective_num_docs": 500, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | 
"custom|math_500|0": { 71 | "hashes": { 72 | "hash_examples": "adf0cc8311011db2", 73 | "hash_full_prompts": "63c902dbdbaf1552", 74 | "hash_input_tokens": "2af397a095a31139", 75 | "hash_cont_tokens": "36524c3d10393f7f" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 500, 79 | "padded": 0, 80 | "non_padded": 500, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "bfaad1993ff37a60", 88 | "hash_full_prompts": "3ceaaade5cf43911", 89 | "hash_input_tokens": "c663dbac8a64d3e4", 90 | "hash_cont_tokens": "e4920254817e7bfc" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 500, 94 | "padded": 0, 95 | "non_padded": 500, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /logs/evals/Exp2_400/results/quyanh/OpenRS-GRPO/results_2025-03-15T07-55-51.416032.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 2583567.839449735, 9 | "end_time": 2583863.106798447, 10 | "total_evaluation_time_secondes": "295.2673487123102", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|amc23|0": { 18 | "extractive_match": 0.7, 19 | "extractive_match_stderr": 0.07337993857053426 20 | }, 21 | "all": { 22 | "extractive_match": 0.7, 23 | "extractive_match_stderr": 0.07337993857053426 24 | } 25 | }, 26 | "versions": { 27 | "custom|amc23|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|amc23": { 31 | "name": "amc23", 32 | "prompt_function": "amc_prompt_fn", 33 | "hf_repo": "knoveleng/AMC-23", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | 
"category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "train" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "train" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 40, 64 | "effective_num_docs": 40, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|amc23|0": { 71 | "hashes": { 72 | "hash_examples": "57f3ead69f601b6a", 73 | "hash_full_prompts": "64c05856286ef8dc", 74 | "hash_input_tokens": "b64afe4485ef61c0", 75 | "hash_cont_tokens": "f52cd39075e5792e" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 40, 79 | "padded": 0, 80 | "non_padded": 40, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "fe1f1f4512256bec", 88 | "hash_full_prompts": "882107366cadc5ce", 89 | "hash_input_tokens": "638cd1079164f374", 90 | "hash_cont_tokens": "17e169df3b2c15f2" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 40, 94 | "padded": 0, 95 | "non_padded": 40, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /logs/evals/Exp2_50/results/quyanh/OpenRS-GRPO/results_2025-03-15T03-30-59.655917.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 2567298.375852412, 9 | "end_time": 2567971.3367268, 10 | "total_evaluation_time_secondes": "672.9608743879944", 11 | "model_name": 
"quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|aime24|0": { 18 | "extractive_match": 0.3, 19 | "extractive_match_stderr": 0.0850962943396763 20 | }, 21 | "all": { 22 | "extractive_match": 0.3, 23 | "extractive_match_stderr": 0.0850962943396763 24 | } 25 | }, 26 | "versions": { 27 | "custom|aime24|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|aime24": { 31 | "name": "aime24", 32 | "prompt_function": "aime_prompt_fn", 33 | "hf_repo": "HuggingFaceH4/aime_2024", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "train" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "train" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 30, 64 | "effective_num_docs": 30, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|aime24|0": { 71 | "hashes": { 72 | "hash_examples": "ddec8fc79d0a014b", 73 | "hash_full_prompts": "253167becf0dfed7", 74 | "hash_input_tokens": "bf1cc75b5f12dfb8", 75 | "hash_cont_tokens": "76473421bbe93410" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 30, 79 | "padded": 0, 80 | "non_padded": 30, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "c903e836a519cf98", 88 | "hash_full_prompts": "84ff409b6bbf7cc0", 89 | "hash_input_tokens": "9a8c7e54ce09af84", 90 | "hash_cont_tokens": "a2125e595bab4673" 91 | }, 92 | 
"truncated": 0, 93 | "non_truncated": 30, 94 | "padded": 0, 95 | "non_padded": 30, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /logs/evals/Exp2_50/results/quyanh/OpenRS-GRPO/results_2025-03-15T03-53-53.011460.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 2568037.999388426, 9 | "end_time": 2569344.647350468, 10 | "total_evaluation_time_secondes": "1306.6479620421305", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|math_500|0": { 18 | "extractive_match": 0.854, 19 | "extractive_match_stderr": 0.01580720517583485 20 | }, 21 | "all": { 22 | "extractive_match": 0.854, 23 | "extractive_match_stderr": 0.01580720517583485 24 | } 25 | }, 26 | "versions": { 27 | "custom|math_500|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|math_500": { 31 | "name": "math_500", 32 | "prompt_function": "math_prompt_fn", 33 | "hf_repo": "HuggingFaceH4/MATH-500", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "test" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "test" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 500, 64 | "effective_num_docs": 500, 65 | "must_remove_duplicate_docs": 
false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|math_500|0": { 71 | "hashes": { 72 | "hash_examples": "adf0cc8311011db2", 73 | "hash_full_prompts": "63c902dbdbaf1552", 74 | "hash_input_tokens": "2af397a095a31139", 75 | "hash_cont_tokens": "961808827104c739" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 500, 79 | "padded": 0, 80 | "non_padded": 500, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "bfaad1993ff37a60", 88 | "hash_full_prompts": "3ceaaade5cf43911", 89 | "hash_input_tokens": "c663dbac8a64d3e4", 90 | "hash_cont_tokens": "0a4348ce590f191d" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 500, 94 | "padded": 0, 95 | "non_padded": 500, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /logs/evals/Exp2_50/results/quyanh/OpenRS-GRPO/results_2025-03-15T04-01-54.627135.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 2569429.209718564, 9 | "end_time": 2569826.319747058, 10 | "total_evaluation_time_secondes": "397.11002849414945", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|amc23|0": { 18 | "extractive_match": 0.8, 19 | "extractive_match_stderr": 0.06405126152203487 20 | }, 21 | "all": { 22 | "extractive_match": 0.8, 23 | "extractive_match_stderr": 0.06405126152203487 24 | } 25 | }, 26 | "versions": { 27 | "custom|amc23|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|amc23": { 31 | "name": "amc23", 32 | "prompt_function": "amc_prompt_fn", 33 | "hf_repo": "knoveleng/AMC-23", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | 
"metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "train" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "train" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 40, 64 | "effective_num_docs": 40, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|amc23|0": { 71 | "hashes": { 72 | "hash_examples": "57f3ead69f601b6a", 73 | "hash_full_prompts": "64c05856286ef8dc", 74 | "hash_input_tokens": "b64afe4485ef61c0", 75 | "hash_cont_tokens": "d1d854bc50c70679" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 40, 79 | "padded": 0, 80 | "non_padded": 40, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "fe1f1f4512256bec", 88 | "hash_full_prompts": "882107366cadc5ce", 89 | "hash_input_tokens": "638cd1079164f374", 90 | "hash_cont_tokens": "198d7b3d60a17a97" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 40, 94 | "padded": 0, 95 | "non_padded": 40, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /logs/evals/Exp3_100/results/quyanh/OpenRS-GRPO/results_2025-03-15T08-52-22.004802.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 2586637.696856398, 9 | "end_time": 2587253.686011266, 10 | 
"total_evaluation_time_secondes": "615.9891548678279", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|aime24|0": { 18 | "extractive_match": 0.3, 19 | "extractive_match_stderr": 0.08509629433967632 20 | }, 21 | "all": { 22 | "extractive_match": 0.3, 23 | "extractive_match_stderr": 0.08509629433967632 24 | } 25 | }, 26 | "versions": { 27 | "custom|aime24|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|aime24": { 31 | "name": "aime24", 32 | "prompt_function": "aime_prompt_fn", 33 | "hf_repo": "HuggingFaceH4/aime_2024", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "train" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "train" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 30, 64 | "effective_num_docs": 30, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|aime24|0": { 71 | "hashes": { 72 | "hash_examples": "ddec8fc79d0a014b", 73 | "hash_full_prompts": "253167becf0dfed7", 74 | "hash_input_tokens": "bf1cc75b5f12dfb8", 75 | "hash_cont_tokens": "f899e2531ad5b6e1" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 30, 79 | "padded": 0, 80 | "non_padded": 30, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "c903e836a519cf98", 88 | "hash_full_prompts": "84ff409b6bbf7cc0", 89 | "hash_input_tokens": 
"9a8c7e54ce09af84", 90 | "hash_cont_tokens": "a424417871d83f65" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 30, 94 | "padded": 0, 95 | "non_padded": 30, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /logs/evals/Exp3_100/results/quyanh/OpenRS-GRPO/results_2025-03-15T09-16-52.530386.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 2587310.198941251, 9 | "end_time": 2588724.15183972, 10 | "total_evaluation_time_secondes": "1413.952898469288", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|math_500|0": { 18 | "extractive_match": 0.844, 19 | "extractive_match_stderr": 0.016243636028391104 20 | }, 21 | "all": { 22 | "extractive_match": 0.844, 23 | "extractive_match_stderr": 0.016243636028391104 24 | } 25 | }, 26 | "versions": { 27 | "custom|math_500|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|math_500": { 31 | "name": "math_500", 32 | "prompt_function": "math_prompt_fn", 33 | "hf_repo": "HuggingFaceH4/MATH-500", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "test" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "test" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | 
"original_num_docs": 500, 64 | "effective_num_docs": 500, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|math_500|0": { 71 | "hashes": { 72 | "hash_examples": "adf0cc8311011db2", 73 | "hash_full_prompts": "63c902dbdbaf1552", 74 | "hash_input_tokens": "2af397a095a31139", 75 | "hash_cont_tokens": "0ff22df8a2e32ff6" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 500, 79 | "padded": 0, 80 | "non_padded": 500, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "bfaad1993ff37a60", 88 | "hash_full_prompts": "3ceaaade5cf43911", 89 | "hash_input_tokens": "c663dbac8a64d3e4", 90 | "hash_cont_tokens": "13d8e0072ee283ae" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 500, 94 | "padded": 0, 95 | "non_padded": 500, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /logs/evals/Exp3_100/results/quyanh/OpenRS-GRPO/results_2025-03-15T09-26-36.766492.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 2588822.652426989, 9 | "end_time": 2589308.453780031, 10 | "total_evaluation_time_secondes": "485.80135304201394", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|amc23|0": { 18 | "extractive_match": 0.7, 19 | "extractive_match_stderr": 0.07337993857053426 20 | }, 21 | "all": { 22 | "extractive_match": 0.7, 23 | "extractive_match_stderr": 0.07337993857053426 24 | } 25 | }, 26 | "versions": { 27 | "custom|amc23|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|amc23": { 31 | "name": "amc23", 32 | "prompt_function": "amc_prompt_fn", 33 | 
"hf_repo": "knoveleng/AMC-23", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "train" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "train" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 40, 64 | "effective_num_docs": 40, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|amc23|0": { 71 | "hashes": { 72 | "hash_examples": "57f3ead69f601b6a", 73 | "hash_full_prompts": "64c05856286ef8dc", 74 | "hash_input_tokens": "b64afe4485ef61c0", 75 | "hash_cont_tokens": "cff1c1f58a10c3ec" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 40, 79 | "padded": 0, 80 | "non_padded": 40, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "fe1f1f4512256bec", 88 | "hash_full_prompts": "882107366cadc5ce", 89 | "hash_input_tokens": "638cd1079164f374", 90 | "hash_cont_tokens": "9f14f44a8006a380" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 40, 94 | "padded": 0, 95 | "non_padded": 40, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /logs/evals/Exp3_150/results/quyanh/OpenRS-GRPO/results_2025-03-15T09-36-59.825027.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | 
"job_id": 0, 8 | "start_time": 2589368.024663066, 9 | "end_time": 2589931.508379323, 10 | "total_evaluation_time_secondes": "563.4837162569165", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|aime24|0": { 18 | "extractive_match": 0.43333333333333335, 19 | "extractive_match_stderr": 0.0920186554465537 20 | }, 21 | "all": { 22 | "extractive_match": 0.43333333333333335, 23 | "extractive_match_stderr": 0.0920186554465537 24 | } 25 | }, 26 | "versions": { 27 | "custom|aime24|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|aime24": { 31 | "name": "aime24", 32 | "prompt_function": "aime_prompt_fn", 33 | "hf_repo": "HuggingFaceH4/aime_2024", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "train" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "train" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 30, 64 | "effective_num_docs": 30, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|aime24|0": { 71 | "hashes": { 72 | "hash_examples": "ddec8fc79d0a014b", 73 | "hash_full_prompts": "253167becf0dfed7", 74 | "hash_input_tokens": "bf1cc75b5f12dfb8", 75 | "hash_cont_tokens": "5f98c2f013b48f03" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 30, 79 | "padded": 0, 80 | "non_padded": 30, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 
87 | "hash_examples": "c903e836a519cf98", 88 | "hash_full_prompts": "84ff409b6bbf7cc0", 89 | "hash_input_tokens": "9a8c7e54ce09af84", 90 | "hash_cont_tokens": "ed47f783736df3bf" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 30, 94 | "padded": 0, 95 | "non_padded": 30, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /logs/evals/Exp3_150/results/quyanh/OpenRS-GRPO/results_2025-03-15T09-55-14.957560.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 2589983.778860533, 9 | "end_time": 2591026.655002043, 10 | "total_evaluation_time_secondes": "1042.8761415099725", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|math_500|0": { 18 | "extractive_match": 0.856, 19 | "extractive_match_stderr": 0.015716934945725767 20 | }, 21 | "all": { 22 | "extractive_match": 0.856, 23 | "extractive_match_stderr": 0.015716934945725767 24 | } 25 | }, 26 | "versions": { 27 | "custom|math_500|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|math_500": { 31 | "name": "math_500", 32 | "prompt_function": "math_prompt_fn", 33 | "hf_repo": "HuggingFaceH4/MATH-500", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "test" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "test" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": 
null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 500, 64 | "effective_num_docs": 500, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|math_500|0": { 71 | "hashes": { 72 | "hash_examples": "adf0cc8311011db2", 73 | "hash_full_prompts": "63c902dbdbaf1552", 74 | "hash_input_tokens": "2af397a095a31139", 75 | "hash_cont_tokens": "0fab9c626e9e9390" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 500, 79 | "padded": 0, 80 | "non_padded": 500, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "bfaad1993ff37a60", 88 | "hash_full_prompts": "3ceaaade5cf43911", 89 | "hash_input_tokens": "c663dbac8a64d3e4", 90 | "hash_cont_tokens": "a22503828d7d43e3" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 500, 94 | "padded": 0, 95 | "non_padded": 500, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /logs/evals/Exp3_150/results/quyanh/OpenRS-GRPO/results_2025-03-15T10-04-33.528650.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 2591099.695949811, 9 | "end_time": 2591585.220585029, 10 | "total_evaluation_time_secondes": "485.52463521808386", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|amc23|0": { 18 | "extractive_match": 0.75, 19 | "extractive_match_stderr": 0.06933752452815363 20 | }, 21 | "all": { 22 | "extractive_match": 0.75, 23 | "extractive_match_stderr": 0.06933752452815363 24 | } 25 | }, 26 | "versions": { 27 | "custom|amc23|0": 1 28 | }, 29 | 
"config_tasks": { 30 | "custom|amc23": { 31 | "name": "amc23", 32 | "prompt_function": "amc_prompt_fn", 33 | "hf_repo": "knoveleng/AMC-23", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "train" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "train" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 40, 64 | "effective_num_docs": 40, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|amc23|0": { 71 | "hashes": { 72 | "hash_examples": "57f3ead69f601b6a", 73 | "hash_full_prompts": "64c05856286ef8dc", 74 | "hash_input_tokens": "b64afe4485ef61c0", 75 | "hash_cont_tokens": "1b6a1b4002eec6c6" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 40, 79 | "padded": 0, 80 | "non_padded": 40, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "fe1f1f4512256bec", 88 | "hash_full_prompts": "882107366cadc5ce", 89 | "hash_input_tokens": "638cd1079164f374", 90 | "hash_cont_tokens": "21b4cbba6be064a5" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 40, 94 | "padded": 0, 95 | "non_padded": 40, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /logs/evals/Exp3_200/results/quyanh/OpenRS-GRPO/results_2025-03-15T10-33-42.826955.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | 
"lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 2592299.508424689, 9 | "end_time": 2593334.475003944, 10 | "total_evaluation_time_secondes": "1034.9665792547166", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|math_500|0": { 18 | "extractive_match": 0.852, 19 | "extractive_match_stderr": 0.015896458561251246 20 | }, 21 | "all": { 22 | "extractive_match": 0.852, 23 | "extractive_match_stderr": 0.015896458561251246 24 | } 25 | }, 26 | "versions": { 27 | "custom|math_500|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|math_500": { 31 | "name": "math_500", 32 | "prompt_function": "math_prompt_fn", 33 | "hf_repo": "HuggingFaceH4/MATH-500", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "test" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "test" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 500, 64 | "effective_num_docs": 500, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|math_500|0": { 71 | "hashes": { 72 | "hash_examples": "adf0cc8311011db2", 73 | "hash_full_prompts": "63c902dbdbaf1552", 74 | "hash_input_tokens": "2af397a095a31139", 75 | "hash_cont_tokens": "e3e867c652ccb694" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 500, 79 | "padded": 0, 80 | "non_padded": 500, 81 | "effective_few_shots": 
0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "bfaad1993ff37a60", 88 | "hash_full_prompts": "3ceaaade5cf43911", 89 | "hash_input_tokens": "c663dbac8a64d3e4", 90 | "hash_cont_tokens": "d6ee582ea943bd6e" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 500, 94 | "padded": 0, 95 | "non_padded": 500, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /logs/evals/Exp3_200/results/quyanh/OpenRS-GRPO/results_2025-03-15T10-39-54.446838.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 2593401.704100977, 9 | "end_time": 2593706.140676645, 10 | "total_evaluation_time_secondes": "304.43657566793263", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|amc23|0": { 18 | "extractive_match": 0.775, 19 | "extractive_match_stderr": 0.06686668711812967 20 | }, 21 | "all": { 22 | "extractive_match": 0.775, 23 | "extractive_match_stderr": 0.06686668711812967 24 | } 25 | }, 26 | "versions": { 27 | "custom|amc23|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|amc23": { 31 | "name": "amc23", 32 | "prompt_function": "amc_prompt_fn", 33 | "hf_repo": "knoveleng/AMC-23", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "train" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "train" 53 | ], 54 | "few_shots_split": null, 55 | 
"few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 40, 64 | "effective_num_docs": 40, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|amc23|0": { 71 | "hashes": { 72 | "hash_examples": "57f3ead69f601b6a", 73 | "hash_full_prompts": "64c05856286ef8dc", 74 | "hash_input_tokens": "b64afe4485ef61c0", 75 | "hash_cont_tokens": "e1ea54b6a1bf95b4" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 40, 79 | "padded": 0, 80 | "non_padded": 40, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "fe1f1f4512256bec", 88 | "hash_full_prompts": "882107366cadc5ce", 89 | "hash_input_tokens": "638cd1079164f374", 90 | "hash_cont_tokens": "a71856567a33efac" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 40, 94 | "padded": 0, 95 | "non_padded": 40, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /logs/evals/Exp3_250/results/quyanh/OpenRS-GRPO/results_2025-03-15T10-50-49.693635.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 2593753.251117188, 9 | "end_time": 2594361.376800723, 10 | "total_evaluation_time_secondes": "608.1256835348904", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|aime24|0": { 18 | "extractive_match": 0.3333333333333333, 19 | "extractive_match_stderr": 0.08753762190648169 20 | }, 21 | "all": { 22 | "extractive_match": 0.3333333333333333, 23 | 
"extractive_match_stderr": 0.08753762190648169 24 | } 25 | }, 26 | "versions": { 27 | "custom|aime24|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|aime24": { 31 | "name": "aime24", 32 | "prompt_function": "aime_prompt_fn", 33 | "hf_repo": "HuggingFaceH4/aime_2024", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "train" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "train" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 30, 64 | "effective_num_docs": 30, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|aime24|0": { 71 | "hashes": { 72 | "hash_examples": "ddec8fc79d0a014b", 73 | "hash_full_prompts": "253167becf0dfed7", 74 | "hash_input_tokens": "bf1cc75b5f12dfb8", 75 | "hash_cont_tokens": "490dc46f67c29942" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 30, 79 | "padded": 0, 80 | "non_padded": 30, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "c903e836a519cf98", 88 | "hash_full_prompts": "84ff409b6bbf7cc0", 89 | "hash_input_tokens": "9a8c7e54ce09af84", 90 | "hash_cont_tokens": "18c8551d8a1015a6" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 30, 94 | "padded": 0, 95 | "non_padded": 30, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- 
/logs/evals/Exp3_250/results/quyanh/OpenRS-GRPO/results_2025-03-15T11-11-55.345666.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 2594416.381958949, 9 | "end_time": 2595626.985790251, 10 | "total_evaluation_time_secondes": "1210.603831301909", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|math_500|0": { 18 | "extractive_match": 0.804, 19 | "extractive_match_stderr": 0.017770751227744862 20 | }, 21 | "all": { 22 | "extractive_match": 0.804, 23 | "extractive_match_stderr": 0.017770751227744862 24 | } 25 | }, 26 | "versions": { 27 | "custom|math_500|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|math_500": { 31 | "name": "math_500", 32 | "prompt_function": "math_prompt_fn", 33 | "hf_repo": "HuggingFaceH4/MATH-500", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "test" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "test" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 500, 64 | "effective_num_docs": 500, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|math_500|0": { 71 | "hashes": { 72 | "hash_examples": "adf0cc8311011db2", 73 | "hash_full_prompts": "63c902dbdbaf1552", 74 | 
"hash_input_tokens": "2af397a095a31139", 75 | "hash_cont_tokens": "1678101c78f1f285" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 500, 79 | "padded": 0, 80 | "non_padded": 500, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "bfaad1993ff37a60", 88 | "hash_full_prompts": "3ceaaade5cf43911", 89 | "hash_input_tokens": "c663dbac8a64d3e4", 90 | "hash_cont_tokens": "9f08e5d1c86fbf9c" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 500, 94 | "padded": 0, 95 | "non_padded": 500, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /logs/evals/Exp3_250/results/quyanh/OpenRS-GRPO/results_2025-03-15T11-22-45.088452.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 2595694.940795575, 9 | "end_time": 2596276.773000479, 10 | "total_evaluation_time_secondes": "581.8322049044073", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|amc23|0": { 18 | "extractive_match": 0.6, 19 | "extractive_match_stderr": 0.07844645405527362 20 | }, 21 | "all": { 22 | "extractive_match": 0.6, 23 | "extractive_match_stderr": 0.07844645405527362 24 | } 25 | }, 26 | "versions": { 27 | "custom|amc23|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|amc23": { 31 | "name": "amc23", 32 | "prompt_function": "amc_prompt_fn", 33 | "hf_repo": "knoveleng/AMC-23", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | 
"hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "train" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "train" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 40, 64 | "effective_num_docs": 40, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|amc23|0": { 71 | "hashes": { 72 | "hash_examples": "57f3ead69f601b6a", 73 | "hash_full_prompts": "64c05856286ef8dc", 74 | "hash_input_tokens": "b64afe4485ef61c0", 75 | "hash_cont_tokens": "597b0bb4fc1adfd9" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 40, 79 | "padded": 0, 80 | "non_padded": 40, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "fe1f1f4512256bec", 88 | "hash_full_prompts": "882107366cadc5ce", 89 | "hash_input_tokens": "638cd1079164f374", 90 | "hash_cont_tokens": "66cde9f29d4c2f5c" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 40, 94 | "padded": 0, 95 | "non_padded": 40, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /logs/evals/Exp3_300/results/quyanh/OpenRS-GRPO/results_2025-03-15T11-33-37.815028.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 2596337.815541219, 9 | "end_time": 2596929.497024281, 10 | "total_evaluation_time_secondes": "591.6814830619842", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | 
"custom|aime24|0": { 18 | "extractive_match": 0.26666666666666666, 19 | "extractive_match_stderr": 0.0821175682735253 20 | }, 21 | "all": { 22 | "extractive_match": 0.26666666666666666, 23 | "extractive_match_stderr": 0.0821175682735253 24 | } 25 | }, 26 | "versions": { 27 | "custom|aime24|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|aime24": { 31 | "name": "aime24", 32 | "prompt_function": "aime_prompt_fn", 33 | "hf_repo": "HuggingFaceH4/aime_2024", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "train" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "train" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 30, 64 | "effective_num_docs": 30, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|aime24|0": { 71 | "hashes": { 72 | "hash_examples": "ddec8fc79d0a014b", 73 | "hash_full_prompts": "253167becf0dfed7", 74 | "hash_input_tokens": "bf1cc75b5f12dfb8", 75 | "hash_cont_tokens": "d893bd40128f3f01" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 30, 79 | "padded": 0, 80 | "non_padded": 30, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "c903e836a519cf98", 88 | "hash_full_prompts": "84ff409b6bbf7cc0", 89 | "hash_input_tokens": "9a8c7e54ce09af84", 90 | "hash_cont_tokens": "304e7ec495522d6e" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 30, 94 | "padded": 0, 95 | "non_padded": 30, 96 | 
"num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /logs/evals/Exp3_300/results/quyanh/OpenRS-GRPO/results_2025-03-15T11-56-26.434504.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 2596993.780300828, 9 | "end_time": 2598298.12918594, 10 | "total_evaluation_time_secondes": "1304.3488851119764", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|math_500|0": { 18 | "extractive_match": 0.83, 19 | "extractive_match_stderr": 0.016815633531393415 20 | }, 21 | "all": { 22 | "extractive_match": 0.83, 23 | "extractive_match_stderr": 0.016815633531393415 24 | } 25 | }, 26 | "versions": { 27 | "custom|math_500|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|math_500": { 31 | "name": "math_500", 32 | "prompt_function": "math_prompt_fn", 33 | "hf_repo": "HuggingFaceH4/MATH-500", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "test" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "test" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 500, 64 | "effective_num_docs": 500, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | 
"custom|math_500|0": { 71 | "hashes": { 72 | "hash_examples": "adf0cc8311011db2", 73 | "hash_full_prompts": "63c902dbdbaf1552", 74 | "hash_input_tokens": "2af397a095a31139", 75 | "hash_cont_tokens": "2d8a8b96257009fe" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 500, 79 | "padded": 0, 80 | "non_padded": 500, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "bfaad1993ff37a60", 88 | "hash_full_prompts": "3ceaaade5cf43911", 89 | "hash_input_tokens": "c663dbac8a64d3e4", 90 | "hash_cont_tokens": "69e17b0bd9a55619" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 500, 94 | "padded": 0, 95 | "non_padded": 500, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /logs/evals/Exp3_300/results/quyanh/OpenRS-GRPO/results_2025-03-15T12-07-33.637126.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 2598411.082974548, 9 | "end_time": 2598965.327884916, 10 | "total_evaluation_time_secondes": "554.2449103682302", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|amc23|0": { 18 | "extractive_match": 0.7, 19 | "extractive_match_stderr": 0.07337993857053426 20 | }, 21 | "all": { 22 | "extractive_match": 0.7, 23 | "extractive_match_stderr": 0.07337993857053426 24 | } 25 | }, 26 | "versions": { 27 | "custom|amc23|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|amc23": { 31 | "name": "amc23", 32 | "prompt_function": "amc_prompt_fn", 33 | "hf_repo": "knoveleng/AMC-23", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | 
"category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "train" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "train" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 40, 64 | "effective_num_docs": 40, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|amc23|0": { 71 | "hashes": { 72 | "hash_examples": "57f3ead69f601b6a", 73 | "hash_full_prompts": "64c05856286ef8dc", 74 | "hash_input_tokens": "b64afe4485ef61c0", 75 | "hash_cont_tokens": "40d9bb73f755c8ae" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 40, 79 | "padded": 0, 80 | "non_padded": 40, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "fe1f1f4512256bec", 88 | "hash_full_prompts": "882107366cadc5ce", 89 | "hash_input_tokens": "638cd1079164f374", 90 | "hash_cont_tokens": "0f0dc5923fa3d806" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 40, 94 | "padded": 0, 95 | "non_padded": 40, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /logs/evals/Exp3_350/results/quyanh/OpenRS-GRPO/results_2025-03-15T12-45-32.618178.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 2599724.580276648, 9 | "end_time": 2601244.233675632, 10 | "total_evaluation_time_secondes": "1519.6533989841118", 11 | 
"model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|math_500|0": { 18 | "extractive_match": 0.814, 19 | "extractive_match_stderr": 0.017418806780583957 20 | }, 21 | "all": { 22 | "extractive_match": 0.814, 23 | "extractive_match_stderr": 0.017418806780583957 24 | } 25 | }, 26 | "versions": { 27 | "custom|math_500|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|math_500": { 31 | "name": "math_500", 32 | "prompt_function": "math_prompt_fn", 33 | "hf_repo": "HuggingFaceH4/MATH-500", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "test" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "test" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 500, 64 | "effective_num_docs": 500, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|math_500|0": { 71 | "hashes": { 72 | "hash_examples": "adf0cc8311011db2", 73 | "hash_full_prompts": "63c902dbdbaf1552", 74 | "hash_input_tokens": "2af397a095a31139", 75 | "hash_cont_tokens": "966cfae3386821e2" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 500, 79 | "padded": 0, 80 | "non_padded": 500, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "bfaad1993ff37a60", 88 | "hash_full_prompts": "3ceaaade5cf43911", 89 | "hash_input_tokens": "c663dbac8a64d3e4", 90 | "hash_cont_tokens": 
"06e0fffe6fd58548" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 500, 94 | "padded": 0, 95 | "non_padded": 500, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /logs/evals/Exp3_350/results/quyanh/OpenRS-GRPO/results_2025-03-15T12-54-08.288591.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 2601346.196358103, 9 | "end_time": 2601759.980020389, 10 | "total_evaluation_time_secondes": "413.7836622861214", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|amc23|0": { 18 | "extractive_match": 0.7, 19 | "extractive_match_stderr": 0.07337993857053426 20 | }, 21 | "all": { 22 | "extractive_match": 0.7, 23 | "extractive_match_stderr": 0.07337993857053426 24 | } 25 | }, 26 | "versions": { 27 | "custom|amc23|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|amc23": { 31 | "name": "amc23", 32 | "prompt_function": "amc_prompt_fn", 33 | "hf_repo": "knoveleng/AMC-23", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "train" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "train" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 40, 64 | "effective_num_docs": 40, 65 | 
"must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|amc23|0": { 71 | "hashes": { 72 | "hash_examples": "57f3ead69f601b6a", 73 | "hash_full_prompts": "64c05856286ef8dc", 74 | "hash_input_tokens": "b64afe4485ef61c0", 75 | "hash_cont_tokens": "3d0089b1cb6faa2d" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 40, 79 | "padded": 0, 80 | "non_padded": 40, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "fe1f1f4512256bec", 88 | "hash_full_prompts": "882107366cadc5ce", 89 | "hash_input_tokens": "638cd1079164f374", 90 | "hash_cont_tokens": "adf30c7654ed9c16" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 40, 94 | "padded": 0, 95 | "non_padded": 40, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /logs/evals/Exp3_400/results/quyanh/OpenRS-GRPO/results_2025-03-15T13-02-26.672622.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 2601826.367747913, 9 | "end_time": 2602258.347004526, 10 | "total_evaluation_time_secondes": "431.9792566127144", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|aime24|0": { 18 | "extractive_match": 0.3, 19 | "extractive_match_stderr": 0.0850962943396763 20 | }, 21 | "all": { 22 | "extractive_match": 0.3, 23 | "extractive_match_stderr": 0.0850962943396763 24 | } 25 | }, 26 | "versions": { 27 | "custom|aime24|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|aime24": { 31 | "name": "aime24", 32 | "prompt_function": "aime_prompt_fn", 33 | "hf_repo": "HuggingFaceH4/aime_2024", 34 | "hf_subset": "default", 
35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "train" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "train" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 30, 64 | "effective_num_docs": 30, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|aime24|0": { 71 | "hashes": { 72 | "hash_examples": "ddec8fc79d0a014b", 73 | "hash_full_prompts": "253167becf0dfed7", 74 | "hash_input_tokens": "bf1cc75b5f12dfb8", 75 | "hash_cont_tokens": "699ac3b1b5301ac1" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 30, 79 | "padded": 0, 80 | "non_padded": 30, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "c903e836a519cf98", 88 | "hash_full_prompts": "84ff409b6bbf7cc0", 89 | "hash_input_tokens": "9a8c7e54ce09af84", 90 | "hash_cont_tokens": "0245497f2bebbcc5" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 30, 94 | "padded": 0, 95 | "non_padded": 30, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /logs/evals/Exp3_400/results/quyanh/OpenRS-GRPO/results_2025-03-15T13-24-00.237541.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 2602298.434195373, 9 | 
"end_time": 2603551.813672935, 10 | "total_evaluation_time_secondes": "1253.3794775619172", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|math_500|0": { 18 | "extractive_match": 0.824, 19 | "extractive_match_stderr": 0.01704785202062228 20 | }, 21 | "all": { 22 | "extractive_match": 0.824, 23 | "extractive_match_stderr": 0.01704785202062228 24 | } 25 | }, 26 | "versions": { 27 | "custom|math_500|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|math_500": { 31 | "name": "math_500", 32 | "prompt_function": "math_prompt_fn", 33 | "hf_repo": "HuggingFaceH4/MATH-500", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "test" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "test" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 500, 64 | "effective_num_docs": 500, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|math_500|0": { 71 | "hashes": { 72 | "hash_examples": "adf0cc8311011db2", 73 | "hash_full_prompts": "63c902dbdbaf1552", 74 | "hash_input_tokens": "2af397a095a31139", 75 | "hash_cont_tokens": "17b24f3c520b6310" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 500, 79 | "padded": 0, 80 | "non_padded": 500, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "bfaad1993ff37a60", 88 | "hash_full_prompts": 
"3ceaaade5cf43911", 89 | "hash_input_tokens": "c663dbac8a64d3e4", 90 | "hash_cont_tokens": "fcb10a4340d55cf4" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 500, 94 | "padded": 0, 95 | "non_padded": 500, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /logs/evals/Exp3_400/results/quyanh/OpenRS-GRPO/results_2025-03-15T13-32-38.296980.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 2603652.439867107, 9 | "end_time": 2604069.984818537, 10 | "total_evaluation_time_secondes": "417.5449514295906", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|amc23|0": { 18 | "extractive_match": 0.675, 19 | "extractive_match_stderr": 0.07499999999999998 20 | }, 21 | "all": { 22 | "extractive_match": 0.675, 23 | "extractive_match_stderr": 0.07499999999999998 24 | } 25 | }, 26 | "versions": { 27 | "custom|amc23|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|amc23": { 31 | "name": "amc23", 32 | "prompt_function": "amc_prompt_fn", 33 | "hf_repo": "knoveleng/AMC-23", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "train" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "train" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | 
"custom" 62 | ], 63 | "original_num_docs": 40, 64 | "effective_num_docs": 40, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|amc23|0": { 71 | "hashes": { 72 | "hash_examples": "57f3ead69f601b6a", 73 | "hash_full_prompts": "64c05856286ef8dc", 74 | "hash_input_tokens": "b64afe4485ef61c0", 75 | "hash_cont_tokens": "3e0fd04e750b02c0" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 40, 79 | "padded": 0, 80 | "non_padded": 40, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "fe1f1f4512256bec", 88 | "hash_full_prompts": "882107366cadc5ce", 89 | "hash_input_tokens": "638cd1079164f374", 90 | "hash_cont_tokens": "02c3a8d49a2e3ee3" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 40, 94 | "padded": 0, 95 | "non_padded": 40, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /logs/evals/Exp3_50/results/quyanh/OpenRS-GRPO/results_2025-03-15T08-07-05.491769.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 2583924.187252476, 9 | "end_time": 2584537.177876145, 10 | "total_evaluation_time_secondes": "612.9906236692332", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|aime24|0": { 18 | "extractive_match": 0.4666666666666667, 19 | "extractive_match_stderr": 0.09264111117062017 20 | }, 21 | "all": { 22 | "extractive_match": 0.4666666666666667, 23 | "extractive_match_stderr": 0.09264111117062017 24 | } 25 | }, 26 | "versions": { 27 | "custom|aime24|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|aime24": { 31 | "name": "aime24", 
32 | "prompt_function": "aime_prompt_fn", 33 | "hf_repo": "HuggingFaceH4/aime_2024", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "train" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "train" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 30, 64 | "effective_num_docs": 30, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|aime24|0": { 71 | "hashes": { 72 | "hash_examples": "ddec8fc79d0a014b", 73 | "hash_full_prompts": "253167becf0dfed7", 74 | "hash_input_tokens": "bf1cc75b5f12dfb8", 75 | "hash_cont_tokens": "fd81113ccd505a4e" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 30, 79 | "padded": 0, 80 | "non_padded": 30, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "c903e836a519cf98", 88 | "hash_full_prompts": "84ff409b6bbf7cc0", 89 | "hash_input_tokens": "9a8c7e54ce09af84", 90 | "hash_cont_tokens": "2a36a200d6cd2247" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 30, 94 | "padded": 0, 95 | "non_padded": 30, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /logs/evals/Exp3_50/results/quyanh/OpenRS-GRPO/results_2025-03-15T08-31-16.022306.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | 
"override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 2584606.174923473, 9 | "end_time": 2585987.719419564, 10 | "total_evaluation_time_secondes": "1381.5444960910827", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|math_500|0": { 18 | "extractive_match": 0.844, 19 | "extractive_match_stderr": 0.016243636028391097 20 | }, 21 | "all": { 22 | "extractive_match": 0.844, 23 | "extractive_match_stderr": 0.016243636028391097 24 | } 25 | }, 26 | "versions": { 27 | "custom|math_500|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|math_500": { 31 | "name": "math_500", 32 | "prompt_function": "math_prompt_fn", 33 | "hf_repo": "HuggingFaceH4/MATH-500", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "test" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "test" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 500, 64 | "effective_num_docs": 500, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|math_500|0": { 71 | "hashes": { 72 | "hash_examples": "adf0cc8311011db2", 73 | "hash_full_prompts": "63c902dbdbaf1552", 74 | "hash_input_tokens": "2af397a095a31139", 75 | "hash_cont_tokens": "6f3bc11d908efbf4" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 500, 79 | "padded": 0, 80 | "non_padded": 500, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 
85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "bfaad1993ff37a60", 88 | "hash_full_prompts": "3ceaaade5cf43911", 89 | "hash_input_tokens": "c663dbac8a64d3e4", 90 | "hash_cont_tokens": "c2619f7c35dda61a" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 500, 94 | "padded": 0, 95 | "non_padded": 500, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /logs/evals/Exp3_50/results/quyanh/OpenRS-GRPO/results_2025-03-15T08-41-29.644364.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 2586061.970029914, 9 | "end_time": 2586601.331146573, 10 | "total_evaluation_time_secondes": "539.3611166593619", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|amc23|0": { 18 | "extractive_match": 0.725, 19 | "extractive_match_stderr": 0.0714995069016527 20 | }, 21 | "all": { 22 | "extractive_match": 0.725, 23 | "extractive_match_stderr": 0.0714995069016527 24 | } 25 | }, 26 | "versions": { 27 | "custom|amc23|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|amc23": { 31 | "name": "amc23", 32 | "prompt_function": "amc_prompt_fn", 33 | "hf_repo": "knoveleng/AMC-23", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "train" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "train" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | 
"generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 40, 64 | "effective_num_docs": 40, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|amc23|0": { 71 | "hashes": { 72 | "hash_examples": "57f3ead69f601b6a", 73 | "hash_full_prompts": "64c05856286ef8dc", 74 | "hash_input_tokens": "b64afe4485ef61c0", 75 | "hash_cont_tokens": "c0838ad8f27af064" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 40, 79 | "padded": 0, 80 | "non_padded": 40, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "fe1f1f4512256bec", 88 | "hash_full_prompts": "882107366cadc5ce", 89 | "hash_input_tokens": "638cd1079164f374", 90 | "hash_cont_tokens": "68bf0a6b43ec34ae" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 40, 94 | "padded": 0, 95 | "non_padded": 40, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /logs/overall/Exp1_100/results/quyanh/OpenRS-GRPO/results_2025-03-15T18-04-10.796422.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 9795607.890673576, 9 | "end_time": 9796266.243738612, 10 | "total_evaluation_time_secondes": "658.3530650362372", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|aime24|0": { 18 | "extractive_match": 0.3, 19 | "extractive_match_stderr": 0.0850962943396763 20 | }, 21 | "all": { 22 | "extractive_match": 0.3, 23 | "extractive_match_stderr": 0.0850962943396763 24 | } 25 | }, 26 | "versions": { 27 | "custom|aime24|0": 1 28 | }, 
29 | "config_tasks": { 30 | "custom|aime24": { 31 | "name": "aime24", 32 | "prompt_function": "aime_prompt_fn", 33 | "hf_repo": "HuggingFaceH4/aime_2024", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "train" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "train" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 30, 64 | "effective_num_docs": 30, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|aime24|0": { 71 | "hashes": { 72 | "hash_examples": "ddec8fc79d0a014b", 73 | "hash_full_prompts": "253167becf0dfed7", 74 | "hash_input_tokens": "bf1cc75b5f12dfb8", 75 | "hash_cont_tokens": "5141f36c632674a8" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 30, 79 | "padded": 0, 80 | "non_padded": 30, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "c903e836a519cf98", 88 | "hash_full_prompts": "84ff409b6bbf7cc0", 89 | "hash_input_tokens": "9a8c7e54ce09af84", 90 | "hash_cont_tokens": "f8465f893370f47e" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 30, 94 | "padded": 0, 95 | "non_padded": 30, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /logs/overall/Exp1_100/results/quyanh/OpenRS-GRPO/results_2025-03-15T18-27-09.526559.json: -------------------------------------------------------------------------------- 1 | { 2 | 
"config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 9796309.327618744, 9 | "end_time": 9797644.914942032, 10 | "total_evaluation_time_secondes": "1335.587323287502", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|math_500|0": { 18 | "extractive_match": 0.838, 19 | "extractive_match_stderr": 0.016494123566423505 20 | }, 21 | "all": { 22 | "extractive_match": 0.838, 23 | "extractive_match_stderr": 0.016494123566423505 24 | } 25 | }, 26 | "versions": { 27 | "custom|math_500|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|math_500": { 31 | "name": "math_500", 32 | "prompt_function": "math_prompt_fn", 33 | "hf_repo": "HuggingFaceH4/MATH-500", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "test" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "test" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 500, 64 | "effective_num_docs": 500, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|math_500|0": { 71 | "hashes": { 72 | "hash_examples": "adf0cc8311011db2", 73 | "hash_full_prompts": "63c902dbdbaf1552", 74 | "hash_input_tokens": "2af397a095a31139", 75 | "hash_cont_tokens": "4e007e6ddae6bd47" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 500, 79 | "padded": 0, 80 | "non_padded": 500, 81 | 
"effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "bfaad1993ff37a60", 88 | "hash_full_prompts": "3ceaaade5cf43911", 89 | "hash_input_tokens": "c663dbac8a64d3e4", 90 | "hash_cont_tokens": "fa9f7bbd65512b23" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 500, 94 | "padded": 0, 95 | "non_padded": 500, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /logs/overall/Exp1_100/results/quyanh/OpenRS-GRPO/results_2025-03-15T18-37-46.790845.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 9797696.616540272, 9 | "end_time": 9798282.237784388, 10 | "total_evaluation_time_secondes": "585.621244115755", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|amc23|0": { 18 | "extractive_match": 0.7, 19 | "extractive_match_stderr": 0.07337993857053426 20 | }, 21 | "all": { 22 | "extractive_match": 0.7, 23 | "extractive_match_stderr": 0.07337993857053426 24 | } 25 | }, 26 | "versions": { 27 | "custom|amc23|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|amc23": { 31 | "name": "amc23", 32 | "prompt_function": "amc_prompt_fn", 33 | "hf_repo": "knoveleng/AMC-23", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "train" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "train" 53 | ], 54 | 
"few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 40, 64 | "effective_num_docs": 40, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|amc23|0": { 71 | "hashes": { 72 | "hash_examples": "57f3ead69f601b6a", 73 | "hash_full_prompts": "64c05856286ef8dc", 74 | "hash_input_tokens": "b64afe4485ef61c0", 75 | "hash_cont_tokens": "b589babd0b65f01d" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 40, 79 | "padded": 0, 80 | "non_padded": 40, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "fe1f1f4512256bec", 88 | "hash_full_prompts": "882107366cadc5ce", 89 | "hash_input_tokens": "638cd1079164f374", 90 | "hash_cont_tokens": "605820c49e2b0f60" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 40, 94 | "padded": 0, 95 | "non_padded": 40, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /logs/overall/Exp2_50/results/quyanh/OpenRS-GRPO/results_2025-03-15T20-26-02.445125.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 9804127.746607196, 9 | "end_time": 9804777.889406092, 10 | "total_evaluation_time_secondes": "650.142798896879", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|aime24|0": { 18 | "extractive_match": 0.3, 19 | "extractive_match_stderr": 0.0850962943396763 20 | }, 21 | "all": { 22 | "extractive_match": 0.3, 23 | 
"extractive_match_stderr": 0.0850962943396763 24 | } 25 | }, 26 | "versions": { 27 | "custom|aime24|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|aime24": { 31 | "name": "aime24", 32 | "prompt_function": "aime_prompt_fn", 33 | "hf_repo": "HuggingFaceH4/aime_2024", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "train" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "train" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 30, 64 | "effective_num_docs": 30, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|aime24|0": { 71 | "hashes": { 72 | "hash_examples": "ddec8fc79d0a014b", 73 | "hash_full_prompts": "253167becf0dfed7", 74 | "hash_input_tokens": "bf1cc75b5f12dfb8", 75 | "hash_cont_tokens": "76473421bbe93410" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 30, 79 | "padded": 0, 80 | "non_padded": 30, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "c903e836a519cf98", 88 | "hash_full_prompts": "84ff409b6bbf7cc0", 89 | "hash_input_tokens": "9a8c7e54ce09af84", 90 | "hash_cont_tokens": "a2125e595bab4673" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 30, 94 | "padded": 0, 95 | "non_padded": 30, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- 
/logs/overall/Exp2_50/results/quyanh/OpenRS-GRPO/results_2025-03-15T20-48-43.218853.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 9804830.717740912, 9 | "end_time": 9806138.61057854, 10 | "total_evaluation_time_secondes": "1307.8928376287222", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|math_500|0": { 18 | "extractive_match": 0.854, 19 | "extractive_match_stderr": 0.01580720517583485 20 | }, 21 | "all": { 22 | "extractive_match": 0.854, 23 | "extractive_match_stderr": 0.01580720517583485 24 | } 25 | }, 26 | "versions": { 27 | "custom|math_500|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|math_500": { 31 | "name": "math_500", 32 | "prompt_function": "math_prompt_fn", 33 | "hf_repo": "HuggingFaceH4/MATH-500", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "test" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "test" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 500, 64 | "effective_num_docs": 500, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|math_500|0": { 71 | "hashes": { 72 | "hash_examples": "adf0cc8311011db2", 73 | "hash_full_prompts": "63c902dbdbaf1552", 74 | 
"hash_input_tokens": "2af397a095a31139", 75 | "hash_cont_tokens": "961808827104c739" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 500, 79 | "padded": 0, 80 | "non_padded": 500, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "bfaad1993ff37a60", 88 | "hash_full_prompts": "3ceaaade5cf43911", 89 | "hash_input_tokens": "c663dbac8a64d3e4", 90 | "hash_cont_tokens": "0a4348ce590f191d" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 500, 94 | "padded": 0, 95 | "non_padded": 500, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /logs/overall/Exp2_50/results/quyanh/OpenRS-GRPO/results_2025-03-15T20-56-24.803840.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 9806195.692178983, 9 | "end_time": 9806600.257096833, 10 | "total_evaluation_time_secondes": "404.5649178493768", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|amc23|0": { 18 | "extractive_match": 0.8, 19 | "extractive_match_stderr": 0.06405126152203487 20 | }, 21 | "all": { 22 | "extractive_match": 0.8, 23 | "extractive_match_stderr": 0.06405126152203487 24 | } 25 | }, 26 | "versions": { 27 | "custom|amc23|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|amc23": { 31 | "name": "amc23", 32 | "prompt_function": "amc_prompt_fn", 33 | "hf_repo": "knoveleng/AMC-23", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | 
"hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "train" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "train" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 40, 64 | "effective_num_docs": 40, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|amc23|0": { 71 | "hashes": { 72 | "hash_examples": "57f3ead69f601b6a", 73 | "hash_full_prompts": "64c05856286ef8dc", 74 | "hash_input_tokens": "b64afe4485ef61c0", 75 | "hash_cont_tokens": "d1d854bc50c70679" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 40, 79 | "padded": 0, 80 | "non_padded": 40, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "fe1f1f4512256bec", 88 | "hash_full_prompts": "882107366cadc5ce", 89 | "hash_input_tokens": "638cd1079164f374", 90 | "hash_cont_tokens": "198d7b3d60a17a97" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 40, 94 | "padded": 0, 95 | "non_padded": 40, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /logs/overall/Exp3_50/results/quyanh/OpenRS-GRPO/results_2025-03-15T23-02-19.308882.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 9812781.11633004, 9 | "end_time": 9814154.770996448, 10 | "total_evaluation_time_secondes": "1373.6546664070338", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | 
"custom|math_500|0": { 18 | "extractive_match": 0.844, 19 | "extractive_match_stderr": 0.016243636028391097 20 | }, 21 | "all": { 22 | "extractive_match": 0.844, 23 | "extractive_match_stderr": 0.016243636028391097 24 | } 25 | }, 26 | "versions": { 27 | "custom|math_500|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|math_500": { 31 | "name": "math_500", 32 | "prompt_function": "math_prompt_fn", 33 | "hf_repo": "HuggingFaceH4/MATH-500", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "test" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "test" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 500, 64 | "effective_num_docs": 500, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|math_500|0": { 71 | "hashes": { 72 | "hash_examples": "adf0cc8311011db2", 73 | "hash_full_prompts": "63c902dbdbaf1552", 74 | "hash_input_tokens": "2af397a095a31139", 75 | "hash_cont_tokens": "6f3bc11d908efbf4" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 500, 79 | "padded": 0, 80 | "non_padded": 500, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "bfaad1993ff37a60", 88 | "hash_full_prompts": "3ceaaade5cf43911", 89 | "hash_input_tokens": "c663dbac8a64d3e4", 90 | "hash_cont_tokens": "c2619f7c35dda61a" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 500, 94 | "padded": 0, 95 | "non_padded": 500, 96 | 
"num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /logs/overall/Exp3_50/results/quyanh/OpenRS-GRPO/results_2025-03-15T23-12-28.069489.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_general": { 3 | "lighteval_sha": "?", 4 | "num_fewshot_seeds": 1, 5 | "override_batch_size": -1, 6 | "max_samples": null, 7 | "job_id": 0, 8 | "start_time": 9814218.648552556, 9 | "end_time": 9814763.480200425, 10 | "total_evaluation_time_secondes": "544.8316478691995", 11 | "model_name": "quyanh/OpenRS-GRPO", 12 | "model_sha": "", 13 | "model_dtype": null, 14 | "model_size": null 15 | }, 16 | "results": { 17 | "custom|amc23|0": { 18 | "extractive_match": 0.725, 19 | "extractive_match_stderr": 0.0714995069016527 20 | }, 21 | "all": { 22 | "extractive_match": 0.725, 23 | "extractive_match_stderr": 0.0714995069016527 24 | } 25 | }, 26 | "versions": { 27 | "custom|amc23|0": 1 28 | }, 29 | "config_tasks": { 30 | "custom|amc23": { 31 | "name": "amc23", 32 | "prompt_function": "amc_prompt_fn", 33 | "hf_repo": "knoveleng/AMC-23", 34 | "hf_subset": "default", 35 | "metric": [ 36 | { 37 | "metric_name": "extractive_match", 38 | "higher_is_better": true, 39 | "category": "3", 40 | "use_case": "1", 41 | "sample_level_fn": "sample_level_fn", 42 | "corpus_level_fn": "mean" 43 | } 44 | ], 45 | "hf_revision": null, 46 | "hf_filter": null, 47 | "hf_avail_splits": [ 48 | "train" 49 | ], 50 | "trust_dataset": false, 51 | "evaluation_splits": [ 52 | "train" 53 | ], 54 | "few_shots_split": null, 55 | "few_shots_select": null, 56 | "generation_size": 32768, 57 | "generation_grammar": null, 58 | "stop_sequence": [], 59 | "num_samples": null, 60 | "suite": [ 61 | "custom" 62 | ], 63 | "original_num_docs": 40, 64 | "effective_num_docs": 40, 65 | "must_remove_duplicate_docs": false, 66 | "version": 1 67 | } 68 | }, 69 | "summary_tasks": { 70 | "custom|amc23|0": { 71 | "hashes": { 72 
| "hash_examples": "57f3ead69f601b6a", 73 | "hash_full_prompts": "64c05856286ef8dc", 74 | "hash_input_tokens": "b64afe4485ef61c0", 75 | "hash_cont_tokens": "c0838ad8f27af064" 76 | }, 77 | "truncated": 0, 78 | "non_truncated": 40, 79 | "padded": 0, 80 | "non_padded": 40, 81 | "effective_few_shots": 0.0, 82 | "num_truncated_few_shots": 0 83 | } 84 | }, 85 | "summary_general": { 86 | "hashes": { 87 | "hash_examples": "fe1f1f4512256bec", 88 | "hash_full_prompts": "882107366cadc5ce", 89 | "hash_input_tokens": "638cd1079164f374", 90 | "hash_cont_tokens": "68bf0a6b43ec34ae" 91 | }, 92 | "truncated": 0, 93 | "non_truncated": 40, 94 | "padded": 0, 95 | "non_padded": 40, 96 | "num_truncated_few_shots": 0 97 | } 98 | } -------------------------------------------------------------------------------- /recipes/accelerate_configs/ddp.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | distributed_type: MULTI_GPU 4 | downcast_bf16: 'no' 5 | gpu_ids: all 6 | machine_rank: 0 7 | main_training_function: main 8 | mixed_precision: bf16 9 | num_machines: 1 10 | num_processes: 8 11 | rdzv_backend: static 12 | same_network: true 13 | tpu_env: [] 14 | tpu_use_cluster: false 15 | tpu_use_sudo: false 16 | use_cpu: false 17 | -------------------------------------------------------------------------------- /recipes/accelerate_configs/fsdp.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | distributed_type: FSDP 4 | downcast_bf16: 'no' 5 | enable_cpu_affinity: false 6 | fsdp_config: 7 | fsdp_activation_checkpointing: false # Need fix from: https://github.com/huggingface/transformers/pull/36610 8 | fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP 9 | fsdp_backward_prefetch: BACKWARD_PRE 10 | fsdp_cpu_ram_efficient_loading: true 11 | fsdp_forward_prefetch: true 12 | fsdp_offload_params: false 13 | 
fsdp_sharding_strategy: FULL_SHARD 14 | fsdp_state_dict_type: FULL_STATE_DICT 15 | fsdp_sync_module_states: true 16 | fsdp_use_orig_params: true 17 | machine_rank: 0 18 | main_training_function: main 19 | mixed_precision: bf16 20 | num_machines: 1 21 | num_processes: 8 22 | rdzv_backend: static 23 | same_network: true 24 | tpu_env: [] 25 | tpu_use_cluster: false 26 | tpu_use_sudo: false 27 | use_cpu: false -------------------------------------------------------------------------------- /recipes/accelerate_configs/zero2.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | deepspeed_multinode_launcher: standard 5 | offload_optimizer_device: none 6 | offload_param_device: none 7 | zero3_init_flag: false 8 | zero_stage: 2 9 | distributed_type: DEEPSPEED 10 | downcast_bf16: 'no' 11 | machine_rank: 0 12 | main_training_function: main 13 | mixed_precision: bf16 14 | num_machines: 1 15 | num_processes: 8 16 | rdzv_backend: static 17 | same_network: true 18 | tpu_env: [] 19 | tpu_use_cluster: false 20 | tpu_use_sudo: false 21 | use_cpu: false -------------------------------------------------------------------------------- /recipes/accelerate_configs/zero3.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | deepspeed_multinode_launcher: standard 5 | offload_optimizer_device: none 6 | offload_param_device: none 7 | zero3_init_flag: true 8 | zero3_save_16bit_model: true 9 | zero_stage: 3 10 | distributed_type: DEEPSPEED 11 | downcast_bf16: 'no' 12 | machine_rank: 0 13 | main_training_function: main 14 | mixed_precision: bf16 15 | num_machines: 1 16 | num_processes: 8 17 | rdzv_backend: static 18 | same_network: true 19 | tpu_env: [] 20 | tpu_use_cluster: false 21 | tpu_use_sudo: false 22 | use_cpu: false 23 | 
-------------------------------------------------------------------------------- /recipes/data_cleaner.yaml: -------------------------------------------------------------------------------- 1 | model_kwargs: 2 | model: Qwen/Qwen2.5-Math-7B-Instruct 3 | trust_remote_code: true 4 | max_model_len: 4096 5 | gpu_memory_utilization: 0.9 6 | enforce_eager: true 7 | tensor_parallel_size: 4 8 | 9 | sampling_params: 10 | temperature: 0.7 11 | top_p: 0.9 12 | max_tokens: 4096 13 | -------------------------------------------------------------------------------- /recipes/grpo.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | dataset_name: knoveleng/open-rs 9 | system_prompt: "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer, and put your final answer within \\boxed{{}} . The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think> <answer> answer here </answer>. Note that respond by English, NOT use other languages." 
10 | 11 | # GRPO trainer config 12 | bf16: true 13 | use_vllm: true 14 | vllm_device: auto 15 | vllm_enforce_eager: true 16 | vllm_gpu_memory_utilization: 0.7 17 | vllm_max_model_len: 4608 18 | do_eval: false 19 | gradient_accumulation_steps: 4 20 | gradient_checkpointing: true 21 | gradient_checkpointing_kwargs: 22 | use_reentrant: false 23 | hub_model_id: OpenRS-GRPO 24 | hub_strategy: every_save 25 | learning_rate: 1.0e-06 26 | log_completions: true 27 | log_level: info 28 | logging_first_step: true 29 | logging_steps: 1 30 | logging_strategy: steps 31 | lr_scheduler_type: cosine_with_min_lr 32 | lr_scheduler_kwargs: 33 | min_lr_rate: 0.1 34 | max_prompt_length: 512 35 | max_completion_length: 3584 36 | max_steps: 500 37 | num_generations: 6 38 | num_train_epochs: 1 39 | output_dir: data/OpenRS-GRPO 40 | overwrite_output_dir: true 41 | per_device_eval_batch_size: 6 42 | per_device_train_batch_size: 6 43 | push_to_hub: true 44 | report_to: 45 | - wandb 46 | reward_funcs: 47 | - format 48 | - cosine 49 | reward_weights: 50 | - 1.0 51 | - 2.0 52 | save_strategy: "steps" 53 | save_steps: 50 54 | seed: 42 55 | temperature: 0.7 56 | warmup_ratio: 0.1 -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [isort] 2 | default_section = FIRSTPARTY 3 | ensure_newline_before_comments = True 4 | force_grid_wrap = 0 5 | include_trailing_comma = True 6 | known_first_party = open_r1 7 | known_third_party = 8 | transformers 9 | datasets 10 | fugashi 11 | git 12 | h5py 13 | matplotlib 14 | nltk 15 | numpy 16 | packaging 17 | pandas 18 | psutil 19 | pytest 20 | rouge_score 21 | sacrebleu 22 | seqeval 23 | sklearn 24 | streamlit 25 | torch 26 | tqdm 27 | 28 | line_length = 119 29 | lines_after_imports = 2 30 | multi_line_output = 3 31 | use_parentheses = True 32 | 33 | [flake8] 34 | ignore = E203, E501, E741, W503, W605 35 | max-line-length = 119 36 | 
per-file-ignores = 37 | # imported but unused 38 | __init__.py: F401 39 | 40 | [tool:pytest] 41 | doctest_optionflags=NUMBER NORMALIZE_WHITESPACE ELLIPSIS -------------------------------------------------------------------------------- /src/open_r1/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /src/open_r1/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .import_utils import is_e2b_available 2 | from .model_utils import get_tokenizer 3 | 4 | 5 | __all__ = ["get_tokenizer", "is_e2b_available"] 6 | -------------------------------------------------------------------------------- /src/open_r1/utils/import_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
def get_tokenizer(
    model_args: ModelConfig, training_args: SFTConfig | GRPOConfig, auto_set_chat_template: bool = True
) -> PreTrainedTokenizer:
    """Load the model's tokenizer and make sure a chat template is configured.

    Args:
        model_args: Supplies ``model_name_or_path``, ``model_revision`` and
            ``trust_remote_code``, which are forwarded to
            ``AutoTokenizer.from_pretrained``.
        training_args: Training configuration. When its ``chat_template`` is
            not ``None`` it always replaces the tokenizer's own template.
        auto_set_chat_template: When true and the loaded tokenizer has no chat
            template, install ``DEFAULT_CHAT_TEMPLATE`` as a fallback.

    Returns:
        The loaded tokenizer, with ``chat_template`` possibly overridden.
    """
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.model_name_or_path,
        revision=model_args.model_revision,
        trust_remote_code=model_args.trust_remote_code,
    )

    if training_args.chat_template is not None:
        # An explicitly configured template takes precedence over everything else.
        tokenizer.chat_template = training_args.chat_template
    elif auto_set_chat_template and getattr(tokenizer, "chat_template", None) is None:
        # Inspect the attribute rather than calling `tokenizer.get_chat_template()`:
        # in current transformers releases that method raises ValueError when no
        # template is set instead of returning None, so the fallback could never
        # be reached through it.
        tokenizer.chat_template = DEFAULT_CHAT_TEMPLATE

    return tokenizer