├── README.md ├── configs ├── zero1.yaml ├── zero1_offload.yaml ├── zero2.yaml └── zero2_offload.yaml ├── data ├── game24 │ ├── 24.csv │ ├── model_generation │ │ ├── mid │ │ │ └── responses_n300_llama7b-2-ep2_samples.jsonl │ │ └── train │ │ │ ├── responses_n100_llama7b-2-ep2_process_samples.jsonl │ │ │ └── responses_n100_llama7b-2-ep2_samples.jsonl │ └── train.jsonl └── gsm8k │ ├── model_generation │ ├── test │ │ ├── responses_n300_mistral7b-ep2_samples.jsonl │ │ └── responses_n500_llama7b-2-ep2_samples.jsonl │ └── train │ │ ├── responses_n100_llama7b-2-ep2_samples.jsonl │ │ └── responses_n100_mistral7b-ep2_samples.jsonl │ ├── test.jsonl │ └── train.jsonl ├── eval_generator_by_step.py ├── eval_results ├── game24 │ ├── generator │ │ └── mid │ │ │ ├── metrics_llama7b-2-ep2.json │ │ │ └── responses_llama7b-2-ep2_samples.jsonl │ ├── generator_with_verifier │ │ └── mid │ │ │ ├── llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1 │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_43.json │ │ │ ├── metrics_g(llama7b-2-ep2)_v(llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1).csv │ │ │ └── responses_%step20_llama7b-2-ep2_beaml5_samples.jsonl │ │ │ ├── llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10 │ │ │ ├── metrics_%step100_llama7b-2-ep2_beaml1.json │ │ │ ├── metrics_%step100_llama7b-2-ep2_beaml10.json │ │ │ ├── metrics_%step100_llama7b-2-ep2_beaml10_41.json │ │ │ ├── metrics_%step100_llama7b-2-ep2_beaml10_43.json │ │ │ ├── metrics_%step100_llama7b-2-ep2_beaml1_41.json │ │ │ ├── metrics_%step100_llama7b-2-ep2_beaml1_43.json │ │ │ ├── metrics_%step100_llama7b-2-ep2_beaml2.json │ │ │ ├── metrics_%step100_llama7b-2-ep2_beaml20.json │ │ │ ├── metrics_%step100_llama7b-2-ep2_beaml20_41.json │ │ │ ├── metrics_%step100_llama7b-2-ep2_beaml20_43.json │ │ │ ├── metrics_%step100_llama7b-2-ep2_beaml25.json │ │ │ ├── metrics_%step100_llama7b-2-ep2_beaml25_41.json │ │ │ ├── metrics_%step100_llama7b-2-ep2_beaml25_43.json │ │ │ ├── metrics_%step100_llama7b-2-ep2_beaml2_41.json │ │ │ ├── metrics_%step100_llama7b-2-ep2_beaml2_43.json │ │ │ ├── metrics_%step100_llama7b-2-ep2_beaml4.json │ │ │ ├── metrics_%step100_llama7b-2-ep2_beaml4_41.json │ │ │ ├── metrics_%step100_llama7b-2-ep2_beaml4_43.json │ │ │ ├── metrics_%step100_llama7b-2-ep2_beaml5.json │ │ │ ├── metrics_%step100_llama7b-2-ep2_beaml50.json │ │ │ ├── metrics_%step100_llama7b-2-ep2_beaml50_41.json │ │ │ ├── metrics_%step100_llama7b-2-ep2_beaml50_43.json │ │ │ ├── metrics_%step100_llama7b-2-ep2_beaml5_41.json │ │ │ ├── metrics_%step100_llama7b-2-ep2_beaml5_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_43.json │ │ │ ├── metrics_%step50_llama7b-2-ep2_beaml1.json │ │ │ ├── metrics_%step50_llama7b-2-ep2_beaml10.json │ │ │ ├── metrics_%step50_llama7b-2-ep2_beaml10_41.json │ │ │ ├── metrics_%step50_llama7b-2-ep2_beaml10_43.json │ │ │ ├── metrics_%step50_llama7b-2-ep2_beaml1_41.json │ │ │ ├── metrics_%step50_llama7b-2-ep2_beaml1_43.json │ │ │ ├── metrics_%step50_llama7b-2-ep2_beaml2.json │ │ │ ├── metrics_%step50_llama7b-2-ep2_beaml25.json │ │ │ ├── metrics_%step50_llama7b-2-ep2_beaml25_41.json │ │ │ ├── metrics_%step50_llama7b-2-ep2_beaml25_43.json │ │ │ ├── metrics_%step50_llama7b-2-ep2_beaml2_41.json │ │ │ ├── metrics_%step50_llama7b-2-ep2_beaml2_43.json │ │ │ ├── metrics_%step50_llama7b-2-ep2_beaml5.json │ │ │ ├── metrics_%step50_llama7b-2-ep2_beaml5_41.json │ │ │ ├── metrics_%step50_llama7b-2-ep2_beaml5_43.json │ │ │ ├── metrics_%step80_llama7b-2-ep2_beaml1.json │ │ │ ├── metrics_%step80_llama7b-2-ep2_beaml10.json │ │ │ ├── metrics_%step80_llama7b-2-ep2_beaml10_41.json │ │ │ ├── metrics_%step80_llama7b-2-ep2_beaml10_43.json │ │ │ ├── metrics_%step80_llama7b-2-ep2_beaml1_41.json │ │ │ ├── metrics_%step80_llama7b-2-ep2_beaml1_43.json │ │ │ ├── metrics_%step80_llama7b-2-ep2_beaml2.json │ │ │ ├── metrics_%step80_llama7b-2-ep2_beaml20.json │ │ │ ├── metrics_%step80_llama7b-2-ep2_beaml20_41.json │ │ │ ├── metrics_%step80_llama7b-2-ep2_beaml20_43.json │ │ │ ├── metrics_%step80_llama7b-2-ep2_beaml2_41.json │ │ │ ├── metrics_%step80_llama7b-2-ep2_beaml2_43.json │ │ │ ├── metrics_%step80_llama7b-2-ep2_beaml4.json │ │ │ ├── metrics_%step80_llama7b-2-ep2_beaml40.json │ │ │ ├── metrics_%step80_llama7b-2-ep2_beaml40_41.json │ │ │ ├── metrics_%step80_llama7b-2-ep2_beaml40_43.json │ │ │ ├── metrics_%step80_llama7b-2-ep2_beaml4_41.json │ │ │ ├── metrics_%step80_llama7b-2-ep2_beaml4_43.json │ │ │ ├── metrics_%step80_llama7b-2-ep2_beaml5.json │ │ │ ├── metrics_%step80_llama7b-2-ep2_beaml5_41.json │ │ │ ├── metrics_%step80_llama7b-2-ep2_beaml5_43.json │ │ │ ├── metrics_g(llama7b-2-ep2)_v(llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10).csv │ │ │ └── responses_%step20_llama7b-2-ep2_beaml5_samples.jsonl │ │ │ ├── llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2 │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_43.json │ │ │ ├── metrics_g(llama7b-2-ep2)_v(llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2).csv │ │ │ └── responses_%step20_llama7b-2-ep2_beaml5_samples.jsonl │ │ │ ├── llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4 │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_43.json │ │ │ ├── metrics_g(llama7b-2-ep2)_v(llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4).csv │ │ │ └── responses_%step20_llama7b-2-ep2_beaml5_samples.jsonl │ │ │ ├── llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6 │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_43.json │ │ │ ├── metrics_g(llama7b-2-ep2)_v(llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6).csv │ │ │ └── responses_%step20_llama7b-2-ep2_beaml5_samples.jsonl │ │ │ ├── llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8 │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_43.json │ │ │ ├── metrics_g(llama7b-2-ep2)_v(llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8).csv │ │ │ └── responses_%step20_llama7b-2-ep2_beaml5_samples.jsonl │ │ │ ├── prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1 │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_43.json │ │ │ ├── metrics_g(llama7b-2-ep2)_v(easyprocess2-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1).csv │ │ │ └── responses_%step20_llama7b-2-ep2_beaml5_samples.jsonl │ │ │ ├── prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10 │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_43.json │ │ │ ├── metrics_g(llama7b-2-ep2)_v(easyprocess2-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10).csv │ │ │ └── responses_%step20_llama7b-2-ep2_beaml5_samples.jsonl │ │ │ ├── prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2 │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_43.json │ │ │ ├── metrics_g(llama7b-2-ep2)_v(easyprocess2-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2).csv │ │ │ └── responses_%step20_llama7b-2-ep2_beaml5_samples.jsonl │ │ │ ├── prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4 │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_43.json │ │ │ ├── metrics_g(llama7b-2-ep2)_v(easyprocess2-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4).csv │ │ │ └── responses_%step20_llama7b-2-ep2_beaml5_samples.jsonl │ │ │ ├── prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6 │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_43.json │ │ │ ├── metrics_g(llama7b-2-ep2)_v(easyprocess2-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6).csv │ │ │ └── responses_%step20_llama7b-2-ep2_beaml5_samples.jsonl │ │ │ ├── prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8 │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_43.json │ │ │ ├── metrics_g(llama7b-2-ep2)_v(easyprocess2-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8).csv │ │ │ └── responses_%step20_llama7b-2-ep2_beaml5_samples.jsonl │ │ │ ├── prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1 │ │ │ ├── metrics_%step20_llama7b-2-ep2-all_beaml1.json │ │ │ ├── metrics_%step20_llama7b-2-ep2-all_beaml1_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_43.json │ │ │ ├── metrics_g(llama7b-2-ep2)_v(easyprocess2progress-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1).csv │ │ │ └── responses_%step20_llama7b-2-ep2_beaml5_samples.jsonl │ │ │ ├── prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10 │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_43.json │ │ │ ├── metrics_g(llama7b-2-ep2)_v(easyprocess2progress-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10).csv │ │ │ └── responses_%step20_llama7b-2-ep2_beaml5_samples.jsonl │ │ │ ├── prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2 │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_43.json │ │ │ ├── metrics_g(llama7b-2-ep2)_v(easyprocess2progress-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2).csv │ │ │ └── responses_%step20_llama7b-2-ep2_beaml5_samples.jsonl │ │ │ ├── prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4 │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_43.json │ │ │ ├── metrics_g(llama7b-2-ep2)_v(easyprocess2progress-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4).csv │ │ │ └── responses_%step20_llama7b-2-ep2_beaml5_samples.jsonl │ │ │ ├── prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6 │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_43.json │ │ │ ├── metrics_g(llama7b-2-ep2)_v(easyprocess2progress-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6).csv │ │ │ └── responses_%step20_llama7b-2-ep2_beaml5_samples.jsonl │ │ │ └── prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8 │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_43.json │ │ │ ├── metrics_g(llama7b-2-ep2)_v(easyprocess2progress-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8).csv │ │ │ └── responses_%step20_llama7b-2-ep2_beaml5_samples.jsonl │ └── verifier │ │ └── mid │ │ ├── metrics_v(llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1)_g(llama7b-2-ep2).json │ │ ├── metrics_v(llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10)_g(llama7b-2-ep2).json │ │ ├── metrics_v(llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2)_g(llama7b-2-ep2).json │ │ ├── metrics_v(llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4)_g(llama7b-2-ep2).json │ │ ├── metrics_v(llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6)_g(llama7b-2-ep2).json │ │ ├── metrics_v(llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8)_g(llama7b-2-ep2).json │ │ └── responses_v(llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10)_g(llama7b-2-ep2)_samples.jsonl └── gsm8k │ ├── generator │ ├── test │ │ ├── all_metrics_n100_llama7b-2-ep2.csv │ │ └── all_metrics_n300_mistral7b-ep2.csv │ └── train │ │ ├── all_metrics_n100_llama7b-2-ep2.csv │ │ └── all_metrics_n100_mistral7b-ep2.csv │ ├── generator_with_verifier │ ├── test │ │ ├── llama7b-2-ep2-n100-scahead-mse-lm-token │ │ │ ├── metrics_%step100_llama7b-2-ep2_beam1.json │ │ │ ├── metrics_%step100_llama7b-2-ep2_beam10.json │ │ │ ├── metrics_%step100_llama7b-2-ep2_beam10_41.json │ │ │ ├── metrics_%step100_llama7b-2-ep2_beam10_43.json │ │ │ ├── metrics_%step100_llama7b-2-ep2_beam1_41.json │ │ │ ├── metrics_%step100_llama7b-2-ep2_beam1_43.json │ │ │ ├── metrics_%step100_llama7b-2-ep2_beam2.json │ │ │ ├── metrics_%step100_llama7b-2-ep2_beam20.json │ │ │ ├── metrics_%step100_llama7b-2-ep2_beam20_41.json │ │ │ ├── metrics_%step100_llama7b-2-ep2_beam20_43.json │ │ │ ├── metrics_%step100_llama7b-2-ep2_beam25.json │ │ │ ├── metrics_%step100_llama7b-2-ep2_beam25_41.json │ │ │ ├── metrics_%step100_llama7b-2-ep2_beam25_43.json │ │ │ ├── metrics_%step100_llama7b-2-ep2_beam2_41.json │ │ │ ├── metrics_%step100_llama7b-2-ep2_beam2_43.json │ │ │ ├── metrics_%step100_llama7b-2-ep2_beam4.json │ │ │ ├── metrics_%step100_llama7b-2-ep2_beam4_41.json │ │ │ ├── metrics_%step100_llama7b-2-ep2_beam4_43.json │ │ │ ├── metrics_%step100_llama7b-2-ep2_beam5.json │ │ │ ├── metrics_%step100_llama7b-2-ep2_beam50.json │ │ │ ├── metrics_%step100_llama7b-2-ep2_beam50_41.json │ │ │ ├── metrics_%step100_llama7b-2-ep2_beam50_43.json │ │ │ ├── metrics_%step100_llama7b-2-ep2_beam5_41.json │ │ │ ├── metrics_%step100_llama7b-2-ep2_beam5_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beam1.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beam10.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beam10_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beam10_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beam1_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beam1_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beam2.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beam2_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beam2_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beam4.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beam4_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beam4_43.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beam5.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beam5_41.json │ │ │ ├── metrics_%step20_llama7b-2-ep2_beam5_43.json │ │ │ ├── responses_%step100_llama7b-2-ep2_beam50_samples.jsonl │ │ │ └── responses_%step100_llama7b-2-ep2_beam5_samples.jsonl │ │ └── mistral7b-ep2-n100-scahead-mse-lm-token │ │ │ ├── metrics_%step100_mistral7b-ep2_beam1.json │ │ │ ├── metrics_%step100_mistral7b-ep2_beam10.json │ │ │ ├── metrics_%step100_mistral7b-ep2_beam10_41.json │ │ │ ├── metrics_%step100_mistral7b-ep2_beam10_43.json │ │ │ ├── metrics_%step100_mistral7b-ep2_beam1_41.json │ │ │ ├── metrics_%step100_mistral7b-ep2_beam1_43.json │ │ │ ├── metrics_%step100_mistral7b-ep2_beam2.json │ │ │ ├── metrics_%step100_mistral7b-ep2_beam20.json │ │ │ ├── metrics_%step100_mistral7b-ep2_beam20_41.json │ │ │ ├── metrics_%step100_mistral7b-ep2_beam20_43.json │ │ │ ├── metrics_%step100_mistral7b-ep2_beam25.json │ │ │ ├── metrics_%step100_mistral7b-ep2_beam25_41.json │ │ │ ├── metrics_%step100_mistral7b-ep2_beam25_43.json │ │ │ ├── metrics_%step100_mistral7b-ep2_beam2_41.json │ │ │ ├── metrics_%step100_mistral7b-ep2_beam2_43.json │ │ │ ├── metrics_%step100_mistral7b-ep2_beam5.json │ │ │ ├── metrics_%step100_mistral7b-ep2_beam50.json │ │ │ ├── metrics_%step100_mistral7b-ep2_beam50_41.json │ │ │ ├── metrics_%step100_mistral7b-ep2_beam50_43.json │ │ │ ├── metrics_%step100_mistral7b-ep2_beam5_41.json │ │ │ ├── metrics_%step100_mistral7b-ep2_beam5_43.json │ │ │ ├── metrics_%step20_mistral7b-ep2_beam1.json │ │ │ ├── metrics_%step20_mistral7b-ep2_beam10.json │ │ │ ├── metrics_%step20_mistral7b-ep2_beam10_41.json │ │ │ ├── metrics_%step20_mistral7b-ep2_beam10_43.json │ │ │ ├── metrics_%step20_mistral7b-ep2_beam1_41.json │ │ │ ├── metrics_%step20_mistral7b-ep2_beam1_43.json │ │ │ ├── metrics_%step20_mistral7b-ep2_beam2.json │ │ │ ├── metrics_%step20_mistral7b-ep2_beam2_41.json │ │ │ ├── metrics_%step20_mistral7b-ep2_beam2_43.json │ │ │ ├── metrics_%step20_mistral7b-ep2_beam4.json │ │ │ ├── metrics_%step20_mistral7b-ep2_beam4_41.json │ │ │ ├── metrics_%step20_mistral7b-ep2_beam4_43.json │ │ │ ├── metrics_%step20_mistral7b-ep2_beam5.json │ │ │ ├── metrics_%step20_mistral7b-ep2_beam5_41.json │ │ │ ├── metrics_%step20_mistral7b-ep2_beam5_43.json │ │ │ ├── metrics_g(mistral7b-ep2)_v(mistral7b-ep2-n100-scahead-mse-lm-token).csv │ │ │ ├── responses_%step100_mistral7b-ep2_beam50_samples.jsonl │ │ │ └── responses_%step100_mistral7b-ep2_beam5_samples.jsonl │ └── train │ │ └── llama7b-2-ep2-n100-scahead-mse-lm-token │ │ └── metrics_g(llama7b-2-ep2)_v(llama7b-2-ep2-n100-scahead-mse-lm-token).csv │ └── verifier │ ├── test │ ├── metrics_v(llama7b-2-ep2-n100-scahead-mse-lm-token)_g(llama7b-2-ep2).json │ ├── metrics_v(mistral7b-ep2-n100-scahead-mse-lm-token)_g(mistral7b-ep2).json │ ├── responses_v(llama7b-2-ep2-n100-scahead-mse-lm-token)_g(llama7b-2-ep2)_samples.jsonl │ └── responses_v(mistral7b-ep2-n100-scahead-mse-lm-token)_g(mistral7b-ep2)_samples.jsonl │ └── train │ ├── metrics_v(llama7b-2-ep2-n100-scahead-mse-lm-token)_g(llama7b-2-ep2).json │ └── responses_v(llama7b-2-ep2-n100-scahead-mse-lm-token)_g(llama7b-2-ep2)_samples.jsonl ├── eval_with_verifier.py ├── generate_paths_and_eval.py ├── requirements.txt ├── scripts ├── game24 │ ├── eval_step_beam.sh │ ├── eval_with_verifier.sh │ ├── generate.sh │ ├── greedy_eval.sh │ ├── train_generator.sh │ ├── train_verifier.sh │ └── train_verifier_process.sh └── gsm8k │ ├── eval_step_beam.sh │ ├── eval_with_verifier.sh │ ├── generate.sh │ ├── greedy_eval.sh │ ├── train_generator.sh │ └── train_verifier.sh ├── train_generator.py ├── train_verifier.py └── utils ├── cached_models.py ├── constants.py ├── datasets.py ├── flash_attn_monkey_patch.py ├── game24 ├── __init__.py ├── datasets.py ├── decoding.py └── metrics.py ├── gsm8k ├── __init__.py ├── datasets.py ├── decoding.py └── metrics.py ├── metrics.py ├── models.py ├── optim.py ├── sampling.py ├── states.py └── verifier_models.py /README.md: -------------------------------------------------------------------------------- 1 | # OVM, Outcome-supervised Value Models for Planning in Mathematical Reasoning 2 | 3 | 4 | Code, metrics, and models for the paper [Outcome-supervised Verifiers for Planning in Mathematical Reasoning](https://arxiv.org/pdf/2311.09724.pdf) 5 | 6 | 7 | The key technical implementations (`utils/sampling.py`): 8 | 9 | 1. **Value-guided beam search**: step-level beam search guided by a value model 10 | 11 | 2. **Allow batch generation with caculator using cache** (2-3 times faster than a naive implementation) 12 | 13 | 14 | ## Models and Data 15 | 16 | | Model | Dataset | Link | 17 | |----------------------|--------------|--------------------------| 18 | | OVM-Llama2-7B | GSM8K | [parameters](https://huggingface.co/FreedomIntelligence/OVM-llama2-7b) | 19 | | OVM-Mistral-7B | GSM8K | [parameters](https://huggingface.co/FreedomIntelligence/OVM-Mistral-7b) | 20 | 21 | See the training data of our value models (generated by the generators) in [dataset](https://huggingface.co/datasets/FreedomIntelligence/OVM-dataset) 22 | 23 | See the training data for Process Reward Models in GSM8K in [dataset](https://huggingface.co/datasets/FreedomIntelligence/OVM-process) 24 | 25 | ## Somethings for code 26 | 27 | 28 | 1. Directories 29 | - `configs`: for model training with `accelerate` 30 | - `data`: benchmark, and generator created data for training the value model 31 | - `eval_results`: metrics and responses 32 | - `generator`: generator-only (greedy, self-consistency, or pass@k) 33 | - `verifier`: ORM accuracy 34 | - `generator_with_verifier`: guided beam search, i.e. OVM and PRM 35 | - `scripts`: scripts for training and inference 36 | - `utils`: functions and classes 37 | 38 | 39 | 2. target_set 40 | - GSM8K: there are `train` and `test`, which corresponds to training set and test set respectively 41 | - Game of 24: there are `train` and `mid` 42 | - `train`: the first 900 problems 43 | - `mid`: problems index 901-1000 44 | 45 | 3. scripts for GSM8K and Game of 24 are similar. For simplicity, we only take GSM8K as the example below. You can simply run the same pipeline in Game of 24 by replacing `gsm8k` with `game24` 46 | 47 | 48 | 49 | ## Training 50 | 51 | ### Train the generator 52 | 53 | Training data for generator: 54 | - GSM8K: `data/gsm8k/train.jsonl`, from [OpenAI GSM8K](https://github.com/openai/grade-school-math/blob/master/grade_school_math/data/train.jsonl) 55 | - Game of 24: `data/game24/train.jsonl`, the first 900 problems in `data/game24/24.csv` (from [ToT](https://github.com/princeton-nlp/tree-of-thought-llm/blob/master/src/tot/data/24/24.csv)) with enumerated solutions 56 | 57 | To run the script `train_generator.sh` (under `scripts/gsm8k` or `scripts/game24`), you should first set `WANDB_API_KEY`, `WANDB_ENTITY`, `model_name_or_path`, `save_dir`. The generator is named by `save_generator_id` 58 | 59 | ```bash 60 | cd OVM 61 | bash scripts/gsm8k/train_generator.sh 62 | ``` 63 | 64 | 65 | ### Train the OVM 66 | 67 | #### Generation 68 | 69 | First use the generator `generator_id` to generate `n_solutions` for each question in the training set, 70 | ```bash 71 | cd OVM 72 | bash scripts/gsm8k/generate.sh 73 | ``` 74 | You should first config the path of your generator checkpoint `model_name_or_path`, and set `--target_set train` 75 | 76 | The output will be saved to `data/gsm8k/model_generation/` 77 | 78 | 79 | #### Training 80 | 81 | Train OVM using `train_verifier.sh`. First set `WANDB_API_KEY`, `WANDB_ENTITY`, `save_dir`, and `checkpoint_dir` (the path of generator checkpoint). The verifier is named with `save_verifier_id` 82 | ```bash 83 | cd OVM 84 | bash scripts/gsm8k/train_verifier.sh 85 | ``` 86 | 87 | 88 | 89 | ## Inference 90 | 91 | ### Value-Guided Beam Search 92 | 93 | Config your generator checkpoint path `model_name_or_path` and verifier checkpoint path `verifier_model_name_or_path` in `eval_step_beam.sh` 94 | ```bash 95 | cd OVM 96 | bash scripts/gsm8k/eval_step_beam.sh 97 | ``` 98 | 99 | (when `dedup_mode=1`, it will prioritize linguistically different candidates, which means when the sorted candidates are ['a', 'a', 'b', 'b', 'c'] it will select ['a', 'b', 'c'] rather than ['a', 'a', 'b'] if n_beam=3) 100 | 101 | The output will be saved to `eval_results/gsm8k/generator_with_verifier/test` 102 | (or `eval_results/game24/generator_with_verifier/mid`) 103 | 104 | 105 | ### Vanilla Sampling with ORM 106 | 107 | 1. First sample the data: config the generator checkpoint `model_name_or_path`, and set `--target_set test` 108 | ```bash 109 | cd OVM 110 | bash scripts/gsm8k/generate.sh 111 | ``` 112 | 113 | 2. Then call ORM to score and rerank the samples: config the verifier checkpoint `verifier_model_name_or_path` 114 | ```bash 115 | cd OVM 116 | bash scripts/gsm8k/eval_with_verifier.sh 117 | ``` 118 | 119 | The output will be saved to `eval_results/gsm8k/generator_with_verifier/test` 120 | 121 | 122 | 123 | ### Greedy 124 | 125 | Config your generator checkpoint path `model_name_or_path` 126 | ```bash 127 | cd OVM 128 | bash scripts/gsm8k/greedy_eval.sh 129 | ``` 130 | The output will be saved to `eval_results/gsm8k/generator/test` 131 | 132 | 133 | 134 | 135 | 136 | ## Citation 137 | ``` 138 | @misc{yu2023outcomesupervised, 139 | title={Outcome-supervised Verifiers for Planning in Mathematical Reasoning}, 140 | author={Fei Yu and Anningzhe Gao and Benyou Wang}, 141 | year={2023}, 142 | eprint={2311.09724}, 143 | archivePrefix={arXiv}, 144 | primaryClass={cs.AI} 145 | } 146 | ``` 147 | 148 | ## Star History 149 | 150 | 151 | 152 | 153 | 154 | Star History Chart 155 | 156 | 157 | -------------------------------------------------------------------------------- /configs/zero1.yaml: -------------------------------------------------------------------------------- 1 | command_file: null 2 | commands: null 3 | compute_environment: LOCAL_MACHINE 4 | deepspeed_config: 5 | deepspeed_multinode_launcher: standard 6 | gradient_clipping: 1.0 7 | zero_stage: 1 8 | distributed_type: DEEPSPEED 9 | downcast_bf16: true 10 | dynamo_backend: 'NO' 11 | gpu_ids: null 12 | machine_rank: 0 13 | main_process_ip: null 14 | main_process_port: null 15 | main_training_function: main 16 | fsdp_config: {} 17 | megatron_lm_config: {} 18 | mixed_precision: bf16 19 | num_machines: 1 20 | num_processes: 8 21 | rdzv_backend: static 22 | same_network: true 23 | tpu_name: null 24 | tpu_zone: null 25 | use_cpu: false 26 | -------------------------------------------------------------------------------- /configs/zero1_offload.yaml: -------------------------------------------------------------------------------- 1 | command_file: null 2 | commands: null 3 | compute_environment: LOCAL_MACHINE 4 | deepspeed_config: 5 | deepspeed_multinode_launcher: standard 6 | gradient_clipping: 1.0 7 | zero_stage: 1 8 | distributed_type: DEEPSPEED 9 | downcast_bf16: true 10 | dynamo_backend: 'NO' 11 | gpu_ids: null 12 | machine_rank: 0 13 | main_process_ip: null 14 | main_process_port: null 15 | main_training_function: main 16 | fsdp_config: {} 17 | megatron_lm_config: {} 18 | mixed_precision: bf16 19 | num_machines: 1 20 | num_processes: 8 21 | rdzv_backend: static 22 | same_network: true 23 | tpu_name: null 24 | tpu_zone: null 25 | use_cpu: true 26 | -------------------------------------------------------------------------------- /configs/zero2.yaml: -------------------------------------------------------------------------------- 1 | command_file: null 2 | commands: null 3 | compute_environment: LOCAL_MACHINE 4 | deepspeed_config: 5 | deepspeed_multinode_launcher: standard 6 | gradient_clipping: 1.0 7 | zero_stage: 2 8 | distributed_type: DEEPSPEED 9 | downcast_bf16: true 10 | dynamo_backend: 'NO' 11 | gpu_ids: null 12 | machine_rank: 0 13 | main_process_ip: null 14 | main_process_port: null 15 | main_training_function: main 16 | fsdp_config: {} 17 | megatron_lm_config: {} 18 | mixed_precision: bf16 19 | num_machines: 1 20 | num_processes: 8 21 | rdzv_backend: static 22 | same_network: true 23 | tpu_name: null 24 | tpu_zone: null 25 | use_cpu: false 26 | -------------------------------------------------------------------------------- /configs/zero2_offload.yaml: -------------------------------------------------------------------------------- 1 | command_file: null 2 | commands: null 3 | compute_environment: LOCAL_MACHINE 4 | deepspeed_config: 5 | deepspeed_multinode_launcher: standard 6 | gradient_clipping: 1.0 7 | offload_optimizer_device: cpu 8 | offload_param_device: cpu 9 | zero_stage: 2 10 | distributed_type: DEEPSPEED 11 | downcast_bf16: true 12 | dynamo_backend: 'NO' 13 | gpu_ids: null 14 | machine_rank: 0 15 | main_process_ip: null 16 | main_process_port: null 17 | main_training_function: main 18 | fsdp_config: {} 19 | megatron_lm_config: {} 20 | mixed_precision: bf16 21 | num_machines: 1 22 | num_processes: 8 23 | rdzv_backend: static 24 | same_network: true 25 | tpu_name: null 26 | tpu_zone: null 27 | use_cpu: true 28 | -------------------------------------------------------------------------------- /eval_results/game24/generator/mid/metrics_llama7b-2-ep2.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.11 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator/mid/responses_llama7b-2-ep2_samples.jsonl: -------------------------------------------------------------------------------- 1 | {"idx": 0, "input": "Use numbers and basic arithmetic operations (+ - * /) to obtain 24. Each step, you are only allowed to choose two of the remaining numbers to obtain a new number.\nInput: 4 5 6 10\n", "question": "4 5 6 10", "response": "5 + 6 = 11 (left: 4 10 11)\n10 + 11 = 21 (left: 4 21)\n4 + 21 = 25 (left: 25)\nThe answer is 4 + (10 + (5 + 6)) = 25", "response_answer": "4 + (10 + (5 + 6))", "label": false} 2 | {"idx": 1, "input": "Use numbers and basic arithmetic operations (+ - * /) to obtain 24. Each step, you are only allowed to choose two of the remaining numbers to obtain a new number.\nInput: 1 2 4 7\n", "question": "1 2 4 7", "response": "1 + 7 = 8 (left: 2 4 8)\n4 + 8 = 12 (left: 2 12)\n2 * 12 = 24 (left: 24)\nThe answer is 2 * (4 + (1 + 7)) = 24", "response_answer": "2 * (4 + (1 + 7))", "label": true} 3 | {"idx": 2, "input": "Use numbers and basic arithmetic operations (+ - * /) to obtain 24. Each step, you are only allowed to choose two of the remaining numbers to obtain a new number.\nInput: 2 5 8 11\n", "question": "2 5 8 11", "response": "2 - 5 = -3 (left: 8 11 -3)\n8 + 11 = 19 (left: -3 19)\n19 - -3 = 22 (left: 22)\nThe answer is (8 + 11) - (2 - 5) = 22", "response_answer": "(8 + 11) - (2 - 5)", "label": false} 4 | {"idx": 3, "input": "Use numbers and basic arithmetic operations (+ - * /) to obtain 24. Each step, you are only allowed to choose two of the remaining numbers to obtain a new number.\nInput: 3 4 4 13\n", "question": "3 4 4 13", "response": "4 - 3 = 1 (left: 4 13 1)\n13 - 1 = 12 (left: 4 12)\n4 * 12 = 48 (left: 48)\nThe answer is 4 * (13 - (4 - 3)) = 48", "response_answer": "4 * (13 - (4 - 3))", "label": false} 5 | {"idx": 4, "input": "Use numbers and basic arithmetic operations (+ - * /) to obtain 24. Each step, you are only allowed to choose two of the remaining numbers to obtain a new number.\nInput: 6 7 8 9\n", "question": "6 7 8 9", "response": "6 - 7 = -1 (left: 8 9 -1)\n8 + -1 = 7 (left: 9 7)\n9 + 7 = 16 (left: 16)\nThe answer is 9 + (8 + (6 - 7)) = 16", "response_answer": "9 + (8 + (6 - 7))", "label": false} 6 | {"idx": 5, "input": "Use numbers and basic arithmetic operations (+ - * /) to obtain 24. Each step, you are only allowed to choose two of the remaining numbers to obtain a new number.\nInput: 1 11 11 13\n", "question": "1 11 11 13", "response": "11 + 13 = 24 (left: 1 11 24)\n1 - 11 = -10 (left: 24 -10)\n24 - -10 = 34 (left: 34)\nThe answer is (11 + 13) - (1 - 11) = 34", "response_answer": "(11 + 13) - (1 - 11)", "label": false} 7 | {"idx": 6, "input": "Use numbers and basic arithmetic operations (+ - * /) to obtain 24. Each step, you are only allowed to choose two of the remaining numbers to obtain a new number.\nInput: 1 8 10 11\n", "question": "1 8 10 11", "response": "10 - 1 = 9 (left: 8 11 9)\n11 + 9 = 20 (left: 8 20)\n8 + 20 = 28 (left: 28)\nThe answer is 8 + (11 + (10 - 1)) = 28", "response_answer": "8 + (11 + (10 - 1))", "label": false} 8 | {"idx": 7, "input": "Use numbers and basic arithmetic operations (+ - * /) to obtain 24. Each step, you are only allowed to choose two of the remaining numbers to obtain a new number.\nInput: 2 3 6 9\n", "question": "2 3 6 9", "response": "2 * 9 = 18 (left: 3 6 18)\n3 + 18 = 21 (left: 6 21)\n6 + 21 = 27 (left: 27)\nThe answer is 6 + (3 + (2 * 9)) = 27", "response_answer": "6 + (3 + (2 * 9))", "label": false} 9 | {"idx": 8, "input": "Use numbers and basic arithmetic operations (+ - * /) to obtain 24. Each step, you are only allowed to choose two of the remaining numbers to obtain a new number.\nInput: 1 3 5 9\n", "question": "1 3 5 9", "response": "1 * 5 = 5 (left: 3 9 5)\n9 - 5 = 4 (left: 3 4)\n3 * 4 = 12 (left: 12)\nThe answer is 3 * (9 - (1 * 5)) = 12", "response_answer": "3 * (9 - (1 * 5))", "label": false} 10 | {"idx": 9, "input": "Use numbers and basic arithmetic operations (+ - * /) to obtain 24. Each step, you are only allowed to choose two of the remaining numbers to obtain a new number.\nInput: 3 3 7 12\n", "question": "3 3 7 12", "response": "3 + 7 = 10 (left: 3 12 10)\n12 + 10 = 22 (left: 3 22)\n3 + 22 = 25 (left: 25)\nThe answer is 3 + (12 + (3 + 7)) = 25", "response_answer": "3 + (12 + (3 + 7))", "label": false} -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml1.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.36 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml10.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.65 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml10_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.66 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml10_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.59 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml1_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.35 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml1_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.34 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml2.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.52 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml2_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.49 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml2_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.45 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml4.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.66 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml4_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.68 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml4_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.6 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml5.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.69 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml5_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.65 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml5_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.67 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_g(llama7b-2-ep2)_v(llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1).csv: -------------------------------------------------------------------------------- 1 | n_solution,acc 2 | 5,0.25 3 | 10,0.41 4 | 15,0.56 5 | 20,0.63 6 | 25,0.69 7 | 30,0.73 8 | 35,0.78 9 | 40,0.81 10 | 45,0.86 11 | 50,0.87 12 | 55,0.89 13 | 60,0.9 14 | 65,0.9 15 | 70,0.9 16 | 75,0.92 17 | 80,0.92 18 | 85,0.92 19 | 90,0.93 20 | 95,0.94 21 | 100,0.95 22 | 105,0.95 23 | 110,0.95 24 | 115,0.95 25 | 120,0.95 26 | 125,0.95 27 | 130,0.95 28 | 135,0.95 29 | 140,0.95 30 | 145,0.95 31 | 150,0.95 32 | 155,0.95 33 | 160,0.97 34 | 165,0.97 35 | 170,0.97 36 | 175,0.97 37 | 180,0.97 38 | 185,0.97 39 | 190,0.97 40 | 195,0.98 41 | 200,0.98 42 | 205,0.98 43 | 210,0.98 44 | 215,0.98 45 | 220,0.98 46 | 225,0.98 47 | 230,0.98 48 | 235,0.98 49 | 240,0.98 50 | 245,0.99 51 | 250,0.99 52 | 255,0.99 53 | 260,0.99 54 | 265,0.99 55 | 270,0.99 56 | 275,0.99 57 | 280,0.99 58 | 285,0.99 59 | 290,0.99 60 | 295,0.99 61 | 300,0.99 62 | -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step100_llama7b-2-ep2_beaml1.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.84 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step100_llama7b-2-ep2_beaml10.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.98 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step100_llama7b-2-ep2_beaml10_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.98 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step100_llama7b-2-ep2_beaml10_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.97 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step100_llama7b-2-ep2_beaml1_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.85 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step100_llama7b-2-ep2_beaml1_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.84 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step100_llama7b-2-ep2_beaml2.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.92 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step100_llama7b-2-ep2_beaml20.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.95 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step100_llama7b-2-ep2_beaml20_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.95 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step100_llama7b-2-ep2_beaml20_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.97 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step100_llama7b-2-ep2_beaml25.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.98 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step100_llama7b-2-ep2_beaml25_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 1.0 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step100_llama7b-2-ep2_beaml25_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.97 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step100_llama7b-2-ep2_beaml2_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.88 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step100_llama7b-2-ep2_beaml2_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.88 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step100_llama7b-2-ep2_beaml4.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.91 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step100_llama7b-2-ep2_beaml4_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.91 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step100_llama7b-2-ep2_beaml4_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.92 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step100_llama7b-2-ep2_beaml5.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.93 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step100_llama7b-2-ep2_beaml50.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.96 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step100_llama7b-2-ep2_beaml50_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.95 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step100_llama7b-2-ep2_beaml50_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.92 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step100_llama7b-2-ep2_beaml5_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.92 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step100_llama7b-2-ep2_beaml5_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.95 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml1.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.76 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml10.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.71 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml10_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.7 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml10_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.76 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml1_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.74 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml1_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.8 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml2.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.81 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml2_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.77 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml2_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.7 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml4.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.81 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml4_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.78 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml4_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.77 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml5.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.79 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml5_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.75 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml5_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.8 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step50_llama7b-2-ep2_beaml1.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.86 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step50_llama7b-2-ep2_beaml10.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.94 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step50_llama7b-2-ep2_beaml10_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.92 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step50_llama7b-2-ep2_beaml10_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.92 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step50_llama7b-2-ep2_beaml1_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.85 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step50_llama7b-2-ep2_beaml1_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.83 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step50_llama7b-2-ep2_beaml2.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.87 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step50_llama7b-2-ep2_beaml25.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.9 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step50_llama7b-2-ep2_beaml25_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.9 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step50_llama7b-2-ep2_beaml25_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.9 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step50_llama7b-2-ep2_beaml2_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.88 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step50_llama7b-2-ep2_beaml2_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.87 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step50_llama7b-2-ep2_beaml5.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.9 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step50_llama7b-2-ep2_beaml5_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.89 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step50_llama7b-2-ep2_beaml5_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.9 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step80_llama7b-2-ep2_beaml1.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.83 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step80_llama7b-2-ep2_beaml10.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.98 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step80_llama7b-2-ep2_beaml10_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.97 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step80_llama7b-2-ep2_beaml10_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.97 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step80_llama7b-2-ep2_beaml1_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.84 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step80_llama7b-2-ep2_beaml1_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.88 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step80_llama7b-2-ep2_beaml2.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.88 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step80_llama7b-2-ep2_beaml20.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.96 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step80_llama7b-2-ep2_beaml20_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.96 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step80_llama7b-2-ep2_beaml20_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.96 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step80_llama7b-2-ep2_beaml2_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.9 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step80_llama7b-2-ep2_beaml2_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.88 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step80_llama7b-2-ep2_beaml4.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.92 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step80_llama7b-2-ep2_beaml40.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.95 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step80_llama7b-2-ep2_beaml40_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.95 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step80_llama7b-2-ep2_beaml40_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.95 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step80_llama7b-2-ep2_beaml4_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.92 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step80_llama7b-2-ep2_beaml4_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.93 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step80_llama7b-2-ep2_beaml5.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.92 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step80_llama7b-2-ep2_beaml5_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.93 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step80_llama7b-2-ep2_beaml5_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.91 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_g(llama7b-2-ep2)_v(llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10).csv: -------------------------------------------------------------------------------- 1 | n_solution,acc 2 | 5,0.25 3 | 10,0.41 4 | 15,0.56 5 | 20,0.62 6 | 25,0.68 7 | 30,0.72 8 | 35,0.77 9 | 40,0.8 10 | 45,0.85 11 | 50,0.86 12 | 55,0.88 13 | 60,0.89 14 | 65,0.89 15 | 70,0.89 16 | 75,0.92 17 | 80,0.92 18 | 85,0.92 19 | 90,0.93 20 | 95,0.94 21 | 100,0.95 22 | 105,0.95 23 | 110,0.95 24 | 115,0.95 25 | 120,0.95 26 | 125,0.95 27 | 130,0.95 28 | 135,0.95 29 | 140,0.95 30 | 145,0.95 31 | 150,0.95 32 | 155,0.95 33 | 160,0.97 34 | 165,0.97 35 | 170,0.97 36 | 175,0.97 37 | 180,0.97 38 | 185,0.97 39 | 190,0.97 40 | 195,0.98 41 | 200,0.98 42 | 205,0.98 43 | 210,0.98 44 | 215,0.98 45 | 220,0.98 46 | 225,0.98 47 | 230,0.98 48 | 235,0.98 49 | 240,0.98 50 | 245,0.99 51 | 250,0.99 52 | 255,0.99 53 | 260,0.99 54 | 265,0.99 55 | 270,0.99 56 | 275,0.99 57 | 280,0.99 58 | 285,0.99 59 | 290,0.99 60 | 295,0.99 61 | 300,0.99 62 | -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml1.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.5 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml10.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.76 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml10_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.64 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml10_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.66 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml1_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.46 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml1_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.5 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml2.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.6 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml2_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.58 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml2_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.62 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml4.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.71 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml4_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.64 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml4_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.71 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml5.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.66 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml5_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.74 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml5_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.77 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_g(llama7b-2-ep2)_v(llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2).csv: -------------------------------------------------------------------------------- 1 | n_solution,acc 2 | 5,0.25 3 | 10,0.41 4 | 15,0.56 5 | 20,0.63 6 | 25,0.69 7 | 30,0.73 8 | 35,0.78 9 | 40,0.81 10 | 45,0.86 11 | 50,0.87 12 | 55,0.89 13 | 60,0.9 14 | 65,0.9 15 | 70,0.9 16 | 75,0.92 17 | 80,0.92 18 | 85,0.92 19 | 90,0.93 20 | 95,0.94 21 | 100,0.95 22 | 105,0.95 23 | 110,0.95 24 | 115,0.95 25 | 120,0.95 26 | 125,0.95 27 | 130,0.95 28 | 135,0.95 29 | 140,0.95 30 | 145,0.95 31 | 150,0.95 32 | 155,0.95 33 | 160,0.97 34 | 165,0.97 35 | 170,0.97 36 | 175,0.97 37 | 180,0.97 38 | 185,0.97 39 | 190,0.97 40 | 195,0.98 41 | 200,0.98 42 | 205,0.98 43 | 210,0.98 44 | 215,0.98 45 | 220,0.98 46 | 225,0.98 47 | 230,0.98 48 | 235,0.98 49 | 240,0.98 50 | 245,0.99 51 | 250,0.99 52 | 255,0.99 53 | 260,0.99 54 | 265,0.99 55 | 270,0.99 56 | 275,0.99 57 | 280,0.99 58 | 285,0.99 59 | 290,0.99 60 | 295,0.99 61 | 300,0.99 62 | -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml1.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.6 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml10.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.66 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml10_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.64 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml10_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.68 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml1_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.54 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml1_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.61 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml2.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.66 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml2_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.67 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml2_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.73 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml4.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.73 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml4_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.76 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml4_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.72 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml5.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.74 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml5_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.77 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml5_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.75 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_g(llama7b-2-ep2)_v(llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4).csv: -------------------------------------------------------------------------------- 1 | n_solution,acc 2 | 5,0.25 3 | 10,0.41 4 | 15,0.56 5 | 20,0.63 6 | 25,0.69 7 | 30,0.73 8 | 35,0.78 9 | 40,0.81 10 | 45,0.86 11 | 50,0.87 12 | 55,0.89 13 | 60,0.9 14 | 65,0.9 15 | 70,0.9 16 | 75,0.92 17 | 80,0.92 18 | 85,0.92 19 | 90,0.93 20 | 95,0.94 21 | 100,0.95 22 | 105,0.95 23 | 110,0.95 24 | 115,0.95 25 | 120,0.95 26 | 125,0.95 27 | 130,0.95 28 | 135,0.95 29 | 140,0.95 30 | 145,0.95 31 | 150,0.95 32 | 155,0.95 33 | 160,0.97 34 | 165,0.97 35 | 170,0.97 36 | 175,0.97 37 | 180,0.97 38 | 185,0.97 39 | 190,0.97 40 | 195,0.98 41 | 200,0.98 42 | 205,0.98 43 | 210,0.98 44 | 215,0.98 45 | 220,0.98 46 | 225,0.98 47 | 230,0.98 48 | 235,0.98 49 | 240,0.98 50 | 245,0.99 51 | 250,0.99 52 | 255,0.99 53 | 260,0.99 54 | 265,0.99 55 | 270,0.99 56 | 275,0.99 57 | 280,0.99 58 | 285,0.99 59 | 290,0.99 60 | 295,0.99 61 | 300,0.99 62 | -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml1.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.68 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml10.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.74 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml10_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.72 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml10_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.71 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml1_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.65 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml1_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.69 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml2.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.77 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml2_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.78 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml2_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.76 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml4.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.78 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml4_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.81 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml4_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.77 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml5.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.79 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml5_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.76 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml5_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.78 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_g(llama7b-2-ep2)_v(llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6).csv: -------------------------------------------------------------------------------- 1 | n_solution,acc 2 | 5,0.25 3 | 10,0.41 4 | 15,0.56 5 | 20,0.62 6 | 25,0.68 7 | 30,0.72 8 | 35,0.77 9 | 40,0.8 10 | 45,0.85 11 | 50,0.86 12 | 55,0.88 13 | 60,0.89 14 | 65,0.89 15 | 70,0.89 16 | 75,0.91 17 | 80,0.91 18 | 85,0.91 19 | 90,0.92 20 | 95,0.93 21 | 100,0.94 22 | 105,0.94 23 | 110,0.95 24 | 115,0.95 25 | 120,0.95 26 | 125,0.95 27 | 130,0.95 28 | 135,0.95 29 | 140,0.95 30 | 145,0.95 31 | 150,0.95 32 | 155,0.95 33 | 160,0.97 34 | 165,0.97 35 | 170,0.97 36 | 175,0.97 37 | 180,0.97 38 | 185,0.97 39 | 190,0.97 40 | 195,0.98 41 | 200,0.98 42 | 205,0.98 43 | 210,0.98 44 | 215,0.98 45 | 220,0.98 46 | 225,0.98 47 | 230,0.98 48 | 235,0.98 49 | 240,0.98 50 | 245,0.99 51 | 250,0.99 52 | 255,0.99 53 | 260,0.99 54 | 265,0.99 55 | 270,0.99 56 | 275,0.99 57 | 280,0.99 58 | 285,0.99 59 | 290,0.99 60 | 295,0.99 61 | 300,0.99 62 | -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml1.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.7 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml10.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.7 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml10_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.74 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml10_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.74 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml1_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.72 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml1_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.7 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml2.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.73 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml2_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.7 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml2_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.78 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml4.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.7 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml4_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.78 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml4_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.73 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml5.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.79 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml5_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.78 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml5_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.74 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_g(llama7b-2-ep2)_v(llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8).csv: -------------------------------------------------------------------------------- 1 | n_solution,acc 2 | 5,0.25 3 | 10,0.41 4 | 15,0.56 5 | 20,0.62 6 | 25,0.68 7 | 30,0.72 8 | 35,0.77 9 | 40,0.8 10 | 45,0.85 11 | 50,0.86 12 | 55,0.88 13 | 60,0.89 14 | 65,0.89 15 | 70,0.89 16 | 75,0.91 17 | 80,0.91 18 | 85,0.91 19 | 90,0.92 20 | 95,0.93 21 | 100,0.94 22 | 105,0.94 23 | 110,0.95 24 | 115,0.95 25 | 120,0.95 26 | 125,0.95 27 | 130,0.95 28 | 135,0.95 29 | 140,0.95 30 | 145,0.95 31 | 150,0.95 32 | 155,0.95 33 | 160,0.97 34 | 165,0.97 35 | 170,0.97 36 | 175,0.97 37 | 180,0.97 38 | 185,0.97 39 | 190,0.97 40 | 195,0.98 41 | 200,0.98 42 | 205,0.98 43 | 210,0.98 44 | 215,0.98 45 | 220,0.98 46 | 225,0.98 47 | 230,0.98 48 | 235,0.98 49 | 240,0.98 50 | 245,0.99 51 | 250,0.99 52 | 255,0.99 53 | 260,0.99 54 | 265,0.99 55 | 270,0.99 56 | 275,0.99 57 | 280,0.99 58 | 285,0.99 59 | 290,0.99 60 | 295,0.99 61 | 300,0.99 62 | -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml1.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.13 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml10.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.48 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml10_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.58 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml10_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.48 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml1_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.19 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml1_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.18 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml2.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.23 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml2_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.28 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml2_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.21 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml4.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.34 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml4_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.37 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml4_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.41 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml5.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.45 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml5_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.46 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml5_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.41 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_g(llama7b-2-ep2)_v(easyprocess2-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1).csv: -------------------------------------------------------------------------------- 1 | n_solution,acc 2 | 5,0.25 3 | 10,0.38 4 | 15,0.54 5 | 20,0.6 6 | 25,0.68 7 | 30,0.72 8 | 35,0.76 9 | 40,0.8 10 | 45,0.86 11 | 50,0.87 12 | 55,0.89 13 | 60,0.9 14 | 65,0.9 15 | 70,0.9 16 | 75,0.92 17 | 80,0.92 18 | 85,0.92 19 | 90,0.93 20 | 95,0.94 21 | 100,0.95 22 | 105,0.94 23 | 110,0.94 24 | 115,0.94 25 | 120,0.94 26 | 125,0.94 27 | 130,0.94 28 | 135,0.94 29 | 140,0.94 30 | 145,0.94 31 | 150,0.94 32 | 155,0.94 33 | 160,0.96 34 | 165,0.96 35 | 170,0.96 36 | 175,0.96 37 | 180,0.96 38 | 185,0.96 39 | 190,0.96 40 | 195,0.97 41 | 200,0.97 42 | 205,0.97 43 | 210,0.98 44 | 215,0.98 45 | 220,0.98 46 | 225,0.98 47 | 230,0.98 48 | 235,0.98 49 | 240,0.98 50 | 245,0.99 51 | 250,0.99 52 | 255,0.99 53 | 260,0.99 54 | 265,0.99 55 | 270,0.99 56 | 275,0.99 57 | 280,0.99 58 | 285,0.99 59 | 290,0.99 60 | 295,0.99 61 | 300,0.99 62 | -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml1.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.07 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml10.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.51 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml10_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.58 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml10_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.51 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml1_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.05 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml1_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.06 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml2.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.09 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml2_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.13 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml2_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.07 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml4.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.26 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml4_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.25 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml4_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.21 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml5.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.33 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml5_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.23 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml5_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.27 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_g(llama7b-2-ep2)_v(easyprocess2-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10).csv: -------------------------------------------------------------------------------- 1 | n_solution,acc 2 | 5,0.25 3 | 10,0.37 4 | 15,0.52 5 | 20,0.6 6 | 25,0.68 7 | 30,0.73 8 | 35,0.77 9 | 40,0.8 10 | 45,0.86 11 | 50,0.86 12 | 55,0.88 13 | 60,0.89 14 | 65,0.89 15 | 70,0.89 16 | 75,0.91 17 | 80,0.91 18 | 85,0.92 19 | 90,0.93 20 | 95,0.94 21 | 100,0.95 22 | 105,0.95 23 | 110,0.95 24 | 115,0.95 25 | 120,0.95 26 | 125,0.95 27 | 130,0.95 28 | 135,0.95 29 | 140,0.95 30 | 145,0.95 31 | 150,0.95 32 | 155,0.95 33 | 160,0.97 34 | 165,0.97 35 | 170,0.97 36 | 175,0.97 37 | 180,0.97 38 | 185,0.97 39 | 190,0.97 40 | 195,0.98 41 | 200,0.98 42 | 205,0.98 43 | 210,0.98 44 | 215,0.98 45 | 220,0.98 46 | 225,0.98 47 | 230,0.98 48 | 235,0.98 49 | 240,0.98 50 | 245,0.99 51 | 250,0.99 52 | 255,0.99 53 | 260,0.99 54 | 265,0.99 55 | 270,0.99 56 | 275,0.99 57 | 280,0.99 58 | 285,0.99 59 | 290,0.99 60 | 295,0.99 61 | 300,0.99 62 | -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml1.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.07 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml10.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.51 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml10_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.55 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml10_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.48 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml1_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.1 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml1_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.07 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml2.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.13 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml2_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.16 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml2_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.13 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml4.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.28 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml4_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.26 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml4_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.28 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml5.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.3 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml5_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.35 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml5_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.33 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_g(llama7b-2-ep2)_v(easyprocess2-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2).csv: -------------------------------------------------------------------------------- 1 | n_solution,acc 2 | 5,0.25 3 | 10,0.37 4 | 15,0.52 5 | 20,0.59 6 | 25,0.68 7 | 30,0.72 8 | 35,0.76 9 | 40,0.8 10 | 45,0.85 11 | 50,0.86 12 | 55,0.88 13 | 60,0.89 14 | 65,0.89 15 | 70,0.89 16 | 75,0.92 17 | 80,0.92 18 | 85,0.92 19 | 90,0.93 20 | 95,0.94 21 | 100,0.95 22 | 105,0.94 23 | 110,0.94 24 | 115,0.94 25 | 120,0.94 26 | 125,0.94 27 | 130,0.94 28 | 135,0.94 29 | 140,0.94 30 | 145,0.94 31 | 150,0.94 32 | 155,0.94 33 | 160,0.96 34 | 165,0.96 35 | 170,0.96 36 | 175,0.96 37 | 180,0.96 38 | 185,0.96 39 | 190,0.96 40 | 195,0.97 41 | 200,0.97 42 | 205,0.97 43 | 210,0.97 44 | 215,0.97 45 | 220,0.97 46 | 225,0.97 47 | 230,0.97 48 | 235,0.97 49 | 240,0.97 50 | 245,0.98 51 | 250,0.98 52 | 255,0.98 53 | 260,0.98 54 | 265,0.98 55 | 270,0.98 56 | 275,0.98 57 | 280,0.98 58 | 285,0.98 59 | 290,0.98 60 | 295,0.98 61 | 300,0.98 62 | -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml1.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.09 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml10.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.58 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml10_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.56 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml10_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.54 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml1_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.1 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml1_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.1 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml2.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.17 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml2_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.22 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml2_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.18 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml4.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.29 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml4_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.26 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml4_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.29 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml5.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.37 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml5_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.43 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml5_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.42 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_g(llama7b-2-ep2)_v(easyprocess2-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4).csv: -------------------------------------------------------------------------------- 1 | n_solution,acc 2 | 5,0.25 3 | 10,0.4 4 | 15,0.55 5 | 20,0.62 6 | 25,0.69 7 | 30,0.73 8 | 35,0.78 9 | 40,0.8 10 | 45,0.86 11 | 50,0.86 12 | 55,0.88 13 | 60,0.9 14 | 65,0.9 15 | 70,0.9 16 | 75,0.92 17 | 80,0.92 18 | 85,0.92 19 | 90,0.93 20 | 95,0.94 21 | 100,0.95 22 | 105,0.94 23 | 110,0.94 24 | 115,0.94 25 | 120,0.94 26 | 125,0.94 27 | 130,0.94 28 | 135,0.94 29 | 140,0.94 30 | 145,0.94 31 | 150,0.94 32 | 155,0.94 33 | 160,0.96 34 | 165,0.96 35 | 170,0.96 36 | 175,0.96 37 | 180,0.96 38 | 185,0.96 39 | 190,0.96 40 | 195,0.97 41 | 200,0.97 42 | 205,0.97 43 | 210,0.97 44 | 215,0.97 45 | 220,0.97 46 | 225,0.97 47 | 230,0.97 48 | 235,0.97 49 | 240,0.97 50 | 245,0.98 51 | 250,0.98 52 | 255,0.98 53 | 260,0.98 54 | 265,0.98 55 | 270,0.98 56 | 275,0.98 57 | 280,0.98 58 | 285,0.98 59 | 290,0.98 60 | 295,0.98 61 | 300,0.98 62 | -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml1.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.07 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml10.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.47 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml10_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.43 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml10_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.37 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml1_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.05 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml1_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.08 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml2.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.13 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml2_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.09 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml2_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.14 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml4.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.22 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml4_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.19 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml4_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.21 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml5.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.25 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml5_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.29 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml5_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.29 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_g(llama7b-2-ep2)_v(easyprocess2-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6).csv: -------------------------------------------------------------------------------- 1 | n_solution,acc 2 | 5,0.25 3 | 10,0.38 4 | 15,0.53 5 | 20,0.6 6 | 25,0.68 7 | 30,0.73 8 | 35,0.77 9 | 40,0.8 10 | 45,0.86 11 | 50,0.86 12 | 55,0.88 13 | 60,0.89 14 | 65,0.89 15 | 70,0.89 16 | 75,0.91 17 | 80,0.91 18 | 85,0.92 19 | 90,0.93 20 | 95,0.94 21 | 100,0.95 22 | 105,0.95 23 | 110,0.95 24 | 115,0.95 25 | 120,0.95 26 | 125,0.95 27 | 130,0.95 28 | 135,0.95 29 | 140,0.95 30 | 145,0.95 31 | 150,0.95 32 | 155,0.95 33 | 160,0.97 34 | 165,0.97 35 | 170,0.97 36 | 175,0.97 37 | 180,0.97 38 | 185,0.97 39 | 190,0.97 40 | 195,0.98 41 | 200,0.98 42 | 205,0.98 43 | 210,0.98 44 | 215,0.98 45 | 220,0.98 46 | 225,0.98 47 | 230,0.98 48 | 235,0.98 49 | 240,0.98 50 | 245,0.99 51 | 250,0.99 52 | 255,0.99 53 | 260,0.99 54 | 265,0.99 55 | 270,0.99 56 | 275,0.99 57 | 280,0.99 58 | 285,0.99 59 | 290,0.99 60 | 295,0.99 61 | 300,0.99 62 | -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml1.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.09 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml10.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.55 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml10_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.53 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml10_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.55 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml1_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.08 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml1_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.1 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml2.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.13 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml2_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.14 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml2_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.18 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml4.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.23 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml4_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.34 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml4_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.31 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml5.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.24 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml5_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.36 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml5_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.32 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_g(llama7b-2-ep2)_v(easyprocess2-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8).csv: -------------------------------------------------------------------------------- 1 | n_solution,acc 2 | 5,0.25 3 | 10,0.37 4 | 15,0.52 5 | 20,0.59 6 | 25,0.68 7 | 30,0.73 8 | 35,0.77 9 | 40,0.8 10 | 45,0.86 11 | 50,0.86 12 | 55,0.88 13 | 60,0.89 14 | 65,0.89 15 | 70,0.89 16 | 75,0.91 17 | 80,0.91 18 | 85,0.92 19 | 90,0.93 20 | 95,0.94 21 | 100,0.95 22 | 105,0.95 23 | 110,0.95 24 | 115,0.95 25 | 120,0.95 26 | 125,0.95 27 | 130,0.95 28 | 135,0.95 29 | 140,0.95 30 | 145,0.95 31 | 150,0.95 32 | 155,0.95 33 | 160,0.97 34 | 165,0.97 35 | 170,0.97 36 | 175,0.97 37 | 180,0.97 38 | 185,0.97 39 | 190,0.97 40 | 195,0.98 41 | 200,0.98 42 | 205,0.98 43 | 210,0.98 44 | 215,0.98 45 | 220,0.98 46 | 225,0.98 47 | 230,0.98 48 | 235,0.98 49 | 240,0.98 50 | 245,0.99 51 | 250,0.99 52 | 255,0.99 53 | 260,0.99 54 | 265,0.99 55 | 270,0.99 56 | 275,0.99 57 | 280,0.99 58 | 285,0.99 59 | 290,0.99 60 | 295,0.99 61 | 300,0.99 62 | -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2-all_beaml1.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.54 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2-all_beaml1_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.6 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml1.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.48 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml10.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.67 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml10_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.79 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml10_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.66 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml1_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.49 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml1_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.53 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml2.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.55 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml2_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.59 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml2_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.64 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml4.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.76 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml4_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.65 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml4_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.64 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml5.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.68 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml5_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.63 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml5_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.67 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_g(llama7b-2-ep2)_v(easyprocess2progress-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1).csv: -------------------------------------------------------------------------------- 1 | n_solution,acc 2 | 5,0.25 3 | 10,0.38 4 | 15,0.54 5 | 20,0.61 6 | 25,0.68 7 | 30,0.72 8 | 35,0.76 9 | 40,0.8 10 | 45,0.86 11 | 50,0.87 12 | 55,0.89 13 | 60,0.9 14 | 65,0.9 15 | 70,0.9 16 | 75,0.92 17 | 80,0.92 18 | 85,0.92 19 | 90,0.93 20 | 95,0.94 21 | 100,0.94 22 | 105,0.94 23 | 110,0.94 24 | 115,0.94 25 | 120,0.94 26 | 125,0.94 27 | 130,0.94 28 | 135,0.94 29 | 140,0.94 30 | 145,0.94 31 | 150,0.94 32 | 155,0.94 33 | 160,0.96 34 | 165,0.96 35 | 170,0.96 36 | 175,0.96 37 | 180,0.96 38 | 185,0.96 39 | 190,0.96 40 | 195,0.97 41 | 200,0.97 42 | 205,0.97 43 | 210,0.97 44 | 215,0.97 45 | 220,0.97 46 | 225,0.97 47 | 230,0.97 48 | 235,0.97 49 | 240,0.97 50 | 245,0.98 51 | 250,0.98 52 | 255,0.98 53 | 260,0.98 54 | 265,0.98 55 | 270,0.98 56 | 275,0.98 57 | 280,0.98 58 | 285,0.98 59 | 290,0.98 60 | 295,0.98 61 | 300,0.98 62 | -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml1.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.72 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml10.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.68 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml10_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.71 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml10_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.67 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml1_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.72 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml1_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.74 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml2.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.77 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml2_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.77 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml2_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.82 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml4.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.8 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml4_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.79 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml4_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.74 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml5.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.76 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml5_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.71 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml5_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.73 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_g(llama7b-2-ep2)_v(easyprocess2progress-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10).csv: -------------------------------------------------------------------------------- 1 | n_solution,acc 2 | 5,0.25 3 | 10,0.39 4 | 15,0.54 5 | 20,0.62 6 | 25,0.69 7 | 30,0.73 8 | 35,0.77 9 | 40,0.8 10 | 45,0.85 11 | 50,0.85 12 | 55,0.87 13 | 60,0.88 14 | 65,0.88 15 | 70,0.88 16 | 75,0.91 17 | 80,0.91 18 | 85,0.92 19 | 90,0.93 20 | 95,0.94 21 | 100,0.95 22 | 105,0.95 23 | 110,0.95 24 | 115,0.95 25 | 120,0.95 26 | 125,0.95 27 | 130,0.95 28 | 135,0.95 29 | 140,0.95 30 | 145,0.95 31 | 150,0.95 32 | 155,0.95 33 | 160,0.97 34 | 165,0.97 35 | 170,0.97 36 | 175,0.97 37 | 180,0.97 38 | 185,0.97 39 | 190,0.97 40 | 195,0.98 41 | 200,0.98 42 | 205,0.98 43 | 210,0.98 44 | 215,0.98 45 | 220,0.98 46 | 225,0.98 47 | 230,0.98 48 | 235,0.98 49 | 240,0.98 50 | 245,0.99 51 | 250,0.99 52 | 255,0.99 53 | 260,0.99 54 | 265,0.99 55 | 270,0.99 56 | 275,0.99 57 | 280,0.99 58 | 285,0.99 59 | 290,0.99 60 | 295,0.99 61 | 300,0.99 62 | -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml1.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.75 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml10.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.75 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml10_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.62 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml10_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.63 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml1_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.7 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml1_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.77 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml2.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.78 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml2_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.72 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml2_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.8 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml4.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.8 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml4_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.81 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml4_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.75 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml5.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.71 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml5_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.7 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml5_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.74 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_g(llama7b-2-ep2)_v(easyprocess2progress-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2).csv: -------------------------------------------------------------------------------- 1 | n_solution,acc 2 | 5,0.25 3 | 10,0.37 4 | 15,0.52 5 | 20,0.59 6 | 25,0.68 7 | 30,0.72 8 | 35,0.75 9 | 40,0.78 10 | 45,0.84 11 | 50,0.84 12 | 55,0.86 13 | 60,0.88 14 | 65,0.89 15 | 70,0.89 16 | 75,0.91 17 | 80,0.91 18 | 85,0.91 19 | 90,0.92 20 | 95,0.93 21 | 100,0.94 22 | 105,0.93 23 | 110,0.94 24 | 115,0.94 25 | 120,0.94 26 | 125,0.94 27 | 130,0.94 28 | 135,0.94 29 | 140,0.94 30 | 145,0.94 31 | 150,0.94 32 | 155,0.94 33 | 160,0.96 34 | 165,0.96 35 | 170,0.96 36 | 175,0.96 37 | 180,0.96 38 | 185,0.96 39 | 190,0.96 40 | 195,0.97 41 | 200,0.97 42 | 205,0.97 43 | 210,0.97 44 | 215,0.97 45 | 220,0.97 46 | 225,0.97 47 | 230,0.97 48 | 235,0.97 49 | 240,0.97 50 | 245,0.98 51 | 250,0.98 52 | 255,0.98 53 | 260,0.98 54 | 265,0.98 55 | 270,0.98 56 | 275,0.98 57 | 280,0.98 58 | 285,0.98 59 | 290,0.98 60 | 295,0.98 61 | 300,0.98 62 | -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml1.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.77 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml10.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.66 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml10_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.69 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml10_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.64 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml1_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.79 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml1_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.82 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml2.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.78 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml2_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.77 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml2_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.84 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml4.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.75 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml4_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.77 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml4_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.77 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml5.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.74 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml5_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.73 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml5_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.73 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_g(llama7b-2-ep2)_v(easyprocess2progress-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4).csv: -------------------------------------------------------------------------------- 1 | n_solution,acc 2 | 5,0.25 3 | 10,0.38 4 | 15,0.53 5 | 20,0.6 6 | 25,0.69 7 | 30,0.73 8 | 35,0.77 9 | 40,0.8 10 | 45,0.85 11 | 50,0.85 12 | 55,0.87 13 | 60,0.88 14 | 65,0.88 15 | 70,0.88 16 | 75,0.91 17 | 80,0.91 18 | 85,0.92 19 | 90,0.93 20 | 95,0.94 21 | 100,0.95 22 | 105,0.95 23 | 110,0.95 24 | 115,0.95 25 | 120,0.95 26 | 125,0.95 27 | 130,0.95 28 | 135,0.95 29 | 140,0.95 30 | 145,0.95 31 | 150,0.95 32 | 155,0.95 33 | 160,0.97 34 | 165,0.97 35 | 170,0.97 36 | 175,0.97 37 | 180,0.97 38 | 185,0.97 39 | 190,0.97 40 | 195,0.98 41 | 200,0.98 42 | 205,0.98 43 | 210,0.98 44 | 215,0.98 45 | 220,0.98 46 | 225,0.98 47 | 230,0.98 48 | 235,0.98 49 | 240,0.98 50 | 245,0.99 51 | 250,0.99 52 | 255,0.99 53 | 260,0.99 54 | 265,0.99 55 | 270,0.99 56 | 275,0.99 57 | 280,0.99 58 | 285,0.99 59 | 290,0.99 60 | 295,0.99 61 | 300,0.99 62 | -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml1.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.79 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml10.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.75 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml10_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.65 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml10_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.69 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml1_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.72 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml1_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.78 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml2.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.78 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml2_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.85 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml2_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.76 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml4.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.79 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml4_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.79 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml4_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.8 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml5.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.74 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml5_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.81 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml5_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.86 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_g(llama7b-2-ep2)_v(easyprocess2progress-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6).csv: -------------------------------------------------------------------------------- 1 | n_solution,acc 2 | 5,0.25 3 | 10,0.39 4 | 15,0.54 5 | 20,0.62 6 | 25,0.69 7 | 30,0.73 8 | 35,0.77 9 | 40,0.8 10 | 45,0.85 11 | 50,0.86 12 | 55,0.88 13 | 60,0.89 14 | 65,0.89 15 | 70,0.89 16 | 75,0.92 17 | 80,0.92 18 | 85,0.92 19 | 90,0.93 20 | 95,0.94 21 | 100,0.95 22 | 105,0.95 23 | 110,0.95 24 | 115,0.95 25 | 120,0.95 26 | 125,0.95 27 | 130,0.95 28 | 135,0.95 29 | 140,0.95 30 | 145,0.95 31 | 150,0.95 32 | 155,0.95 33 | 160,0.97 34 | 165,0.97 35 | 170,0.97 36 | 175,0.97 37 | 180,0.97 38 | 185,0.97 39 | 190,0.97 40 | 195,0.98 41 | 200,0.98 42 | 205,0.98 43 | 210,0.98 44 | 215,0.98 45 | 220,0.98 46 | 225,0.98 47 | 230,0.98 48 | 235,0.98 49 | 240,0.98 50 | 245,0.99 51 | 250,0.99 52 | 255,0.99 53 | 260,0.99 54 | 265,0.99 55 | 270,0.99 56 | 275,0.99 57 | 280,0.99 58 | 285,0.99 59 | 290,0.99 60 | 295,0.99 61 | 300,0.99 62 | -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml1.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.73 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml10.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.66 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml10_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.66 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml10_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.56 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml1_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.75 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml1_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.77 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml2.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.78 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml2_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.77 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml2_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.82 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml4.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.71 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml4_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.75 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml4_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.77 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml5.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.73 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml5_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.78 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml5_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.76 3 | } -------------------------------------------------------------------------------- /eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_g(llama7b-2-ep2)_v(easyprocess2progress-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8).csv: -------------------------------------------------------------------------------- 1 | n_solution,acc 2 | 5,0.25 3 | 10,0.38 4 | 15,0.54 5 | 20,0.61 6 | 25,0.68 7 | 30,0.73 8 | 35,0.76 9 | 40,0.79 10 | 45,0.85 11 | 50,0.85 12 | 55,0.87 13 | 60,0.88 14 | 65,0.88 15 | 70,0.88 16 | 75,0.91 17 | 80,0.91 18 | 85,0.92 19 | 90,0.93 20 | 95,0.94 21 | 100,0.95 22 | 105,0.95 23 | 110,0.95 24 | 115,0.95 25 | 120,0.95 26 | 125,0.95 27 | 130,0.95 28 | 135,0.95 29 | 140,0.95 30 | 145,0.95 31 | 150,0.95 32 | 155,0.95 33 | 160,0.97 34 | 165,0.97 35 | 170,0.97 36 | 175,0.97 37 | 180,0.97 38 | 185,0.97 39 | 190,0.97 40 | 195,0.98 41 | 200,0.98 42 | 205,0.98 43 | 210,0.98 44 | 215,0.98 45 | 220,0.98 46 | 225,0.98 47 | 230,0.98 48 | 235,0.98 49 | 240,0.98 50 | 245,0.99 51 | 250,0.99 52 | 255,0.99 53 | 260,0.99 54 | 265,0.99 55 | 270,0.99 56 | 275,0.99 57 | 280,0.99 58 | 285,0.99 59 | 290,0.99 60 | 295,0.99 61 | 300,0.99 62 | -------------------------------------------------------------------------------- /eval_results/game24/verifier/mid/metrics_v(llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1)_g(llama7b-2-ep2).json: -------------------------------------------------------------------------------- 1 | { 2 | "#question": 100, 3 | "#solution_per_problem": 300, 4 | "#total_solutions": 30000, 5 | "accuracy": 0.9986666666666667, 6 | "mpk": { 7 | "mp1": { 8 | "mpk": 0.99, 9 | "mpk_adapt": 1.0, 10 | "filter_mpk": 1.0, 11 | "n_example_filter": 99 12 | }, 13 | "mp3": { 14 | "mpk": 0.9733333333333334, 15 | "mpk_adapt": 1.0, 16 | "filter_mpk": 1.0, 17 | "n_example_filter": 96 18 | }, 19 | "mp5": { 20 | "mpk": 0.96, 21 | "mpk_adapt": 0.997979797979798, 22 | "filter_mpk": 0.9978723404255319, 23 | "n_example_filter": 94 24 | }, 25 | "mp10": { 26 | "mpk": 0.888, 27 | "mpk_adapt": 0.9961840628507295, 28 | "filter_mpk": 0.9986111111111112, 29 | "n_example_filter": 72 30 | } 31 | } 32 | } -------------------------------------------------------------------------------- /eval_results/game24/verifier/mid/metrics_v(llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10)_g(llama7b-2-ep2).json: -------------------------------------------------------------------------------- 1 | { 2 | "#question": 100, 3 | "#solution_per_problem": 300, 4 | "#total_solutions": 30000, 5 | "accuracy": 0.9993, 6 | "mpk": { 7 | "mp1": { 8 | "mpk": 0.99, 9 | "mpk_adapt": 1.0, 10 | "filter_mpk": 1.0, 11 | "n_example_filter": 99 12 | }, 13 | "mp3": { 14 | "mpk": 0.9733333333333334, 15 | "mpk_adapt": 1.0, 16 | "filter_mpk": 1.0, 17 | "n_example_filter": 96 18 | }, 19 | "mp5": { 20 | "mpk": 0.9620000000000001, 21 | "mpk_adapt": 1.0, 22 | "filter_mpk": 1.0, 23 | "n_example_filter": 94 24 | }, 25 | "mp10": { 26 | "mpk": 0.8889999999999999, 27 | "mpk_adapt": 0.9988776655443322, 28 | "filter_mpk": 1.0, 29 | "n_example_filter": 72 30 | } 31 | } 32 | } -------------------------------------------------------------------------------- /eval_results/game24/verifier/mid/metrics_v(llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2)_g(llama7b-2-ep2).json: -------------------------------------------------------------------------------- 1 | { 2 | "#question": 100, 3 | "#solution_per_problem": 300, 4 | "#total_solutions": 30000, 5 | "accuracy": 0.9986666666666667, 6 | "mpk": { 7 | "mp1": { 8 | "mpk": 0.99, 9 | "mpk_adapt": 1.0, 10 | "filter_mpk": 1.0, 11 | "n_example_filter": 99 12 | }, 13 | "mp3": { 14 | "mpk": 0.9733333333333334, 15 | "mpk_adapt": 1.0, 16 | "filter_mpk": 1.0, 17 | "n_example_filter": 96 18 | }, 19 | "mp5": { 20 | "mpk": 0.9620000000000001, 21 | "mpk_adapt": 1.0, 22 | "filter_mpk": 1.0, 23 | "n_example_filter": 94 24 | }, 25 | "mp10": { 26 | "mpk": 0.8889999999999999, 27 | "mpk_adapt": 0.9988776655443322, 28 | "filter_mpk": 1.0, 29 | "n_example_filter": 72 30 | } 31 | } 32 | } -------------------------------------------------------------------------------- /eval_results/game24/verifier/mid/metrics_v(llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4)_g(llama7b-2-ep2).json: -------------------------------------------------------------------------------- 1 | { 2 | "#question": 100, 3 | "#solution_per_problem": 300, 4 | "#total_solutions": 30000, 5 | "accuracy": 0.9989, 6 | "mpk": { 7 | "mp1": { 8 | "mpk": 0.99, 9 | "mpk_adapt": 1.0, 10 | "filter_mpk": 1.0, 11 | "n_example_filter": 99 12 | }, 13 | "mp3": { 14 | "mpk": 0.9733333333333334, 15 | "mpk_adapt": 1.0, 16 | "filter_mpk": 1.0, 17 | "n_example_filter": 96 18 | }, 19 | "mp5": { 20 | "mpk": 0.9620000000000001, 21 | "mpk_adapt": 1.0, 22 | "filter_mpk": 1.0, 23 | "n_example_filter": 94 24 | }, 25 | "mp10": { 26 | "mpk": 0.8889999999999999, 27 | "mpk_adapt": 0.9971941638608306, 28 | "filter_mpk": 1.0, 29 | "n_example_filter": 72 30 | } 31 | } 32 | } -------------------------------------------------------------------------------- /eval_results/game24/verifier/mid/metrics_v(llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6)_g(llama7b-2-ep2).json: -------------------------------------------------------------------------------- 1 | { 2 | "#question": 100, 3 | "#solution_per_problem": 300, 4 | "#total_solutions": 30000, 5 | "accuracy": 0.9991333333333333, 6 | "mpk": { 7 | "mp1": { 8 | "mpk": 0.99, 9 | "mpk_adapt": 1.0, 10 | "filter_mpk": 1.0, 11 | "n_example_filter": 99 12 | }, 13 | "mp3": { 14 | "mpk": 0.9733333333333334, 15 | "mpk_adapt": 1.0, 16 | "filter_mpk": 1.0, 17 | "n_example_filter": 96 18 | }, 19 | "mp5": { 20 | "mpk": 0.96, 21 | "mpk_adapt": 0.997979797979798, 22 | "filter_mpk": 0.9978723404255319, 23 | "n_example_filter": 94 24 | }, 25 | "mp10": { 26 | "mpk": 0.888, 27 | "mpk_adapt": 0.9961840628507295, 28 | "filter_mpk": 0.9986111111111112, 29 | "n_example_filter": 72 30 | } 31 | } 32 | } -------------------------------------------------------------------------------- /eval_results/game24/verifier/mid/metrics_v(llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8)_g(llama7b-2-ep2).json: -------------------------------------------------------------------------------- 1 | { 2 | "#question": 100, 3 | "#solution_per_problem": 300, 4 | "#total_solutions": 30000, 5 | "accuracy": 0.9993, 6 | "mpk": { 7 | "mp1": { 8 | "mpk": 0.99, 9 | "mpk_adapt": 1.0, 10 | "filter_mpk": 1.0, 11 | "n_example_filter": 99 12 | }, 13 | "mp3": { 14 | "mpk": 0.97, 15 | "mpk_adapt": 0.9966329966329966, 16 | "filter_mpk": 0.9965277777777778, 17 | "n_example_filter": 96 18 | }, 19 | "mp5": { 20 | "mpk": 0.96, 21 | "mpk_adapt": 0.997979797979798, 22 | "filter_mpk": 0.9978723404255319, 23 | "n_example_filter": 94 24 | }, 25 | "mp10": { 26 | "mpk": 0.8889999999999999, 27 | "mpk_adapt": 0.9973063973063974, 28 | "filter_mpk": 0.9986111111111112, 29 | "n_example_filter": 72 30 | } 31 | } 32 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator/test/all_metrics_n100_llama7b-2-ep2.csv: -------------------------------------------------------------------------------- 1 | n_solution,pass@k,pass@majority 2 | 5,0.6292645943896892,0.41546626231993933 3 | 10,0.7247915087187263,0.4890068233510235 4 | 15,0.7922668688400303,0.514783927217589 5 | 20,0.8233510235026535,0.5291887793783169 6 | 25,0.844579226686884,0.5413191811978771 7 | 30,0.8597422289613343,0.5511751326762699 8 | 35,0.8688400303260045,0.558756633813495 9 | 40,0.8832448824867324,0.5640636846095527 10 | 45,0.88855193328279,0.5678544351781653 11 | 50,0.8961334344200151,0.5595147839272175 12 | 55,0.9044730856709629,0.5625473843821076 13 | 60,0.910538286580743,0.5678544351781653 14 | 65,0.9173616376042456,0.5701288855193328 15 | 70,0.9226686884003032,0.5724033358605004 16 | 75,0.9249431387414708,0.574677786201668 17 | 80,0.9302501895375285,0.5769522365428355 18 | 85,0.931008339651251,0.5769522365428355 19 | 90,0.9347990902198635,0.5799848369977255 20 | 95,0.9370735405610311,0.5716451857467779 21 | 100,0.9438968915845337,0.5731614859742229 22 | -------------------------------------------------------------------------------- /eval_results/gsm8k/generator/test/all_metrics_n300_mistral7b-ep2.csv: -------------------------------------------------------------------------------- 1 | n_solution,pass@k,pass@majority 2 | 5,0.7907505686125853,0.6322971948445792 3 | 10,0.8673237300985596,0.6679302501895376 4 | 15,0.8991660348749052,0.6853677028051555 5 | 20,0.9166034874905231,0.6974981046247157 6 | 25,0.9287338893100834,0.6952236542835482 7 | 30,0.9370735405610311,0.7035633055344959 8 | 35,0.9446550416982562,0.7073540561031084 9 | 40,0.9492039423805914,0.711144806671721 10 | 45,0.9514783927217589,0.7088703563305534 11 | 50,0.956027293404094,0.7156937073540561 12 | 55,0.959059893858984,0.7179681576952237 13 | 60,0.9628506444275967,0.7194844579226687 14 | 65,0.9666413949962093,0.7240333586050038 15 | 70,0.9673995451099318,0.7225170583775588 16 | 75,0.9681576952236542,0.7156937073540561 17 | 80,0.9704321455648218,0.7194844579226687 18 | 85,0.9719484457922669,0.7240333586050038 19 | 90,0.9727065959059894,0.7255496588324488 20 | 95,0.9742228961334344,0.7232752084912812 21 | 100,0.976497346474602,0.7270659590598939 22 | 105,0.9772554965883244,0.7255496588324488 23 | 110,0.978013646702047,0.7247915087187263 24 | 115,0.978013646702047,0.7263078089461713 25 | 120,0.978013646702047,0.7285822592873389 26 | 125,0.979529946929492,0.7308567096285065 27 | 130,0.9818043972706596,0.7308567096285065 28 | 135,0.9818043972706596,0.733131159969674 29 | 140,0.9833206974981046,0.7293404094010614 30 | 145,0.9833206974981046,0.7293404094010614 31 | 150,0.9840788476118272,0.7278241091736164 32 | 155,0.9848369977255497,0.7285822592873389 33 | 160,0.9848369977255497,0.730098559514784 34 | 165,0.9848369977255497,0.730098559514784 35 | 170,0.9848369977255497,0.730098559514784 36 | 175,0.9848369977255497,0.731614859742229 37 | 180,0.9848369977255497,0.7308567096285065 38 | 185,0.9855951478392722,0.7308567096285065 39 | 190,0.9855951478392722,0.7293404094010614 40 | 195,0.9871114480667172,0.730098559514784 41 | 200,0.9878695981804397,0.7308567096285065 42 | 205,0.9878695981804397,0.730098559514784 43 | 210,0.9886277482941622,0.7308567096285065 44 | 215,0.9893858984078847,0.733131159969674 45 | 220,0.9893858984078847,0.7338893100833965 46 | 225,0.9901440485216073,0.7338893100833965 47 | 230,0.9901440485216073,0.7338893100833965 48 | 235,0.9901440485216073,0.7323730098559514 49 | 240,0.9901440485216073,0.7323730098559514 50 | 245,0.9901440485216073,0.7293404094010614 51 | 250,0.9901440485216073,0.7285822592873389 52 | 255,0.9901440485216073,0.7263078089461713 53 | 260,0.9901440485216073,0.7270659590598939 54 | 265,0.9901440485216073,0.7285822592873389 55 | 270,0.9909021986353298,0.730098559514784 56 | 275,0.9909021986353298,0.7285822592873389 57 | 280,0.9916603487490523,0.7285822592873389 58 | 285,0.9916603487490523,0.7278241091736164 59 | 290,0.9916603487490523,0.7285822592873389 60 | 295,0.9916603487490523,0.7285822592873389 61 | 300,0.9916603487490523,0.7270659590598939 62 | -------------------------------------------------------------------------------- /eval_results/gsm8k/generator/train/all_metrics_n100_llama7b-2-ep2.csv: -------------------------------------------------------------------------------- 1 | n_solution,pass@k,pass@majority 2 | 5,0.6716178241670012,0.48467817476247826 3 | 10,0.7707747892412685,0.5517195236183594 4 | 15,0.8170748026227753,0.5727284892278871 5 | 20,0.8455774120165931,0.5878495918640438 6 | 25,0.8676568981667335,0.598420982202596 7 | 30,0.8823765556001606,0.6002943931486685 8 | 35,0.8954904322226682,0.6048441054462732 9 | 40,0.9039207814799947,0.6112672286899505 10 | 45,0.9114144252642847,0.6143449752442125 11 | 50,0.918506623845845,0.6144787903117891 12 | 55,0.9235915964137562,0.6176903519336278 13 | 60,0.9296132744547037,0.6200990231500066 14 | 65,0.9336277264820019,0.6191623176769704 15 | 70,0.9383112538471832,0.6215709888933494 16 | 75,0.9413890004014452,0.6213033587581962 17 | 80,0.9444667469557072,0.6213033587581962 18 | 85,0.9467416031045096,0.6246487354476113 19 | 90,0.9488826441857353,0.6235782149069985 20 | 95,0.9512913154021143,0.6246487354476113 21 | 100,0.9527632811454569,0.6245149203800348 22 | -------------------------------------------------------------------------------- /eval_results/gsm8k/generator/train/all_metrics_n100_mistral7b-ep2.csv: -------------------------------------------------------------------------------- 1 | n_solution,pass@k,pass@majority 2 | 5,0.8731433159373746,0.7490967482938579 3 | 10,0.9211829251973772,0.7903117891074535 4 | 15,0.9408537401311388,0.8030242205272313 5 | 20,0.9542352468887997,0.8085106382978723 6 | 25,0.9606583701324769,0.8157366519470093 7 | 30,0.9656095276328115,0.8194834738391543 8 | 35,0.9689549043222266,0.8222935902582631 9 | 40,0.9720326508764887,0.8237655560016057 10 | 45,0.9741736919577144,0.8243008162719122 11 | 50,0.9760471029037869,0.8272447477585976 12 | 55,0.9785895891877425,0.8293857888398234 13 | 60,0.9803291850662385,0.8267094874882912 14 | 65,0.9811320754716981,0.8283152682992105 15 | 70,0.9827378562826175,0.8299210491101298 16 | 75,0.9838083768232303,0.8305901244480128 17 | 80,0.9848788973638432,0.8313930148534725 18 | 85,0.9860832329720326,0.8320620901913556 19 | 90,0.9863508631071859,0.8321959052589322 20 | 95,0.9867523083099157,0.833266425799545 21 | 100,0.9871537535126456,0.8338016860698515 22 | -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step100_llama7b-2-ep2_beam1.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.6702047005307051 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step100_llama7b-2-ep2_beam10.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.714177407126611 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step100_llama7b-2-ep2_beam10_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.7043214556482184 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step100_llama7b-2-ep2_beam10_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.7119029567854435 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step100_llama7b-2-ep2_beam1_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.6777862016679302 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step100_llama7b-2-ep2_beam1_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.6868840030326004 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step100_llama7b-2-ep2_beam2.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.6944655041698257 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step100_llama7b-2-ep2_beam20.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.7164518574677786 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step100_llama7b-2-ep2_beam20_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.7179681576952237 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step100_llama7b-2-ep2_beam20_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.7278241091736164 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step100_llama7b-2-ep2_beam25.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.7232752084912812 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step100_llama7b-2-ep2_beam25_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.7323730098559514 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step100_llama7b-2-ep2_beam25_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.7361637604245641 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step100_llama7b-2-ep2_beam2_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.6830932524639879 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step100_llama7b-2-ep2_beam2_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.6921910538286581 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step100_llama7b-2-ep2_beam4.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.7012888551933283 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step100_llama7b-2-ep2_beam4_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.7020470053070508 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step100_llama7b-2-ep2_beam4_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.7156937073540561 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step100_llama7b-2-ep2_beam5.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.7012888551933283 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step100_llama7b-2-ep2_beam50.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.7422289613343442 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step100_llama7b-2-ep2_beam50_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.7376800606520091 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step100_llama7b-2-ep2_beam50_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.7323730098559514 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step100_llama7b-2-ep2_beam5_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.6974981046247157 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step100_llama7b-2-ep2_beam5_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.7035633055344959 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step20_llama7b-2-ep2_beam1.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.6664139499620925 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step20_llama7b-2-ep2_beam10.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.6952236542835482 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step20_llama7b-2-ep2_beam10_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.6846095526914329 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step20_llama7b-2-ep2_beam10_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.690674753601213 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step20_llama7b-2-ep2_beam1_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.6664139499620925 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step20_llama7b-2-ep2_beam1_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.6580742987111448 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step20_llama7b-2-ep2_beam2.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.6702047005307051 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step20_llama7b-2-ep2_beam2_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.6785443517816527 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step20_llama7b-2-ep2_beam2_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.6755117513267627 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step20_llama7b-2-ep2_beam4.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.6937073540561031 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step20_llama7b-2-ep2_beam4_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.686125852918878 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step20_llama7b-2-ep2_beam4_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.689158453373768 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step20_llama7b-2-ep2_beam5.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.6899166034874905 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step20_llama7b-2-ep2_beam5_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.6876421531463229 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step20_llama7b-2-ep2_beam5_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.6868840030326004 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step100_mistral7b-ep2_beam1.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.8112206216830933 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step100_mistral7b-ep2_beam10.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.8385140257771039 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step100_mistral7b-ep2_beam10_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.8483699772554966 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step100_mistral7b-ep2_beam10_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.8392721758908264 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step100_mistral7b-ep2_beam1_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.8180439727065959 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step100_mistral7b-ep2_beam1_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.8074298711144806 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step100_mistral7b-ep2_beam2.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.8218347232752085 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step100_mistral7b-ep2_beam20.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.8430629264594389 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step100_mistral7b-ep2_beam20_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.8438210765731615 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step100_mistral7b-ep2_beam20_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.8415466262319939 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step100_mistral7b-ep2_beam25.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.8476118271417741 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step100_mistral7b-ep2_beam25_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.8506444275966641 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step100_mistral7b-ep2_beam25_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.8438210765731615 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step100_mistral7b-ep2_beam2_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.819560272934041 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step100_mistral7b-ep2_beam2_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.8188021228203184 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step100_mistral7b-ep2_beam5.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.8332069749810462 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step100_mistral7b-ep2_beam50.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.8521607278241091 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step100_mistral7b-ep2_beam50_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.8415466262319939 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step100_mistral7b-ep2_beam50_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.8468536770280516 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step100_mistral7b-ep2_beam5_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.824109173616376 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step100_mistral7b-ep2_beam5_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.8332069749810462 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step20_mistral7b-ep2_beam1.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.8119787717968158 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step20_mistral7b-ep2_beam10.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.8248673237300985 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step20_mistral7b-ep2_beam10_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.8256254738438211 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step20_mistral7b-ep2_beam10_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.8278999241849886 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step20_mistral7b-ep2_beam1_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.796057619408643 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step20_mistral7b-ep2_beam1_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.7945413191811979 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step20_mistral7b-ep2_beam2.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.819560272934041 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step20_mistral7b-ep2_beam2_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.8119787717968158 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step20_mistral7b-ep2_beam2_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.8188021228203184 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step20_mistral7b-ep2_beam4.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.819560272934041 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step20_mistral7b-ep2_beam4_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.8248673237300985 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step20_mistral7b-ep2_beam4_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.8188021228203184 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step20_mistral7b-ep2_beam5.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.824109173616376 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step20_mistral7b-ep2_beam5_41.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.8233510235026535 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step20_mistral7b-ep2_beam5_43.json: -------------------------------------------------------------------------------- 1 | { 2 | "accuracy": 0.8150113722517058 3 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_g(mistral7b-ep2)_v(mistral7b-ep2-n100-scahead-mse-lm-token).csv: -------------------------------------------------------------------------------- 1 | n_solution,acc 2 | 5,0.7407126611068992 3 | 10,0.7922668688400303 4 | 15,0.8157695223654283 5 | 20,0.821076573161486 6 | 25,0.8294162244124337 7 | 30,0.8332069749810462 8 | 35,0.8316906747536013 9 | 40,0.8347232752084913 10 | 45,0.8415466262319939 11 | 50,0.8430629264594389 12 | 55,0.8423047763457164 13 | 60,0.846095526914329 14 | 65,0.8476118271417741 15 | 70,0.8514025777103866 16 | 75,0.8514025777103866 17 | 80,0.849128127369219 18 | 85,0.8476118271417741 19 | 90,0.8468536770280516 20 | 95,0.8468536770280516 21 | 100,0.846095526914329 22 | 105,0.8453373768006065 23 | 110,0.8453373768006065 24 | 115,0.8468536770280516 25 | 120,0.846095526914329 26 | 125,0.849128127369219 27 | 130,0.8483699772554966 28 | 135,0.8483699772554966 29 | 140,0.8468536770280516 30 | 145,0.846095526914329 31 | 150,0.8483699772554966 32 | 155,0.8498862774829417 33 | 160,0.8498862774829417 34 | 165,0.8498862774829417 35 | 170,0.8506444275966641 36 | 175,0.8498862774829417 37 | 180,0.8514025777103866 38 | 185,0.8506444275966641 39 | 190,0.8506444275966641 40 | 195,0.849128127369219 41 | 200,0.849128127369219 42 | 205,0.8498862774829417 43 | 210,0.8498862774829417 44 | 215,0.8506444275966641 45 | 220,0.8514025777103866 46 | 225,0.8514025777103866 47 | 230,0.8529188779378317 48 | 235,0.8544351781652767 49 | 240,0.8567096285064443 50 | 245,0.8559514783927218 51 | 250,0.8559514783927218 52 | 255,0.8559514783927218 53 | 260,0.8559514783927218 54 | 265,0.8559514783927218 55 | 270,0.8551933282789992 56 | 275,0.8544351781652767 57 | 280,0.8559514783927218 58 | 285,0.8559514783927218 59 | 290,0.8559514783927218 60 | 295,0.8567096285064443 61 | 300,0.8567096285064443 62 | -------------------------------------------------------------------------------- /eval_results/gsm8k/generator_with_verifier/train/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_g(llama7b-2-ep2)_v(llama7b-2-ep2-n100-scahead-mse-lm-token).csv: -------------------------------------------------------------------------------- 1 | n_solution,acc 2 | 5,0.63575538605647 3 | 10,0.7090860430884518 4 | 15,0.7371872072795397 5 | 20,0.7539140907266159 6 | 25,0.7664927070788171 7 | 30,0.7758597618091797 8 | 35,0.7825505151880102 9 | 40,0.7876354877559213 10 | 45,0.7928542753914091 11 | 50,0.7964672822159775 12 | 55,0.8003479191756991 13 | 60,0.8032918506623846 14 | 65,0.80623578214907 15 | 70,0.808644453365449 16 | 75,0.8094473437709087 17 | 80,0.809313528703332 18 | 85,0.8106516793790981 19 | 90,0.8105178643115215 20 | 95,0.8111869396494045 21 | 100,0.8127927204603238 22 | -------------------------------------------------------------------------------- /eval_results/gsm8k/verifier/test/metrics_v(llama7b-2-ep2-n100-scahead-mse-lm-token)_g(llama7b-2-ep2).json: -------------------------------------------------------------------------------- 1 | { 2 | "#question": 1319, 3 | "#solution_per_problem": 500, 4 | "#total_solutions": 659500, 5 | "accuracy": 0.9066353297952995, 6 | "mp1": 0.7293404094010614 7 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/verifier/test/metrics_v(mistral7b-ep2-n100-scahead-mse-lm-token)_g(mistral7b-ep2).json: -------------------------------------------------------------------------------- 1 | { 2 | "#question": 1319, 3 | "#solution_per_problem": 300, 4 | "#total_solutions": 395700, 5 | "accuracy": 0.9070710133939853, 6 | "mp1": 0.8567096285064443 7 | } -------------------------------------------------------------------------------- /eval_results/gsm8k/verifier/train/metrics_v(llama7b-2-ep2-n100-scahead-mse-lm-token)_g(llama7b-2-ep2).json: -------------------------------------------------------------------------------- 1 | { 2 | "#question": 7473, 3 | "#solution_per_problem": 100, 4 | "#total_solutions": 747300, 5 | "accuracy": 0.9572862304295464, 6 | "mp1": 0.8127927204603238 7 | } -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch==1.13.1 2 | deepspeed==0.10.0 3 | accelerate==0.20.3 4 | transformers==4.34.1 5 | wandb==0.15.4 -------------------------------------------------------------------------------- /scripts/game24/eval_step_beam.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | generator_id=llama7b-2-ep2 4 | verifier_id=n100-scahead-mse-lm-token-bsz128-ep10 5 | 6 | n_beam=4 7 | n_sampling_steps=20 8 | 9 | 10 | model_name_or_path=~/models/game24/generators/${generator_id} 11 | verifier_model_name_or_path=~/models/game24/verifiers/${generator_id}-${verifier_id} 12 | 13 | 14 | accelerate launch \ 15 | --main_process_port=20659 \ 16 | eval_generator_by_step.py \ 17 | --model_name_or_path ${model_name_or_path} \ 18 | --verifier_model_name_or_path ${verifier_model_name_or_path} \ 19 | --dataset game24 \ 20 | --data_dir data/game24 \ 21 | --output_dir eval_results/game24/generator_with_verifier \ 22 | --target_set mid \ 23 | --inference_mode beam \ 24 | --batch_size 30 \ 25 | --vs_batch_size 64 \ 26 | --n_beam ${n_beam} \ 27 | --n_sampling_steps ${n_sampling_steps} \ 28 | --max_n_step 10 \ 29 | --max_step_length 100 \ 30 | --dedup_mode 1 \ 31 | --do_sample True \ 32 | --temperature 0.7 \ 33 | --top_k 50 \ 34 | --top_p 1.0 \ 35 | --max_new_tokens 400 \ 36 | --seed 42 37 | 38 | 39 | -------------------------------------------------------------------------------- /scripts/game24/eval_with_verifier.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | generator_id=llama7b-2-ep2 4 | verifier_id=n100-scahead-mse-lm-token-bsz128-ep10 5 | verifier_model_name_or_path=~/models/game24/verifiers/${generator_id}-${verifier_id} 6 | 7 | accelerate launch \ 8 | --main_process_port=29510 \ 9 | eval_with_verifier.py \ 10 | --model_name_or_path ${verifier_model_name_or_path} \ 11 | --data_dir data/game24/model_generation \ 12 | --verifier_output_dir eval_results/game24/verifier \ 13 | --generator_metric_dir eval_results/game24/generator_with_verifier \ 14 | --generator_id ${generator_id} \ 15 | --target_set mid \ 16 | --batch_size 64 \ 17 | --seed 42 18 | 19 | 20 | -------------------------------------------------------------------------------- /scripts/game24/generate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | generator_id=llama7b-2-ep2 4 | n_solutions=100 5 | 6 | model_name_or_path=~/models/game24/generators/${generator_id} 7 | 8 | accelerate launch \ 9 | --main_process_port=20658 \ 10 | generate_paths_and_eval.py \ 11 | --model_name_or_path ${model_name_or_path} \ 12 | --dataset game24 \ 13 | --data_dir data/game24 \ 14 | --output_dir data/game24/model_generation \ 15 | --metric_output_dir eval_results/game24/generator \ 16 | --target_set train \ 17 | --n_solutions ${n_solutions} \ 18 | --batch_size 16 \ 19 | --do_sample True \ 20 | --temperature 0.7 \ 21 | --top_k 50 \ 22 | --top_p 1.0 \ 23 | --max_new_tokens 400 24 | -------------------------------------------------------------------------------- /scripts/game24/greedy_eval.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | generator_id=llama7b-2-ep2 4 | model_name_or_path=~/models/game24/generators/${generator_id} 5 | 6 | accelerate launch \ 7 | --main_process_port=20659 \ 8 | generate_paths_and_eval.py \ 9 | --model_name_or_path ${model_name_or_path} \ 10 | --dataset game24 \ 11 | --data_dir data/game24 \ 12 | --output_dir eval_results/game24/generator \ 13 | --metric_output_dir eval_results/game24/generator \ 14 | --target_set mid \ 15 | --batch_size 32 \ 16 | --do_sample False \ 17 | --max_new_tokens 400 \ 18 | --seed 42 19 | -------------------------------------------------------------------------------- /scripts/game24/train_generator.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export WANDB_API_KEY= 4 | export WANDB_PROJECT=GAME24-Generator-Finetune 5 | export WANDB_ENTITY= 6 | 7 | 8 | model_name_or_path=meta-llama/Llama-2-7b-hf 9 | save_generator_id=llama7b-2-ep2 10 | 11 | save_dir=~/models/game24/generators/${save_generator_id}/ 12 | export WANDB_NAME=${save_generator_id} 13 | 14 | 15 | 16 | 17 | accelerate launch \ 18 | --config_file ./configs/zero1.yaml \ 19 | --main_process_port=20650 \ 20 | train_generator.py \ 21 | --model_name_or_path ${model_name_or_path} \ 22 | --dataset game24 \ 23 | --data_dir data/game24 \ 24 | --target_set train \ 25 | --save_dir ${save_dir} \ 26 | --num_train_epoches 2 \ 27 | --eval_steps 200 \ 28 | --per_device_train_batch_size 16 \ 29 | --per_device_eval_batch_size 4 \ 30 | --gradient_accumulation_steps 1 \ 31 | --gradient_checkpointing True \ 32 | --learning_rate 1e-5 \ 33 | --weight_decay 0 \ 34 | --lr_scheduler_type "linear" \ 35 | --warmup_steps 0 \ 36 | --save_steps 200 \ 37 | --save_best False \ 38 | --save_total_limit 0 \ 39 | --logging_dir ./wandb \ 40 | --logging_steps 8 \ 41 | --seed 42 42 | 43 | -------------------------------------------------------------------------------- /scripts/game24/train_verifier.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export WANDB_API_KEY= 4 | export WANDB_PROJECT=GAME24-Verifier 5 | export WANDB_ENTITY= 6 | 7 | 8 | n_solution=100 9 | generator_id=llama7b-2-ep2 10 | save_verifier_id=n${n_solution}-scahead-mse-lm-token-bsz128-ep10 11 | 12 | 13 | checkpoint_dir=~/models/game24/generators/${generator_id} 14 | 15 | final_id=${generator_id}-${save_verifier_id} 16 | save_dir=~/models/game24/verifiers/${final_id} 17 | export WANDB_NAME=${final_id} 18 | 19 | 20 | 21 | accelerate launch \ 22 | --config_file ./configs/zero1.yaml \ 23 | --main_process_port=20104 \ 24 | train_verifier.py \ 25 | --model_name_or_path ${checkpoint_dir} \ 26 | --data_dir data/game24/model_generation \ 27 | --target_set train \ 28 | --save_dir ${save_dir} \ 29 | --generator_id ${generator_id} \ 30 | --dedup True \ 31 | --per_problem_sampling_solution ${n_solution} \ 32 | --loss_level token \ 33 | --loss_on_llm True \ 34 | --num_train_epoches 10 \ 35 | --eval_steps 1000 \ 36 | --per_device_train_batch_size 16 \ 37 | --per_device_eval_batch_size 64 \ 38 | --gradient_accumulation_steps 1 \ 39 | --gradient_checkpointing True \ 40 | --learning_rate 1e-5 \ 41 | --weight_decay 0 \ 42 | --lr_scheduler_type "linear" \ 43 | --warmup_steps 0 \ 44 | --save_epoches 2 \ 45 | --save_best False \ 46 | --save_total_limit 10 \ 47 | --logging_dir ./wandb \ 48 | --logging_steps 20 \ 49 | --seed 42 50 | 51 | -------------------------------------------------------------------------------- /scripts/game24/train_verifier_process.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export WANDB_API_KEY= 4 | export WANDB_PROJECT=GAME24-Verifier 5 | export WANDB_ENTITY= 6 | 7 | 8 | n_solution=100 9 | generator_id=llama7b-2-ep2 10 | save_verifier_id=n${n_solution}-scahead-mse-lm-token-bsz128-ep10 11 | 12 | 13 | checkpoint_dir=~/models/game24/generators/${generator_id} 14 | 15 | final_id=prm-${generator_id}-${save_verifier_id} 16 | save_dir=~/models/game24/verifiers/${final_id} 17 | export WANDB_NAME=${final_id} 18 | 19 | 20 | 21 | accelerate launch \ 22 | --config_file ./configs/zero1.yaml \ 23 | --main_process_port=20104 \ 24 | train_verifier.py \ 25 | --model_name_or_path ${checkpoint_dir} \ 26 | --data_dir data/game24/model_generation \ 27 | --target_set train \ 28 | --save_dir ${save_dir} \ 29 | --generator_id ${generator_id} \ 30 | --dedup True \ 31 | --per_problem_sampling_solution ${n_solution} \ 32 | --loss_level token \ 33 | --loss_on_llm True \ 34 | --process True \ 35 | --num_train_epoches 10 \ 36 | --eval_steps 1000 \ 37 | --per_device_train_batch_size 16 \ 38 | --per_device_eval_batch_size 64 \ 39 | --gradient_accumulation_steps 1 \ 40 | --gradient_checkpointing True \ 41 | --learning_rate 1e-5 \ 42 | --weight_decay 0 \ 43 | --lr_scheduler_type "linear" \ 44 | --warmup_steps 0 \ 45 | --save_epoches 2 \ 46 | --save_best False \ 47 | --save_total_limit 10 \ 48 | --logging_dir ./wandb \ 49 | --logging_steps 20 \ 50 | --seed 42 51 | 52 | -------------------------------------------------------------------------------- /scripts/gsm8k/eval_step_beam.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | generator_id=llama7b-2-ep2 4 | verifier_id=n100-scahead-mse-lm-token 5 | 6 | n_beam=4 7 | n_sampling_steps=20 8 | 9 | 10 | model_name_or_path=~/models/gsm8k/generators/${generator_id} 11 | verifier_model_name_or_path=~/models/gsm8k/verifiers/${generator_id}-${verifier_id} 12 | 13 | 14 | accelerate launch \ 15 | --main_process_port=20659 \ 16 | eval_generator_by_step.py \ 17 | --model_name_or_path ${model_name_or_path} \ 18 | --verifier_model_name_or_path ${verifier_model_name_or_path} \ 19 | --dataset gsm8k \ 20 | --data_dir data/gsm8k \ 21 | --output_dir eval_results/gsm8k/generator_with_verifier \ 22 | --target_set test \ 23 | --inference_mode beam \ 24 | --batch_size 26 \ 25 | --vs_batch_size 64 \ 26 | --n_beam ${n_beam} \ 27 | --n_sampling_steps ${n_sampling_steps} \ 28 | --max_n_step 10 \ 29 | --max_step_length 100 \ 30 | --dedup_mode 0 \ 31 | --do_sample True \ 32 | --temperature 0.7 \ 33 | --top_k 50 \ 34 | --top_p 1.0 \ 35 | --max_new_tokens 400 \ 36 | --seed 42 37 | -------------------------------------------------------------------------------- /scripts/gsm8k/eval_with_verifier.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | generator_id=llama7b-2-ep2 4 | verifier_id=n100-scahead-mse-lm-token 5 | verifier_model_name_or_path=~/models/gsm8k/verifiers/${generator_id}-${verifier_id} 6 | 7 | 8 | accelerate launch \ 9 | --main_process_port=29510 \ 10 | eval_with_verifier.py \ 11 | --model_name_or_path ${verifier_model_name_or_path} \ 12 | --data_dir data/gsm8k/model_generation \ 13 | --verifier_output_dir eval_results/gsm8k/verifier \ 14 | --generator_metric_dir eval_results/gsm8k/generator_with_verifier \ 15 | --generator_id ${generator_id} \ 16 | --target_set test \ 17 | --batch_size 64 \ 18 | --seed 42 19 | 20 | -------------------------------------------------------------------------------- /scripts/gsm8k/generate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | generator_id=llama7b-2-ep2 4 | n_solutions=100 5 | 6 | model_name_or_path=~/models/gsm8k/generators/${generator_id} 7 | 8 | accelerate launch \ 9 | --main_process_port=20658 \ 10 | generate_paths_and_eval.py \ 11 | --model_name_or_path ${model_name_or_path} \ 12 | --dataset gsm8k \ 13 | --data_dir data/gsm8k \ 14 | --output_dir data/gsm8k/model_generation \ 15 | --metric_output_dir eval_results/gsm8k/generator \ 16 | --target_set train \ 17 | --n_solutions ${n_solutions} \ 18 | --batch_size 16 \ 19 | --do_sample True \ 20 | --temperature 0.7 \ 21 | --top_k 50 \ 22 | --top_p 1.0 \ 23 | --max_new_tokens 400 24 | -------------------------------------------------------------------------------- /scripts/gsm8k/greedy_eval.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | generator_id=llama7b-2-ep2 4 | model_name_or_path=~/models/gsm8k/generators/${generator_id} 5 | 6 | accelerate launch \ 7 | --main_process_port=20659 \ 8 | generate_paths_and_eval.py \ 9 | --model_name_or_path ${model_name_or_path} \ 10 | --dataset gsm8k \ 11 | --data_dir data/gsm8k \ 12 | --output_dir eval_results/gsm8k/generator \ 13 | --metric_output_dir eval_results/gsm8k/generator \ 14 | --target_set test \ 15 | --batch_size 32 \ 16 | --do_sample False \ 17 | --max_new_tokens 400 \ 18 | --seed 42 19 | -------------------------------------------------------------------------------- /scripts/gsm8k/train_generator.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export WANDB_API_KEY= 4 | export WANDB_PROJECT=GSM8K-Generator-Finetune 5 | export WANDB_ENTITY= 6 | 7 | 8 | model_name_or_path=meta-llama/Llama-2-7b-hf 9 | save_generator_id=llama7b-2-ep2 10 | 11 | save_dir=~/models/gsm8k/generators/${save_generator_id}/ 12 | export WANDB_NAME=${save_generator_id} 13 | 14 | 15 | 16 | 17 | accelerate launch \ 18 | --config_file ./configs/zero1.yaml \ 19 | --main_process_port=20650 \ 20 | train_generator.py \ 21 | --model_name_or_path ${model_name_or_path} \ 22 | --dataset gsm8k \ 23 | --data_dir data/gsm8k \ 24 | --target_set train \ 25 | --save_dir ${save_dir} \ 26 | --num_train_epoches 2 \ 27 | --eval_steps 200 \ 28 | --per_device_train_batch_size 16 \ 29 | --per_device_eval_batch_size 4 \ 30 | --gradient_accumulation_steps 1 \ 31 | --gradient_checkpointing True \ 32 | --learning_rate 1e-5 \ 33 | --weight_decay 0 \ 34 | --lr_scheduler_type "linear" \ 35 | --warmup_steps 0 \ 36 | --save_steps 200 \ 37 | --save_best False \ 38 | --save_total_limit 0 \ 39 | --logging_dir ./wandb \ 40 | --logging_steps 8 \ 41 | --seed 42 42 | 43 | -------------------------------------------------------------------------------- /scripts/gsm8k/train_verifier.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export WANDB_API_KEY= 4 | export WANDB_PROJECT=GSM8K-Verifier 5 | export WANDB_ENTITY= 6 | 7 | 8 | n_solution=100 9 | generator_id=llama7b-2-ep2 10 | save_verifier_id=n${n_solution}-scahead-mse-lm-token 11 | 12 | 13 | checkpoint_dir=~/models/gsm8k/generators/${generator_id} 14 | 15 | final_id=${generator_id}-${save_verifier_id} 16 | save_dir=~/models/gsm8k/verifiers/${generator_id}-${experimentID} 17 | export WANDB_NAME=${generator_id}-${experimentID} 18 | 19 | 20 | 21 | accelerate launch \ 22 | --config_file ./configs/zero1.yaml \ 23 | --main_process_port=20104 \ 24 | train_verifier.py \ 25 | --model_name_or_path ${checkpoint_dir} \ 26 | --data_dir data/gsm8k/model_generation \ 27 | --target_set train \ 28 | --save_dir ${save_dir} \ 29 | --generator_id ${generator_id} \ 30 | --dedup True \ 31 | --per_problem_sampling_solution ${n_solution} \ 32 | --loss_level token \ 33 | --loss_on_llm True \ 34 | --num_train_epoches 1 \ 35 | --eval_steps 1000 \ 36 | --per_device_train_batch_size 64 \ 37 | --per_device_eval_batch_size 64 \ 38 | --gradient_accumulation_steps 1 \ 39 | --gradient_checkpointing True \ 40 | --learning_rate 1e-5 \ 41 | --weight_decay 0 \ 42 | --lr_scheduler_type "linear" \ 43 | --warmup_steps 0 \ 44 | --save_epoches 1 \ 45 | --save_best False \ 46 | --save_total_limit 0 \ 47 | --logging_dir ./wandb \ 48 | --logging_steps 20 \ 49 | --seed 42 50 | 51 | -------------------------------------------------------------------------------- /utils/constants.py: -------------------------------------------------------------------------------- 1 | IGNORE_INDEX = -100 2 | DEFAULT_PAD_TOKEN = "" 3 | DEFAULT_BOS_TOKEN = "" 4 | DEFAULT_EOS_TOKEN = "" 5 | DEFAULT_UNK_TOKEN = "" 6 | 7 | LLAMA_EQUALS_TOKENS = set([353, 3892, 29922, 10457]) # _=, )=, =, =- 8 | LLAMA_LEFTMARK_TOKENS = set([3532, 9314]) # <<, _<< 9 | LLAMA_RIGHTMARK_TOKEN = 6778 # >> 10 | LLAMA_NEWLINE_TOKEN = 13 # \n 11 | 12 | -------------------------------------------------------------------------------- /utils/flash_attn_monkey_patch.py: -------------------------------------------------------------------------------- 1 | # refer to https://github.com/lm-sys/FastChat/blob/main/fastchat/train/llama2_flash_attn_monkey_patch.py 2 | 3 | import warnings 4 | from typing import Optional, Tuple 5 | 6 | import torch 7 | from flash_attn import __version__ as flash_attn_version 8 | from flash_attn.bert_padding import pad_input, unpad_input 9 | from flash_attn.flash_attn_interface import ( 10 | flash_attn_func, 11 | flash_attn_varlen_kvpacked_func, 12 | ) 13 | from transformers.models.llama.modeling_llama import ( 14 | LlamaAttention, 15 | LlamaModel, 16 | rotate_half, 17 | ) 18 | from transformers.models.mistral.modeling_mistral import ( 19 | MistralAttention, 20 | MistralModel, 21 | rotate_half, 22 | ) 23 | 24 | 25 | 26 | def apply_rotary_pos_emb(q, k, cos_sin, position_ids): 27 | gather_indices = position_ids[:, :, None, None] # [bsz, seq_len, 1, 1] 28 | gather_indices = gather_indices.repeat( 29 | 1, 1, cos_sin[0].shape[1], cos_sin[0].shape[3] 30 | ) 31 | bsz = gather_indices.shape[0] 32 | cos, sin = ( 33 | torch.gather(x.transpose(1, 2).repeat(bsz, 1, 1, 1), 1, gather_indices) 34 | for x in cos_sin 35 | ) 36 | q, k = ((x * cos) + (rotate_half(x) * sin) for x in (q, k)) 37 | return q, k 38 | 39 | 40 | def forward( 41 | self, 42 | hidden_states: torch.Tensor, 43 | attention_mask: Optional[torch.Tensor] = None, 44 | position_ids: Optional[torch.Tensor] = None, 45 | past_key_value: Optional[Tuple[torch.Tensor]] = None, 46 | output_attentions: bool = False, 47 | use_cache: bool = False, 48 | padding_mask: Optional[torch.Tensor] = None, 49 | ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: 50 | if output_attentions: 51 | warnings.warn( 52 | "Output attentions is not supported for patched `LlamaAttention`, returning `None` instead." 53 | ) 54 | 55 | bsz, q_len, _ = hidden_states.size() 56 | kv_heads = getattr(self, "num_key_value_heads", self.num_heads) 57 | 58 | q, k, v = ( 59 | op(hidden_states).view(bsz, q_len, nh, self.head_dim) 60 | for op, nh in ( 61 | (self.q_proj, self.num_heads), 62 | (self.k_proj, kv_heads), 63 | (self.v_proj, kv_heads), 64 | ) 65 | ) 66 | # shape: (b, s, num_heads, head_dim) 67 | 68 | kv_seq_len = k.shape[1] 69 | past_kv_len = 0 70 | if past_key_value is not None: 71 | past_kv_len = past_key_value[0].shape[2] 72 | kv_seq_len += past_kv_len 73 | 74 | cos_sin = self.rotary_emb(v, seq_len=kv_seq_len) 75 | q, k = apply_rotary_pos_emb(q, k, cos_sin, position_ids) 76 | 77 | if past_key_value is not None: 78 | assert ( 79 | flash_attn_version >= "2.1.0" 80 | ), "past_key_value support requires flash-attn >= 2.1.0" 81 | # reuse k, v 82 | k = torch.cat([past_key_value[0].transpose(1, 2), k], dim=1) 83 | v = torch.cat([past_key_value[1].transpose(1, 2), v], dim=1) 84 | 85 | past_key_value = (k.transpose(1, 2), v.transpose(1, 2)) if use_cache else None 86 | 87 | if attention_mask is None: 88 | output = flash_attn_func(q, k, v, 0.0, softmax_scale=None, causal=True).view( 89 | bsz, q_len, -1 90 | ) 91 | else: 92 | q, indices, cu_q_lens, max_s = unpad_input(q, attention_mask[:, -q_len:]) 93 | # We can skip concat and call unpad twice but seems better to call unpad only once. 94 | kv, _, cu_k_lens, max_k = unpad_input( 95 | torch.stack((k, v), dim=2), attention_mask 96 | ) 97 | output_unpad = flash_attn_varlen_kvpacked_func( 98 | q, 99 | kv, 100 | cu_q_lens, 101 | cu_k_lens, 102 | max_s, 103 | max_k, 104 | 0.0, 105 | softmax_scale=None, 106 | causal=True, 107 | ) 108 | output_unpad = output_unpad.reshape(-1, self.num_heads * self.head_dim) 109 | output = pad_input(output_unpad, indices, bsz, q_len) 110 | 111 | return self.o_proj(output), None, past_key_value 112 | 113 | 114 | # Disable the transformation of the attention mask in LlamaModel as flash attention 115 | # takes a boolean key_padding_mask. Fills in the past kv length for use in forward. 116 | def _prepare_decoder_attention_mask( 117 | self, attention_mask, input_shape, inputs_embeds, past_key_values_length 118 | ): 119 | 120 | if attention_mask is not None and torch.all(attention_mask): 121 | return None # This uses the faster call when training with full samples 122 | 123 | return attention_mask 124 | 125 | 126 | def replace_llama_attn_with_flash_attn(): 127 | cuda_major, cuda_minor = torch.cuda.get_device_capability() 128 | if cuda_major < 8: 129 | warnings.warn( 130 | "Flash attention is only supported on A100 or H100 GPU during training due to head dim > 64 backward." 131 | "ref: https://github.com/HazyResearch/flash-attention/issues/190#issuecomment-1523359593" 132 | ) 133 | 134 | LlamaModel._prepare_decoder_attention_mask = _prepare_decoder_attention_mask 135 | LlamaAttention.forward = forward 136 | 137 | -------------------------------------------------------------------------------- /utils/game24/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FreedomIntelligence/OVM/e0a68c253568666dea61ef6dac910f9baa59ebbb/utils/game24/__init__.py -------------------------------------------------------------------------------- /utils/game24/decoding.py: -------------------------------------------------------------------------------- 1 | import re 2 | import sympy 3 | from typing import List 4 | 5 | 6 | def extract_expression(response: str): 7 | return response.strip().split('\n')[-1].lower().split('the answer is ')[-1].split('=')[0].strip() 8 | 9 | 10 | def extract_expressions(responses: List[str]): 11 | return [extract_expression(response) for response in responses] 12 | 13 | 14 | # refer to https://github.com/princeton-nlp/tree-of-thought-llm/blob/master/src/tot/tasks/game24.py 15 | def get_answer_label(expression: str, question: str): 16 | numbers = re.findall(r'\d+', expression) 17 | problem_numbers = re.findall(r'\d+', question) 18 | if sorted(numbers) != sorted(problem_numbers): 19 | return False 20 | try: 21 | # print(sympy.simplify(expression)) 22 | return sympy.simplify(expression) == 24 23 | except Exception as e: 24 | # print(e) 25 | return False 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /utils/game24/metrics.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import numpy as np 4 | from typing import Optional, List, Dict, Set, Any, Union 5 | import torch.distributed as dist 6 | import re 7 | from utils.game24.decoding import extract_expressions, get_answer_label 8 | 9 | 10 | 11 | class GeneratorAnswerAcc: 12 | def __init__(self, n_data: int): 13 | self.n_data = n_data 14 | 15 | self.world_size = dist.get_world_size() if dist.is_initialized() else 1 16 | 17 | self.corrs = [] 18 | self.gather = False 19 | 20 | @torch.inference_mode(mode=True) 21 | def __call__(self, completions: List[str], questions: List[str]): 22 | expressions = extract_expressions(completions) 23 | 24 | corrs = [float(get_answer_label(expression, question)) == True for expression, question in zip(expressions, questions)] 25 | 26 | self.corrs.append(corrs) 27 | 28 | def get_metric(self, reset=True): 29 | if not self.gather: 30 | if self.world_size != 1: 31 | gathered_corrs = [None] * self.world_size 32 | for obj, container in [ 33 | (self.corrs, gathered_corrs), 34 | ]: 35 | dist.all_gather_object(container, obj) 36 | 37 | flatten_corrs = [] 38 | for corrs_gpus in zip(*gathered_corrs): 39 | for corrs in corrs_gpus: 40 | flatten_corrs.extend(corrs) 41 | 42 | else: 43 | flatten_corrs = [item for sublist in self.corrs for item in sublist] 44 | 45 | self.corrs = flatten_corrs[:self.n_data] 46 | self.gather = True 47 | 48 | acc = (sum(self.corrs) / len(self.corrs)) 49 | 50 | if reset: 51 | self.corrs = [] 52 | self.gather = False 53 | return acc 54 | 55 | 56 | class MultiSamplingAnswerAcc: 57 | def __init__(self, n_data: int = None): 58 | self.n_data = n_data 59 | 60 | self.world_size = dist.get_world_size() if dist.is_initialized() else 1 61 | 62 | self.answers = [] 63 | self.questions = [] 64 | 65 | def start_new_sol_epoch(self): 66 | self.cur_answers = [] 67 | self.cur_questions = [] 68 | 69 | def end_the_sol_epoch(self): 70 | 71 | if self.world_size != 1: 72 | gathered_answers, gathered_questions = tuple([None] * self.world_size for _ in range(2)) 73 | for obj, container in [ 74 | (self.cur_answers, gathered_answers), 75 | (self.cur_questions, gathered_questions), 76 | ]: 77 | dist.all_gather_object(container, obj) 78 | 79 | flatten_answers, flatten_questions = [], [] 80 | for answers_gpus, questions_gpus in zip(zip(*gathered_answers), zip(*gathered_questions)): 81 | for answers, questions in zip(answers_gpus, questions_gpus): 82 | flatten_answers.extend(answers) 83 | flatten_questions.extend(questions) 84 | 85 | else: 86 | flatten_answers, flatten_questions = tuple([item for sublist in container for item in sublist] 87 | for container in [self.cur_answers, self.cur_questions]) 88 | 89 | self.answers.append(flatten_answers[:self.n_data]) 90 | self.questions.append(flatten_questions[:self.n_data]) 91 | 92 | @torch.inference_mode(mode=True) 93 | def __call__(self, completions: List[str], questions: List[str]): 94 | expressions = extract_expressions(completions) 95 | 96 | self.cur_answers.append(expressions) 97 | self.cur_questions.append(questions) 98 | 99 | def get_metric(self, n_solution: int=3, reset=True): 100 | # [n_question, n_solution] 101 | answers = self.answers[:n_solution] 102 | # [n_question] 103 | questions = self.questions[:n_solution][0] 104 | 105 | pass_k = np.mean([is_passk(expressions, question) for expressions, question in zip(answers, questions)]) 106 | acc_majority = np.mean([is_majority(expressions, question) for expressions, question in zip(answers, questions)]) 107 | 108 | 109 | if reset: 110 | self.answers = [] 111 | self.questions = [] 112 | return pass_k, acc_majority 113 | 114 | 115 | def is_passk(expressions, question): 116 | return any(get_answer_label(expression, question) for expression in expressions) 117 | 118 | 119 | def is_majority(expressions, question): 120 | repres = [get_semantics(expr) for expr in expressions] 121 | final_repre = max(repres, key=repres.count) 122 | index = repres.index(final_repre) 123 | return get_answer_label(expressions[index], question) 124 | 125 | 126 | def get_semantics(expression): 127 | numbers = re.findall(r'\d+', expression) 128 | symbols = re.findall(r'[+\-\*\/]', expression) 129 | 130 | try: 131 | value = eval(expression) 132 | except: 133 | value = None 134 | 135 | value = str(value) 136 | if value[-2:] == '.0': 137 | value = value[:-2] 138 | return tuple(sorted(numbers) + sorted(symbols) + [f'value={value}']) 139 | 140 | 141 | -------------------------------------------------------------------------------- /utils/gsm8k/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FreedomIntelligence/OVM/e0a68c253568666dea61ef6dac910f9baa59ebbb/utils/gsm8k/__init__.py -------------------------------------------------------------------------------- /utils/gsm8k/decoding.py: -------------------------------------------------------------------------------- 1 | from contextlib import contextmanager 2 | import signal 3 | import torch 4 | import json 5 | import os 6 | import re 7 | 8 | 9 | # ANS_RE = re.compile(r"#### (\-?[0-9\.\,]+)") 10 | ANS_RE = re.compile(r"The answer is (\-?[0-9\.\,]+)") 11 | INVALID_ANS = "[invalid]" 12 | 13 | 14 | 15 | def extract_answer(completion): 16 | match = ANS_RE.search(completion) 17 | if match: 18 | match_str = match.group(1).strip() 19 | st_str = standardize_value_str(match_str) 20 | try: eval(st_str); return st_str 21 | except: ... 22 | return INVALID_ANS 23 | 24 | def extract_answers(completions): 25 | return [extract_answer(completion) for completion in completions] 26 | 27 | def standardize_value_str(x): 28 | """Standardize numerical values""" 29 | y = x.replace(",", "") 30 | if '.' in y: 31 | y = y.rstrip('0') 32 | if y[-1] == '.': 33 | y = y[:-1] 34 | if not len(y): 35 | return INVALID_ANS 36 | if y[0] == '.': 37 | y = '0' + y 38 | if y[-1] == '%': 39 | y = str(eval(y[:-1]) / 100) 40 | return y.rstrip('.') 41 | 42 | def get_answer_label(response_answer, gt): 43 | if response_answer == INVALID_ANS: 44 | return INVALID_ANS 45 | return response_answer == gt 46 | 47 | 48 | 49 | # taken from 50 | # https://stackoverflow.com/questions/492519/timeout-on-a-function-call 51 | @contextmanager 52 | def timeout(duration, formula): 53 | def timeout_handler(signum, frame): 54 | raise Exception(f"'{formula}': timed out after {duration} seconds") 55 | 56 | signal.signal(signal.SIGALRM, timeout_handler) 57 | signal.alarm(duration) 58 | yield 59 | signal.alarm(0) 60 | 61 | 62 | def eval_with_timeout(formula, max_time=3): 63 | try: 64 | with timeout(max_time, formula): 65 | return round(eval(formula), ndigits=4) 66 | except Exception as e: 67 | signal.alarm(0) 68 | print(f"Warning: Failed to eval {formula}, exception: {e}") 69 | return None 70 | 71 | 72 | # refer to https://github.com/openai/grade-school-math/blob/master/grade_school_math/calculator.py 73 | def use_calculator(sample): 74 | if "<<" not in sample: 75 | return None 76 | 77 | parts = sample.split("<<") 78 | remaining = parts[-1] 79 | if ">>" in remaining: 80 | return None 81 | if "=" not in remaining: 82 | return None 83 | lhs = remaining.split("=")[0] 84 | lhs = lhs.replace(",", "") 85 | if any([x not in "0123456789*+-/.()" for x in lhs]): 86 | return None 87 | ans = eval_with_timeout(lhs) 88 | if remaining[-1] == '-' and ans is not None and ans < 0: 89 | ans = -ans 90 | return ans 91 | 92 | 93 | 94 | 95 | 96 | 97 | -------------------------------------------------------------------------------- /utils/gsm8k/metrics.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import numpy as np 4 | from typing import Optional, List, Dict, Set, Any, Union 5 | import torch.distributed as dist 6 | from utils.gsm8k.decoding import INVALID_ANS, extract_answers, get_answer_label 7 | 8 | 9 | 10 | class GeneratorAnswerAcc: 11 | def __init__(self, n_data: int): 12 | self.n_data = n_data 13 | 14 | self.world_size = dist.get_world_size() if dist.is_initialized() else 1 15 | 16 | self.corrs = [] 17 | self.gather = False 18 | 19 | @torch.inference_mode(mode=True) 20 | def __call__(self, completions: List[str], gts: List[str]): 21 | answers = extract_answers(completions) 22 | 23 | corrs = [float(get_answer_label(answer, gt) == True) for answer, gt in zip(answers, gts)] 24 | 25 | self.corrs.append(corrs) 26 | 27 | def get_metric(self, reset=True): 28 | if not self.gather: 29 | if self.world_size != 1: 30 | gathered_corrs = [None] * self.world_size 31 | for obj, container in [ 32 | (self.corrs, gathered_corrs), 33 | ]: 34 | dist.all_gather_object(container, obj) 35 | 36 | flatten_corrs = [] 37 | for corrs_gpus in zip(*gathered_corrs): 38 | for corrs in corrs_gpus: 39 | flatten_corrs.extend(corrs) 40 | 41 | else: 42 | flatten_corrs = [item for sublist in self.corrs for item in sublist] 43 | 44 | self.corrs = flatten_corrs[:self.n_data] 45 | self.gather = True 46 | 47 | acc = (sum(self.corrs) / len(self.corrs)) 48 | 49 | if reset: 50 | self.corrs = [] 51 | self.gather = False 52 | return acc 53 | 54 | 55 | class MultiSamplingAnswerAcc: 56 | def __init__(self, n_data: int = None): 57 | self.n_data = n_data 58 | 59 | self.world_size = dist.get_world_size() if dist.is_initialized() else 1 60 | 61 | self.answers = [] 62 | self.gts = [] 63 | 64 | def start_new_sol_epoch(self): 65 | self.cur_answers = [] 66 | self.cur_gts = [] 67 | 68 | def end_the_sol_epoch(self): 69 | 70 | if self.world_size != 1: 71 | gathered_answers, gathered_gts = tuple([None] * self.world_size for _ in range(2)) 72 | for obj, container in [ 73 | (self.cur_answers, gathered_answers), 74 | (self.cur_gts, gathered_gts), 75 | ]: 76 | dist.all_gather_object(container, obj) 77 | 78 | flatten_answers, flatten_gts = [], [] 79 | for answers_gpus, gts_gpus in zip(zip(*gathered_answers), zip(*gathered_gts)): 80 | for answers, gts in zip(answers_gpus, gts_gpus): 81 | flatten_answers.extend(answers) 82 | flatten_gts.extend(gts) 83 | 84 | else: 85 | flatten_answers, flatten_gts = tuple([item for sublist in container for item in sublist] 86 | for container in [self.cur_answers, self.cur_gts]) 87 | 88 | self.answers.append(flatten_answers[:self.n_data]) 89 | self.gts.append(flatten_gts[:self.n_data]) 90 | 91 | 92 | @torch.inference_mode(mode=True) 93 | def __call__(self, completions: List[str], gts: List[str]): 94 | answers = extract_answers(completions) 95 | 96 | answers = [float(a) if a != INVALID_ANS else float('nan') for a in answers] 97 | gts = [float(gt) for gt in gts] 98 | 99 | self.cur_answers.append(answers) 100 | self.cur_gts.append(gts) 101 | 102 | 103 | def get_metric(self, n_solution: int=3, reset=True): 104 | 105 | assert all(x == self.gts[0] for x in self.gts) 106 | 107 | # [n_question] 108 | gts = np.array(self.gts[0]) 109 | # [n_question, n_solution] 110 | answers = np.stack(self.answers[:n_solution], axis=1) 111 | # print('answers:', answers.shape) 112 | 113 | pass_k = (answers == gts.reshape((-1, 1))).any(1).mean(0) 114 | acc_majority = np.mean([is_majority(a, gt, ignore=float('nan')) for a, gt in zip(answers, gts)]) 115 | 116 | if reset: 117 | self.gts = [] 118 | self.answers = [] 119 | return pass_k, acc_majority 120 | 121 | 122 | 123 | def is_passk(answers, gt): 124 | return gt in answers 125 | 126 | def is_majority(answers, gt, ignore = INVALID_ANS): 127 | filter_answers = list(filter(lambda x: x!=ignore, answers)) 128 | final_answer = max(filter_answers, key=filter_answers.count) 129 | return final_answer == gt 130 | 131 | 132 | -------------------------------------------------------------------------------- /utils/optim.py: -------------------------------------------------------------------------------- 1 | from transformers import AdamW 2 | from transformers import get_scheduler 3 | import transformers 4 | from typing import Optional, List, Dict, Set, Any, Union 5 | from dataclasses import dataclass 6 | import os 7 | 8 | def get_optimizers(model: transformers.AutoModelForCausalLM, training_args: dataclass) -> Dict: 9 | no_decay = ["bias", "LayerNorm.weight"] 10 | optimizer_grouped_parameters = [ 11 | { 12 | "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 13 | "weight_decay": training_args.weight_decay, 14 | }, 15 | { 16 | "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 17 | "weight_decay": 0.0, 18 | }, 19 | ] 20 | 21 | optim = AdamW( 22 | optimizer_grouped_parameters, 23 | lr=training_args.learning_rate, 24 | # weight_decay=training_args.weight_decay 25 | ) 26 | lr_scheduler = get_scheduler( 27 | training_args.lr_scheduler_type, 28 | optimizer=optim, 29 | # num_warmup_steps=training_args.num_updating_warmup_steps_aggr_devices, 30 | # num_training_steps=training_args.num_updating_steps_aggr_devices, 31 | num_warmup_steps=training_args.num_updating_warmup_steps, 32 | num_training_steps=training_args.num_updating_steps, 33 | ) 34 | return optim, lr_scheduler 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /utils/states.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os 3 | import json 4 | from dataclasses import dataclass 5 | import random 6 | import math 7 | import numpy as np 8 | from accelerate import Accelerator 9 | 10 | 11 | def set_deepspeed_config(accelerator: Accelerator, training_args: dataclass): 12 | world_size = int(os.environ.get("WORLD_SIZE", 1)) 13 | accelerator.state.deepspeed_plugin.deepspeed_config['train_micro_batch_size_per_gpu'] = training_args.per_device_train_batch_size 14 | accelerator.state.deepspeed_plugin.deepspeed_config['train_batch_size'] = training_args.per_device_train_batch_size * world_size * accelerator.gradient_accumulation_steps 15 | 16 | 17 | def set_training_states(data_module: dict, training_args: dataclass): 18 | set_num_steps_per_epoch(data_module, training_args) 19 | set_num_training_steps(training_args) 20 | set_num_updating_steps(training_args) 21 | set_num_eval_steps(training_args) 22 | set_per_eval_steps(training_args) 23 | set_num_warmup_steps(training_args) 24 | 25 | set_num_logging_steps(training_args) 26 | set_per_save_steps(training_args) 27 | 28 | print(f"+ [Training States] There are {training_args.num_training_steps} steps in total.") 29 | 30 | 31 | def set_num_steps_per_epoch(data_module: dict, training_args: dataclass): 32 | num_devices = int(os.environ.get("WORLD_SIZE", 1)) 33 | 34 | len_train_set_per_device = math.ceil(len(data_module["train_dataset"]) / num_devices) 35 | num_train_steps_per_device = math.ceil(len_train_set_per_device / training_args.per_device_train_batch_size) 36 | num_updating_steps_per_epoch = num_train_steps_per_device // training_args.gradient_accumulation_steps 37 | 38 | len_eval_set_per_device = math.ceil(len(data_module["val_dataset"]) / num_devices) if data_module["val_dataset"] is not None else None 39 | num_eval_steps_per_device = math.ceil(len_eval_set_per_device / training_args.per_device_eval_batch_size) if data_module["val_dataset"] is not None else None 40 | 41 | training_args.num_training_steps_per_epoch = num_train_steps_per_device 42 | training_args.num_updating_steps_per_epoch = num_updating_steps_per_epoch 43 | training_args.num_eval_steps_per_epoch = num_eval_steps_per_device 44 | 45 | def set_num_training_steps(training_args: dataclass): 46 | if training_args.max_steps != -1: 47 | num_training_steps = training_args.max_steps 48 | else: 49 | assert training_args.num_train_epoches != -1 50 | num_training_steps = training_args.num_training_steps_per_epoch * training_args.num_train_epoches 51 | num_training_steps_aggr_devices = num_training_steps * int(os.environ.get("WORLD_SIZE", 1)) 52 | 53 | training_args.num_training_steps = num_training_steps 54 | training_args.num_training_steps_aggr_devices = num_training_steps_aggr_devices 55 | 56 | def set_num_updating_steps(training_args: dataclass): 57 | num_updating_steps = training_args.num_training_steps // training_args.gradient_accumulation_steps 58 | num_updating_steps_aggr_devices = num_updating_steps * int(os.environ.get("WORLD_SIZE", 1)) 59 | 60 | training_args.num_updating_steps = num_updating_steps 61 | training_args.num_updating_steps_aggr_devices = num_updating_steps_aggr_devices 62 | 63 | 64 | def set_num_eval_steps(training_args: dataclass): 65 | training_args.num_eval_steps = training_args.num_eval_steps_per_epoch 66 | 67 | def set_per_eval_steps(training_args: dataclass): 68 | if training_args.eval_steps != -1: 69 | per_eval_steps = training_args.eval_steps 70 | else: 71 | assert training_args.eval_epoches != -1 72 | per_eval_steps = training_args.num_training_steps_per_epoch * training_args.eval_epoches 73 | 74 | training_args.per_eval_steps = per_eval_steps 75 | 76 | def set_num_warmup_steps(training_args: dataclass): 77 | # if training_args.warmup_steps != -1: 78 | # num_warmup_steps_forward = training_args.warmup_steps 79 | # else: 80 | # assert training_args.warmup_ratio != -1 81 | # num_warmup_steps_forward = int(training_args.num_training_steps * training_args.warmup_ratio) 82 | # num_updating_warmup_steps = num_warmup_steps_forward // training_args.gradient_accumulation_steps 83 | # num_updating_warmup_steps_aggr_devices = num_updating_warmup_steps * int(os.environ.get("WORLD_SIZE", 1)) 84 | if training_args.warmup_steps != -1: 85 | num_updating_warmup_steps = training_args.warmup_steps 86 | else: 87 | assert training_args.warmup_ratio != -1 88 | num_updating_warmup_steps = int(training_args.num_updating_steps * training_args.warmup_ratio) 89 | num_updating_warmup_steps_aggr_devices = num_updating_warmup_steps * int(os.environ.get("WORLD_SIZE", 1)) 90 | 91 | training_args.num_updating_warmup_steps = num_updating_warmup_steps 92 | training_args.num_updating_warmup_steps_aggr_devices = num_updating_warmup_steps_aggr_devices 93 | 94 | def set_num_logging_steps(training_args: dataclass): 95 | if training_args.logging_steps != -1: 96 | num_logging_steps = training_args.logging_steps 97 | else: 98 | assert training_args.logging_epoches != -1 99 | num_logging_steps = training_args.num_training_steps_per_epoch * training_args.logging_epoches 100 | 101 | training_args.num_logging_steps = num_logging_steps 102 | 103 | def set_per_save_steps(training_args: dataclass): 104 | if training_args.save_steps != -1: 105 | per_save_steps = training_args.save_steps 106 | else: 107 | assert training_args.save_epoches != -1 108 | per_save_steps = training_args.num_training_steps_per_epoch * training_args.save_epoches 109 | 110 | training_args.per_save_steps = per_save_steps 111 | 112 | 113 | def set_random_seed(seed: int): 114 | random.seed(seed) 115 | np.random.seed(seed) 116 | torch.manual_seed(seed) 117 | torch.cuda.manual_seed_all(seed) 118 | torch.cuda.manual_seed(seed) 119 | 120 | 121 | --------------------------------------------------------------------------------