├── README.md
├── configs
├── zero1.yaml
├── zero1_offload.yaml
├── zero2.yaml
└── zero2_offload.yaml
├── data
├── game24
│ ├── 24.csv
│ ├── model_generation
│ │ ├── mid
│ │ │ └── responses_n300_llama7b-2-ep2_samples.jsonl
│ │ └── train
│ │ │ ├── responses_n100_llama7b-2-ep2_process_samples.jsonl
│ │ │ └── responses_n100_llama7b-2-ep2_samples.jsonl
│ └── train.jsonl
└── gsm8k
│ ├── model_generation
│ ├── test
│ │ ├── responses_n300_mistral7b-ep2_samples.jsonl
│ │ └── responses_n500_llama7b-2-ep2_samples.jsonl
│ └── train
│ │ ├── responses_n100_llama7b-2-ep2_samples.jsonl
│ │ └── responses_n100_mistral7b-ep2_samples.jsonl
│ ├── test.jsonl
│ └── train.jsonl
├── eval_generator_by_step.py
├── eval_results
├── game24
│ ├── generator
│ │ └── mid
│ │ │ ├── metrics_llama7b-2-ep2.json
│ │ │ └── responses_llama7b-2-ep2_samples.jsonl
│ ├── generator_with_verifier
│ │ └── mid
│ │ │ ├── llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_43.json
│ │ │ ├── metrics_g(llama7b-2-ep2)_v(llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1).csv
│ │ │ └── responses_%step20_llama7b-2-ep2_beaml5_samples.jsonl
│ │ │ ├── llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10
│ │ │ ├── metrics_%step100_llama7b-2-ep2_beaml1.json
│ │ │ ├── metrics_%step100_llama7b-2-ep2_beaml10.json
│ │ │ ├── metrics_%step100_llama7b-2-ep2_beaml10_41.json
│ │ │ ├── metrics_%step100_llama7b-2-ep2_beaml10_43.json
│ │ │ ├── metrics_%step100_llama7b-2-ep2_beaml1_41.json
│ │ │ ├── metrics_%step100_llama7b-2-ep2_beaml1_43.json
│ │ │ ├── metrics_%step100_llama7b-2-ep2_beaml2.json
│ │ │ ├── metrics_%step100_llama7b-2-ep2_beaml20.json
│ │ │ ├── metrics_%step100_llama7b-2-ep2_beaml20_41.json
│ │ │ ├── metrics_%step100_llama7b-2-ep2_beaml20_43.json
│ │ │ ├── metrics_%step100_llama7b-2-ep2_beaml25.json
│ │ │ ├── metrics_%step100_llama7b-2-ep2_beaml25_41.json
│ │ │ ├── metrics_%step100_llama7b-2-ep2_beaml25_43.json
│ │ │ ├── metrics_%step100_llama7b-2-ep2_beaml2_41.json
│ │ │ ├── metrics_%step100_llama7b-2-ep2_beaml2_43.json
│ │ │ ├── metrics_%step100_llama7b-2-ep2_beaml4.json
│ │ │ ├── metrics_%step100_llama7b-2-ep2_beaml4_41.json
│ │ │ ├── metrics_%step100_llama7b-2-ep2_beaml4_43.json
│ │ │ ├── metrics_%step100_llama7b-2-ep2_beaml5.json
│ │ │ ├── metrics_%step100_llama7b-2-ep2_beaml50.json
│ │ │ ├── metrics_%step100_llama7b-2-ep2_beaml50_41.json
│ │ │ ├── metrics_%step100_llama7b-2-ep2_beaml50_43.json
│ │ │ ├── metrics_%step100_llama7b-2-ep2_beaml5_41.json
│ │ │ ├── metrics_%step100_llama7b-2-ep2_beaml5_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_43.json
│ │ │ ├── metrics_%step50_llama7b-2-ep2_beaml1.json
│ │ │ ├── metrics_%step50_llama7b-2-ep2_beaml10.json
│ │ │ ├── metrics_%step50_llama7b-2-ep2_beaml10_41.json
│ │ │ ├── metrics_%step50_llama7b-2-ep2_beaml10_43.json
│ │ │ ├── metrics_%step50_llama7b-2-ep2_beaml1_41.json
│ │ │ ├── metrics_%step50_llama7b-2-ep2_beaml1_43.json
│ │ │ ├── metrics_%step50_llama7b-2-ep2_beaml2.json
│ │ │ ├── metrics_%step50_llama7b-2-ep2_beaml25.json
│ │ │ ├── metrics_%step50_llama7b-2-ep2_beaml25_41.json
│ │ │ ├── metrics_%step50_llama7b-2-ep2_beaml25_43.json
│ │ │ ├── metrics_%step50_llama7b-2-ep2_beaml2_41.json
│ │ │ ├── metrics_%step50_llama7b-2-ep2_beaml2_43.json
│ │ │ ├── metrics_%step50_llama7b-2-ep2_beaml5.json
│ │ │ ├── metrics_%step50_llama7b-2-ep2_beaml5_41.json
│ │ │ ├── metrics_%step50_llama7b-2-ep2_beaml5_43.json
│ │ │ ├── metrics_%step80_llama7b-2-ep2_beaml1.json
│ │ │ ├── metrics_%step80_llama7b-2-ep2_beaml10.json
│ │ │ ├── metrics_%step80_llama7b-2-ep2_beaml10_41.json
│ │ │ ├── metrics_%step80_llama7b-2-ep2_beaml10_43.json
│ │ │ ├── metrics_%step80_llama7b-2-ep2_beaml1_41.json
│ │ │ ├── metrics_%step80_llama7b-2-ep2_beaml1_43.json
│ │ │ ├── metrics_%step80_llama7b-2-ep2_beaml2.json
│ │ │ ├── metrics_%step80_llama7b-2-ep2_beaml20.json
│ │ │ ├── metrics_%step80_llama7b-2-ep2_beaml20_41.json
│ │ │ ├── metrics_%step80_llama7b-2-ep2_beaml20_43.json
│ │ │ ├── metrics_%step80_llama7b-2-ep2_beaml2_41.json
│ │ │ ├── metrics_%step80_llama7b-2-ep2_beaml2_43.json
│ │ │ ├── metrics_%step80_llama7b-2-ep2_beaml4.json
│ │ │ ├── metrics_%step80_llama7b-2-ep2_beaml40.json
│ │ │ ├── metrics_%step80_llama7b-2-ep2_beaml40_41.json
│ │ │ ├── metrics_%step80_llama7b-2-ep2_beaml40_43.json
│ │ │ ├── metrics_%step80_llama7b-2-ep2_beaml4_41.json
│ │ │ ├── metrics_%step80_llama7b-2-ep2_beaml4_43.json
│ │ │ ├── metrics_%step80_llama7b-2-ep2_beaml5.json
│ │ │ ├── metrics_%step80_llama7b-2-ep2_beaml5_41.json
│ │ │ ├── metrics_%step80_llama7b-2-ep2_beaml5_43.json
│ │ │ ├── metrics_g(llama7b-2-ep2)_v(llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10).csv
│ │ │ └── responses_%step20_llama7b-2-ep2_beaml5_samples.jsonl
│ │ │ ├── llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_43.json
│ │ │ ├── metrics_g(llama7b-2-ep2)_v(llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2).csv
│ │ │ └── responses_%step20_llama7b-2-ep2_beaml5_samples.jsonl
│ │ │ ├── llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_43.json
│ │ │ ├── metrics_g(llama7b-2-ep2)_v(llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4).csv
│ │ │ └── responses_%step20_llama7b-2-ep2_beaml5_samples.jsonl
│ │ │ ├── llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_43.json
│ │ │ ├── metrics_g(llama7b-2-ep2)_v(llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6).csv
│ │ │ └── responses_%step20_llama7b-2-ep2_beaml5_samples.jsonl
│ │ │ ├── llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_43.json
│ │ │ ├── metrics_g(llama7b-2-ep2)_v(llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8).csv
│ │ │ └── responses_%step20_llama7b-2-ep2_beaml5_samples.jsonl
│ │ │ ├── prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_43.json
│ │ │ ├── metrics_g(llama7b-2-ep2)_v(easyprocess2-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1).csv
│ │ │ └── responses_%step20_llama7b-2-ep2_beaml5_samples.jsonl
│ │ │ ├── prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_43.json
│ │ │ ├── metrics_g(llama7b-2-ep2)_v(easyprocess2-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10).csv
│ │ │ └── responses_%step20_llama7b-2-ep2_beaml5_samples.jsonl
│ │ │ ├── prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_43.json
│ │ │ ├── metrics_g(llama7b-2-ep2)_v(easyprocess2-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2).csv
│ │ │ └── responses_%step20_llama7b-2-ep2_beaml5_samples.jsonl
│ │ │ ├── prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_43.json
│ │ │ ├── metrics_g(llama7b-2-ep2)_v(easyprocess2-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4).csv
│ │ │ └── responses_%step20_llama7b-2-ep2_beaml5_samples.jsonl
│ │ │ ├── prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_43.json
│ │ │ ├── metrics_g(llama7b-2-ep2)_v(easyprocess2-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6).csv
│ │ │ └── responses_%step20_llama7b-2-ep2_beaml5_samples.jsonl
│ │ │ ├── prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_43.json
│ │ │ ├── metrics_g(llama7b-2-ep2)_v(easyprocess2-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8).csv
│ │ │ └── responses_%step20_llama7b-2-ep2_beaml5_samples.jsonl
│ │ │ ├── prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1
│ │ │ ├── metrics_%step20_llama7b-2-ep2-all_beaml1.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2-all_beaml1_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_43.json
│ │ │ ├── metrics_g(llama7b-2-ep2)_v(easyprocess2progress-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1).csv
│ │ │ └── responses_%step20_llama7b-2-ep2_beaml5_samples.jsonl
│ │ │ ├── prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_43.json
│ │ │ ├── metrics_g(llama7b-2-ep2)_v(easyprocess2progress-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10).csv
│ │ │ └── responses_%step20_llama7b-2-ep2_beaml5_samples.jsonl
│ │ │ ├── prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_43.json
│ │ │ ├── metrics_g(llama7b-2-ep2)_v(easyprocess2progress-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2).csv
│ │ │ └── responses_%step20_llama7b-2-ep2_beaml5_samples.jsonl
│ │ │ ├── prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_43.json
│ │ │ ├── metrics_g(llama7b-2-ep2)_v(easyprocess2progress-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4).csv
│ │ │ └── responses_%step20_llama7b-2-ep2_beaml5_samples.jsonl
│ │ │ ├── prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_43.json
│ │ │ ├── metrics_g(llama7b-2-ep2)_v(easyprocess2progress-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6).csv
│ │ │ └── responses_%step20_llama7b-2-ep2_beaml5_samples.jsonl
│ │ │ └── prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml10_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml1_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml2_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml4_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beaml5_43.json
│ │ │ ├── metrics_g(llama7b-2-ep2)_v(easyprocess2progress-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8).csv
│ │ │ └── responses_%step20_llama7b-2-ep2_beaml5_samples.jsonl
│ └── verifier
│ │ └── mid
│ │ ├── metrics_v(llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1)_g(llama7b-2-ep2).json
│ │ ├── metrics_v(llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10)_g(llama7b-2-ep2).json
│ │ ├── metrics_v(llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2)_g(llama7b-2-ep2).json
│ │ ├── metrics_v(llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4)_g(llama7b-2-ep2).json
│ │ ├── metrics_v(llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6)_g(llama7b-2-ep2).json
│ │ ├── metrics_v(llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8)_g(llama7b-2-ep2).json
│ │ └── responses_v(llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10)_g(llama7b-2-ep2)_samples.jsonl
└── gsm8k
│ ├── generator
│ ├── test
│ │ ├── all_metrics_n100_llama7b-2-ep2.csv
│ │ └── all_metrics_n300_mistral7b-ep2.csv
│ └── train
│ │ ├── all_metrics_n100_llama7b-2-ep2.csv
│ │ └── all_metrics_n100_mistral7b-ep2.csv
│ ├── generator_with_verifier
│ ├── test
│ │ ├── llama7b-2-ep2-n100-scahead-mse-lm-token
│ │ │ ├── metrics_%step100_llama7b-2-ep2_beam1.json
│ │ │ ├── metrics_%step100_llama7b-2-ep2_beam10.json
│ │ │ ├── metrics_%step100_llama7b-2-ep2_beam10_41.json
│ │ │ ├── metrics_%step100_llama7b-2-ep2_beam10_43.json
│ │ │ ├── metrics_%step100_llama7b-2-ep2_beam1_41.json
│ │ │ ├── metrics_%step100_llama7b-2-ep2_beam1_43.json
│ │ │ ├── metrics_%step100_llama7b-2-ep2_beam2.json
│ │ │ ├── metrics_%step100_llama7b-2-ep2_beam20.json
│ │ │ ├── metrics_%step100_llama7b-2-ep2_beam20_41.json
│ │ │ ├── metrics_%step100_llama7b-2-ep2_beam20_43.json
│ │ │ ├── metrics_%step100_llama7b-2-ep2_beam25.json
│ │ │ ├── metrics_%step100_llama7b-2-ep2_beam25_41.json
│ │ │ ├── metrics_%step100_llama7b-2-ep2_beam25_43.json
│ │ │ ├── metrics_%step100_llama7b-2-ep2_beam2_41.json
│ │ │ ├── metrics_%step100_llama7b-2-ep2_beam2_43.json
│ │ │ ├── metrics_%step100_llama7b-2-ep2_beam4.json
│ │ │ ├── metrics_%step100_llama7b-2-ep2_beam4_41.json
│ │ │ ├── metrics_%step100_llama7b-2-ep2_beam4_43.json
│ │ │ ├── metrics_%step100_llama7b-2-ep2_beam5.json
│ │ │ ├── metrics_%step100_llama7b-2-ep2_beam50.json
│ │ │ ├── metrics_%step100_llama7b-2-ep2_beam50_41.json
│ │ │ ├── metrics_%step100_llama7b-2-ep2_beam50_43.json
│ │ │ ├── metrics_%step100_llama7b-2-ep2_beam5_41.json
│ │ │ ├── metrics_%step100_llama7b-2-ep2_beam5_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beam1.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beam10.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beam10_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beam10_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beam1_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beam1_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beam2.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beam2_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beam2_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beam4.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beam4_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beam4_43.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beam5.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beam5_41.json
│ │ │ ├── metrics_%step20_llama7b-2-ep2_beam5_43.json
│ │ │ ├── responses_%step100_llama7b-2-ep2_beam50_samples.jsonl
│ │ │ └── responses_%step100_llama7b-2-ep2_beam5_samples.jsonl
│ │ └── mistral7b-ep2-n100-scahead-mse-lm-token
│ │ │ ├── metrics_%step100_mistral7b-ep2_beam1.json
│ │ │ ├── metrics_%step100_mistral7b-ep2_beam10.json
│ │ │ ├── metrics_%step100_mistral7b-ep2_beam10_41.json
│ │ │ ├── metrics_%step100_mistral7b-ep2_beam10_43.json
│ │ │ ├── metrics_%step100_mistral7b-ep2_beam1_41.json
│ │ │ ├── metrics_%step100_mistral7b-ep2_beam1_43.json
│ │ │ ├── metrics_%step100_mistral7b-ep2_beam2.json
│ │ │ ├── metrics_%step100_mistral7b-ep2_beam20.json
│ │ │ ├── metrics_%step100_mistral7b-ep2_beam20_41.json
│ │ │ ├── metrics_%step100_mistral7b-ep2_beam20_43.json
│ │ │ ├── metrics_%step100_mistral7b-ep2_beam25.json
│ │ │ ├── metrics_%step100_mistral7b-ep2_beam25_41.json
│ │ │ ├── metrics_%step100_mistral7b-ep2_beam25_43.json
│ │ │ ├── metrics_%step100_mistral7b-ep2_beam2_41.json
│ │ │ ├── metrics_%step100_mistral7b-ep2_beam2_43.json
│ │ │ ├── metrics_%step100_mistral7b-ep2_beam5.json
│ │ │ ├── metrics_%step100_mistral7b-ep2_beam50.json
│ │ │ ├── metrics_%step100_mistral7b-ep2_beam50_41.json
│ │ │ ├── metrics_%step100_mistral7b-ep2_beam50_43.json
│ │ │ ├── metrics_%step100_mistral7b-ep2_beam5_41.json
│ │ │ ├── metrics_%step100_mistral7b-ep2_beam5_43.json
│ │ │ ├── metrics_%step20_mistral7b-ep2_beam1.json
│ │ │ ├── metrics_%step20_mistral7b-ep2_beam10.json
│ │ │ ├── metrics_%step20_mistral7b-ep2_beam10_41.json
│ │ │ ├── metrics_%step20_mistral7b-ep2_beam10_43.json
│ │ │ ├── metrics_%step20_mistral7b-ep2_beam1_41.json
│ │ │ ├── metrics_%step20_mistral7b-ep2_beam1_43.json
│ │ │ ├── metrics_%step20_mistral7b-ep2_beam2.json
│ │ │ ├── metrics_%step20_mistral7b-ep2_beam2_41.json
│ │ │ ├── metrics_%step20_mistral7b-ep2_beam2_43.json
│ │ │ ├── metrics_%step20_mistral7b-ep2_beam4.json
│ │ │ ├── metrics_%step20_mistral7b-ep2_beam4_41.json
│ │ │ ├── metrics_%step20_mistral7b-ep2_beam4_43.json
│ │ │ ├── metrics_%step20_mistral7b-ep2_beam5.json
│ │ │ ├── metrics_%step20_mistral7b-ep2_beam5_41.json
│ │ │ ├── metrics_%step20_mistral7b-ep2_beam5_43.json
│ │ │ ├── metrics_g(mistral7b-ep2)_v(mistral7b-ep2-n100-scahead-mse-lm-token).csv
│ │ │ ├── responses_%step100_mistral7b-ep2_beam50_samples.jsonl
│ │ │ └── responses_%step100_mistral7b-ep2_beam5_samples.jsonl
│ └── train
│ │ └── llama7b-2-ep2-n100-scahead-mse-lm-token
│ │ └── metrics_g(llama7b-2-ep2)_v(llama7b-2-ep2-n100-scahead-mse-lm-token).csv
│ └── verifier
│ ├── test
│ ├── metrics_v(llama7b-2-ep2-n100-scahead-mse-lm-token)_g(llama7b-2-ep2).json
│ ├── metrics_v(mistral7b-ep2-n100-scahead-mse-lm-token)_g(mistral7b-ep2).json
│ ├── responses_v(llama7b-2-ep2-n100-scahead-mse-lm-token)_g(llama7b-2-ep2)_samples.jsonl
│ └── responses_v(mistral7b-ep2-n100-scahead-mse-lm-token)_g(mistral7b-ep2)_samples.jsonl
│ └── train
│ ├── metrics_v(llama7b-2-ep2-n100-scahead-mse-lm-token)_g(llama7b-2-ep2).json
│ └── responses_v(llama7b-2-ep2-n100-scahead-mse-lm-token)_g(llama7b-2-ep2)_samples.jsonl
├── eval_with_verifier.py
├── generate_paths_and_eval.py
├── requirements.txt
├── scripts
├── game24
│ ├── eval_step_beam.sh
│ ├── eval_with_verifier.sh
│ ├── generate.sh
│ ├── greedy_eval.sh
│ ├── train_generator.sh
│ ├── train_verifier.sh
│ └── train_verifier_process.sh
└── gsm8k
│ ├── eval_step_beam.sh
│ ├── eval_with_verifier.sh
│ ├── generate.sh
│ ├── greedy_eval.sh
│ ├── train_generator.sh
│ └── train_verifier.sh
├── train_generator.py
├── train_verifier.py
└── utils
├── cached_models.py
├── constants.py
├── datasets.py
├── flash_attn_monkey_patch.py
├── game24
├── __init__.py
├── datasets.py
├── decoding.py
└── metrics.py
├── gsm8k
├── __init__.py
├── datasets.py
├── decoding.py
└── metrics.py
├── metrics.py
├── models.py
├── optim.py
├── sampling.py
├── states.py
└── verifier_models.py
/README.md:
--------------------------------------------------------------------------------
1 | # OVM, Outcome-supervised Value Models for Planning in Mathematical Reasoning
2 |
3 |
4 | Code, metrics, and models for the paper [Outcome-supervised Verifiers for Planning in Mathematical Reasoning](https://arxiv.org/pdf/2311.09724.pdf)
5 |
6 |
7 | The key technical implementations (`utils/sampling.py`):
8 |
9 | 1. **Value-guided beam search**: step-level beam search guided by a value model
10 |
11 | 2. **Batch generation with a calculator, using cache** (2-3 times faster than a naive implementation)
12 |
13 |
14 | ## Models and Data
15 |
16 | | Model | Dataset | Link |
17 | |----------------------|--------------|--------------------------|
18 | | OVM-Llama2-7B | GSM8K | [parameters](https://huggingface.co/FreedomIntelligence/OVM-llama2-7b) |
19 | | OVM-Mistral-7B | GSM8K | [parameters](https://huggingface.co/FreedomIntelligence/OVM-Mistral-7b) |
20 |
21 | See the training data of our value models (generated by the generators) in [dataset](https://huggingface.co/datasets/FreedomIntelligence/OVM-dataset)
22 |
23 | See the training data for Process Reward Models in GSM8K in [dataset](https://huggingface.co/datasets/FreedomIntelligence/OVM-process)
24 |
25 | ## Notes on the code
26 |
27 |
28 | 1. Directories
29 | - `configs`: for model training with `accelerate`
30 | - `data`: benchmark data, and generator-created data for training the value model
31 | - `eval_results`: metrics and responses
32 | - `generator`: generator-only (greedy, self-consistency, or pass@k)
33 | - `verifier`: ORM accuracy
34 | - `generator_with_verifier`: guided beam search, i.e. OVM and PRM
35 | - `scripts`: scripts for training and inference
36 | - `utils`: functions and classes
37 |
38 |
39 | 2. target_set
40 | - GSM8K: there are `train` and `test`, which correspond to the training set and the test set, respectively
41 | - Game of 24: there are `train` and `mid`
42 | - `train`: the first 900 problems
43 | - `mid`: problems with indices 901-1000
44 |
45 | 3. Scripts for GSM8K and Game of 24 are similar. For simplicity, we use GSM8K as the example below. You can run the same pipeline on Game of 24 by replacing `gsm8k` with `game24`
46 |
47 |
48 |
49 | ## Training
50 |
51 | ### Train the generator
52 |
53 | Training data for generator:
54 | - GSM8K: `data/gsm8k/train.jsonl`, from [OpenAI GSM8K](https://github.com/openai/grade-school-math/blob/master/grade_school_math/data/train.jsonl)
55 | - Game of 24: `data/game24/train.jsonl`, the first 900 problems in `data/game24/24.csv` (from [ToT](https://github.com/princeton-nlp/tree-of-thought-llm/blob/master/src/tot/data/24/24.csv)) with enumerated solutions
56 |
57 | To run the script `train_generator.sh` (under `scripts/gsm8k` or `scripts/game24`), you should first set `WANDB_API_KEY`, `WANDB_ENTITY`, `model_name_or_path`, `save_dir`. The generator is named by `save_generator_id`
58 |
59 | ```bash
60 | cd OVM
61 | bash scripts/gsm8k/train_generator.sh
62 | ```
63 |
64 |
65 | ### Train the OVM
66 |
67 | #### Generation
68 |
69 | First use the generator `generator_id` to generate `n_solutions` solutions for each question in the training set:
70 | ```bash
71 | cd OVM
72 | bash scripts/gsm8k/generate.sh
73 | ```
74 | You should first configure the path of your generator checkpoint `model_name_or_path`, and set `--target_set train`
75 |
76 | The output will be saved to `data/gsm8k/model_generation/`
77 |
78 |
79 | #### Training
80 |
81 | Train OVM using `train_verifier.sh`. First set `WANDB_API_KEY`, `WANDB_ENTITY`, `save_dir`, and `checkpoint_dir` (the path of generator checkpoint). The verifier is named with `save_verifier_id`
82 | ```bash
83 | cd OVM
84 | bash scripts/gsm8k/train_verifier.sh
85 | ```
86 |
87 |
88 |
89 | ## Inference
90 |
91 | ### Value-Guided Beam Search
92 |
93 | Configure your generator checkpoint path `model_name_or_path` and verifier checkpoint path `verifier_model_name_or_path` in `eval_step_beam.sh`
94 | ```bash
95 | cd OVM
96 | bash scripts/gsm8k/eval_step_beam.sh
97 | ```
98 |
99 | (When `dedup_mode=1`, beam selection prioritizes linguistically different candidates: for example, with `n_beam=3` and sorted candidates ['a', 'a', 'b', 'b', 'c'], it selects ['a', 'b', 'c'] rather than ['a', 'a', 'b'].)
100 |
101 | The output will be saved to `eval_results/gsm8k/generator_with_verifier/test`
102 | (or `eval_results/game24/generator_with_verifier/mid`)
103 |
104 |
105 | ### Vanilla Sampling with ORM
106 |
107 | 1. First sample the data: configure the generator checkpoint `model_name_or_path`, and set `--target_set test`
108 | ```bash
109 | cd OVM
110 | bash scripts/gsm8k/generate.sh
111 | ```
112 |
113 | 2. Then call the ORM to score and rerank the samples: configure the verifier checkpoint `verifier_model_name_or_path`
114 | ```bash
115 | cd OVM
116 | bash scripts/gsm8k/eval_with_verifier.sh
117 | ```
118 |
119 | The output will be saved to `eval_results/gsm8k/generator_with_verifier/test`
120 |
121 |
122 |
123 | ### Greedy
124 |
125 | Configure your generator checkpoint path `model_name_or_path`
126 | ```bash
127 | cd OVM
128 | bash scripts/gsm8k/greedy_eval.sh
129 | ```
130 | The output will be saved to `eval_results/gsm8k/generator/test`
131 |
132 |
133 |
134 |
135 |
136 | ## Citation
137 | ```
138 | @misc{yu2023outcomesupervised,
139 | title={Outcome-supervised Verifiers for Planning in Mathematical Reasoning},
140 | author={Fei Yu and Anningzhe Gao and Benyou Wang},
141 | year={2023},
142 | eprint={2311.09724},
143 | archivePrefix={arXiv},
144 | primaryClass={cs.AI}
145 | }
146 | ```
147 |
148 | ## Star History
149 |
150 |
151 |
152 |
153 |
154 |
155 |
156 |
157 |
--------------------------------------------------------------------------------
/configs/zero1.yaml:
--------------------------------------------------------------------------------
1 | command_file: null
2 | commands: null
3 | compute_environment: LOCAL_MACHINE
4 | deepspeed_config:
5 | deepspeed_multinode_launcher: standard
6 | gradient_clipping: 1.0
7 | zero_stage: 1
8 | distributed_type: DEEPSPEED
9 | downcast_bf16: true
10 | dynamo_backend: 'NO'
11 | gpu_ids: null
12 | machine_rank: 0
13 | main_process_ip: null
14 | main_process_port: null
15 | main_training_function: main
16 | fsdp_config: {}
17 | megatron_lm_config: {}
18 | mixed_precision: bf16
19 | num_machines: 1
20 | num_processes: 8
21 | rdzv_backend: static
22 | same_network: true
23 | tpu_name: null
24 | tpu_zone: null
25 | use_cpu: false
26 |
--------------------------------------------------------------------------------
/configs/zero1_offload.yaml:
--------------------------------------------------------------------------------
1 | command_file: null
2 | commands: null
3 | compute_environment: LOCAL_MACHINE
4 | deepspeed_config:
5 | deepspeed_multinode_launcher: standard
6 | gradient_clipping: 1.0
7 | zero_stage: 1
8 | distributed_type: DEEPSPEED
9 | downcast_bf16: true
10 | dynamo_backend: 'NO'
11 | gpu_ids: null
12 | machine_rank: 0
13 | main_process_ip: null
14 | main_process_port: null
15 | main_training_function: main
16 | fsdp_config: {}
17 | megatron_lm_config: {}
18 | mixed_precision: bf16
19 | num_machines: 1
20 | num_processes: 8
21 | rdzv_backend: static
22 | same_network: true
23 | tpu_name: null
24 | tpu_zone: null
25 | use_cpu: true
26 |
--------------------------------------------------------------------------------
/configs/zero2.yaml:
--------------------------------------------------------------------------------
1 | command_file: null
2 | commands: null
3 | compute_environment: LOCAL_MACHINE
4 | deepspeed_config:
5 | deepspeed_multinode_launcher: standard
6 | gradient_clipping: 1.0
7 | zero_stage: 2
8 | distributed_type: DEEPSPEED
9 | downcast_bf16: true
10 | dynamo_backend: 'NO'
11 | gpu_ids: null
12 | machine_rank: 0
13 | main_process_ip: null
14 | main_process_port: null
15 | main_training_function: main
16 | fsdp_config: {}
17 | megatron_lm_config: {}
18 | mixed_precision: bf16
19 | num_machines: 1
20 | num_processes: 8
21 | rdzv_backend: static
22 | same_network: true
23 | tpu_name: null
24 | tpu_zone: null
25 | use_cpu: false
26 |
--------------------------------------------------------------------------------
/configs/zero2_offload.yaml:
--------------------------------------------------------------------------------
1 | command_file: null
2 | commands: null
3 | compute_environment: LOCAL_MACHINE
4 | deepspeed_config:
5 | deepspeed_multinode_launcher: standard
6 | gradient_clipping: 1.0
7 | offload_optimizer_device: cpu
8 | offload_param_device: cpu
9 | zero_stage: 2
10 | distributed_type: DEEPSPEED
11 | downcast_bf16: true
12 | dynamo_backend: 'NO'
13 | gpu_ids: null
14 | machine_rank: 0
15 | main_process_ip: null
16 | main_process_port: null
17 | main_training_function: main
18 | fsdp_config: {}
19 | megatron_lm_config: {}
20 | mixed_precision: bf16
21 | num_machines: 1
22 | num_processes: 8
23 | rdzv_backend: static
24 | same_network: true
25 | tpu_name: null
26 | tpu_zone: null
27 | use_cpu: true
28 |
--------------------------------------------------------------------------------
/eval_results/game24/generator/mid/metrics_llama7b-2-ep2.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.11
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator/mid/responses_llama7b-2-ep2_samples.jsonl:
--------------------------------------------------------------------------------
1 | {"idx": 0, "input": "Use numbers and basic arithmetic operations (+ - * /) to obtain 24. Each step, you are only allowed to choose two of the remaining numbers to obtain a new number.\nInput: 4 5 6 10\n", "question": "4 5 6 10", "response": "5 + 6 = 11 (left: 4 10 11)\n10 + 11 = 21 (left: 4 21)\n4 + 21 = 25 (left: 25)\nThe answer is 4 + (10 + (5 + 6)) = 25", "response_answer": "4 + (10 + (5 + 6))", "label": false}
2 | {"idx": 1, "input": "Use numbers and basic arithmetic operations (+ - * /) to obtain 24. Each step, you are only allowed to choose two of the remaining numbers to obtain a new number.\nInput: 1 2 4 7\n", "question": "1 2 4 7", "response": "1 + 7 = 8 (left: 2 4 8)\n4 + 8 = 12 (left: 2 12)\n2 * 12 = 24 (left: 24)\nThe answer is 2 * (4 + (1 + 7)) = 24", "response_answer": "2 * (4 + (1 + 7))", "label": true}
3 | {"idx": 2, "input": "Use numbers and basic arithmetic operations (+ - * /) to obtain 24. Each step, you are only allowed to choose two of the remaining numbers to obtain a new number.\nInput: 2 5 8 11\n", "question": "2 5 8 11", "response": "2 - 5 = -3 (left: 8 11 -3)\n8 + 11 = 19 (left: -3 19)\n19 - -3 = 22 (left: 22)\nThe answer is (8 + 11) - (2 - 5) = 22", "response_answer": "(8 + 11) - (2 - 5)", "label": false}
4 | {"idx": 3, "input": "Use numbers and basic arithmetic operations (+ - * /) to obtain 24. Each step, you are only allowed to choose two of the remaining numbers to obtain a new number.\nInput: 3 4 4 13\n", "question": "3 4 4 13", "response": "4 - 3 = 1 (left: 4 13 1)\n13 - 1 = 12 (left: 4 12)\n4 * 12 = 48 (left: 48)\nThe answer is 4 * (13 - (4 - 3)) = 48", "response_answer": "4 * (13 - (4 - 3))", "label": false}
5 | {"idx": 4, "input": "Use numbers and basic arithmetic operations (+ - * /) to obtain 24. Each step, you are only allowed to choose two of the remaining numbers to obtain a new number.\nInput: 6 7 8 9\n", "question": "6 7 8 9", "response": "6 - 7 = -1 (left: 8 9 -1)\n8 + -1 = 7 (left: 9 7)\n9 + 7 = 16 (left: 16)\nThe answer is 9 + (8 + (6 - 7)) = 16", "response_answer": "9 + (8 + (6 - 7))", "label": false}
6 | {"idx": 5, "input": "Use numbers and basic arithmetic operations (+ - * /) to obtain 24. Each step, you are only allowed to choose two of the remaining numbers to obtain a new number.\nInput: 1 11 11 13\n", "question": "1 11 11 13", "response": "11 + 13 = 24 (left: 1 11 24)\n1 - 11 = -10 (left: 24 -10)\n24 - -10 = 34 (left: 34)\nThe answer is (11 + 13) - (1 - 11) = 34", "response_answer": "(11 + 13) - (1 - 11)", "label": false}
7 | {"idx": 6, "input": "Use numbers and basic arithmetic operations (+ - * /) to obtain 24. Each step, you are only allowed to choose two of the remaining numbers to obtain a new number.\nInput: 1 8 10 11\n", "question": "1 8 10 11", "response": "10 - 1 = 9 (left: 8 11 9)\n11 + 9 = 20 (left: 8 20)\n8 + 20 = 28 (left: 28)\nThe answer is 8 + (11 + (10 - 1)) = 28", "response_answer": "8 + (11 + (10 - 1))", "label": false}
8 | {"idx": 7, "input": "Use numbers and basic arithmetic operations (+ - * /) to obtain 24. Each step, you are only allowed to choose two of the remaining numbers to obtain a new number.\nInput: 2 3 6 9\n", "question": "2 3 6 9", "response": "2 * 9 = 18 (left: 3 6 18)\n3 + 18 = 21 (left: 6 21)\n6 + 21 = 27 (left: 27)\nThe answer is 6 + (3 + (2 * 9)) = 27", "response_answer": "6 + (3 + (2 * 9))", "label": false}
9 | {"idx": 8, "input": "Use numbers and basic arithmetic operations (+ - * /) to obtain 24. Each step, you are only allowed to choose two of the remaining numbers to obtain a new number.\nInput: 1 3 5 9\n", "question": "1 3 5 9", "response": "1 * 5 = 5 (left: 3 9 5)\n9 - 5 = 4 (left: 3 4)\n3 * 4 = 12 (left: 12)\nThe answer is 3 * (9 - (1 * 5)) = 12", "response_answer": "3 * (9 - (1 * 5))", "label": false}
10 | {"idx": 9, "input": "Use numbers and basic arithmetic operations (+ - * /) to obtain 24. Each step, you are only allowed to choose two of the remaining numbers to obtain a new number.\nInput: 3 3 7 12\n", "question": "3 3 7 12", "response": "3 + 7 = 10 (left: 3 12 10)\n12 + 10 = 22 (left: 3 22)\n3 + 22 = 25 (left: 25)\nThe answer is 3 + (12 + (3 + 7)) = 25", "response_answer": "3 + (12 + (3 + 7))", "label": false}
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml1.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.36
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml10.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.65
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml10_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.66
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml10_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.59
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml1_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.35
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml1_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.34
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml2.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.52
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml2_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.49
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml2_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.45
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml4.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.66
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml4_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.68
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml4_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.6
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml5.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.69
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml5_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.65
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml5_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.67
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_g(llama7b-2-ep2)_v(llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1).csv:
--------------------------------------------------------------------------------
1 | n_solution,acc
2 | 5,0.25
3 | 10,0.41
4 | 15,0.56
5 | 20,0.63
6 | 25,0.69
7 | 30,0.73
8 | 35,0.78
9 | 40,0.81
10 | 45,0.86
11 | 50,0.87
12 | 55,0.89
13 | 60,0.9
14 | 65,0.9
15 | 70,0.9
16 | 75,0.92
17 | 80,0.92
18 | 85,0.92
19 | 90,0.93
20 | 95,0.94
21 | 100,0.95
22 | 105,0.95
23 | 110,0.95
24 | 115,0.95
25 | 120,0.95
26 | 125,0.95
27 | 130,0.95
28 | 135,0.95
29 | 140,0.95
30 | 145,0.95
31 | 150,0.95
32 | 155,0.95
33 | 160,0.97
34 | 165,0.97
35 | 170,0.97
36 | 175,0.97
37 | 180,0.97
38 | 185,0.97
39 | 190,0.97
40 | 195,0.98
41 | 200,0.98
42 | 205,0.98
43 | 210,0.98
44 | 215,0.98
45 | 220,0.98
46 | 225,0.98
47 | 230,0.98
48 | 235,0.98
49 | 240,0.98
50 | 245,0.99
51 | 250,0.99
52 | 255,0.99
53 | 260,0.99
54 | 265,0.99
55 | 270,0.99
56 | 275,0.99
57 | 280,0.99
58 | 285,0.99
59 | 290,0.99
60 | 295,0.99
61 | 300,0.99
62 |
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step100_llama7b-2-ep2_beaml1.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.84
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step100_llama7b-2-ep2_beaml10.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.98
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step100_llama7b-2-ep2_beaml10_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.98
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step100_llama7b-2-ep2_beaml10_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.97
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step100_llama7b-2-ep2_beaml1_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.85
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step100_llama7b-2-ep2_beaml1_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.84
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step100_llama7b-2-ep2_beaml2.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.92
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step100_llama7b-2-ep2_beaml20.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.95
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step100_llama7b-2-ep2_beaml20_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.95
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step100_llama7b-2-ep2_beaml20_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.97
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step100_llama7b-2-ep2_beaml25.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.98
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step100_llama7b-2-ep2_beaml25_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 1.0
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step100_llama7b-2-ep2_beaml25_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.97
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step100_llama7b-2-ep2_beaml2_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.88
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step100_llama7b-2-ep2_beaml2_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.88
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step100_llama7b-2-ep2_beaml4.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.91
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step100_llama7b-2-ep2_beaml4_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.91
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step100_llama7b-2-ep2_beaml4_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.92
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step100_llama7b-2-ep2_beaml5.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.93
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step100_llama7b-2-ep2_beaml50.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.96
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step100_llama7b-2-ep2_beaml50_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.95
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step100_llama7b-2-ep2_beaml50_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.92
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step100_llama7b-2-ep2_beaml5_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.92
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step100_llama7b-2-ep2_beaml5_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.95
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml1.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.76
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml10.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.71
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml10_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.7
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml10_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.76
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml1_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.74
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml1_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.8
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml2.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.81
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml2_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.77
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml2_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.7
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml4.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.81
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml4_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.78
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml4_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.77
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml5.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.79
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml5_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.75
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml5_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.8
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step50_llama7b-2-ep2_beaml1.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.86
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step50_llama7b-2-ep2_beaml10.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.94
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step50_llama7b-2-ep2_beaml10_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.92
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step50_llama7b-2-ep2_beaml10_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.92
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step50_llama7b-2-ep2_beaml1_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.85
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step50_llama7b-2-ep2_beaml1_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.83
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step50_llama7b-2-ep2_beaml2.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.87
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step50_llama7b-2-ep2_beaml25.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.9
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step50_llama7b-2-ep2_beaml25_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.9
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step50_llama7b-2-ep2_beaml25_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.9
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step50_llama7b-2-ep2_beaml2_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.88
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step50_llama7b-2-ep2_beaml2_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.87
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step50_llama7b-2-ep2_beaml5.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.9
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step50_llama7b-2-ep2_beaml5_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.89
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step50_llama7b-2-ep2_beaml5_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.9
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step80_llama7b-2-ep2_beaml1.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.83
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step80_llama7b-2-ep2_beaml10.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.98
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step80_llama7b-2-ep2_beaml10_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.97
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step80_llama7b-2-ep2_beaml10_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.97
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step80_llama7b-2-ep2_beaml1_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.84
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step80_llama7b-2-ep2_beaml1_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.88
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step80_llama7b-2-ep2_beaml2.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.88
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step80_llama7b-2-ep2_beaml20.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.96
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step80_llama7b-2-ep2_beaml20_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.96
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step80_llama7b-2-ep2_beaml20_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.96
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step80_llama7b-2-ep2_beaml2_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.9
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step80_llama7b-2-ep2_beaml2_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.88
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step80_llama7b-2-ep2_beaml4.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.92
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step80_llama7b-2-ep2_beaml40.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.95
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step80_llama7b-2-ep2_beaml40_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.95
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step80_llama7b-2-ep2_beaml40_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.95
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step80_llama7b-2-ep2_beaml4_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.92
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step80_llama7b-2-ep2_beaml4_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.93
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step80_llama7b-2-ep2_beaml5.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.92
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step80_llama7b-2-ep2_beaml5_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.93
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step80_llama7b-2-ep2_beaml5_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.91
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_g(llama7b-2-ep2)_v(llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10).csv:
--------------------------------------------------------------------------------
1 | n_solution,acc
2 | 5,0.25
3 | 10,0.41
4 | 15,0.56
5 | 20,0.62
6 | 25,0.68
7 | 30,0.72
8 | 35,0.77
9 | 40,0.8
10 | 45,0.85
11 | 50,0.86
12 | 55,0.88
13 | 60,0.89
14 | 65,0.89
15 | 70,0.89
16 | 75,0.92
17 | 80,0.92
18 | 85,0.92
19 | 90,0.93
20 | 95,0.94
21 | 100,0.95
22 | 105,0.95
23 | 110,0.95
24 | 115,0.95
25 | 120,0.95
26 | 125,0.95
27 | 130,0.95
28 | 135,0.95
29 | 140,0.95
30 | 145,0.95
31 | 150,0.95
32 | 155,0.95
33 | 160,0.97
34 | 165,0.97
35 | 170,0.97
36 | 175,0.97
37 | 180,0.97
38 | 185,0.97
39 | 190,0.97
40 | 195,0.98
41 | 200,0.98
42 | 205,0.98
43 | 210,0.98
44 | 215,0.98
45 | 220,0.98
46 | 225,0.98
47 | 230,0.98
48 | 235,0.98
49 | 240,0.98
50 | 245,0.99
51 | 250,0.99
52 | 255,0.99
53 | 260,0.99
54 | 265,0.99
55 | 270,0.99
56 | 275,0.99
57 | 280,0.99
58 | 285,0.99
59 | 290,0.99
60 | 295,0.99
61 | 300,0.99
62 |
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml1.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.5
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml10.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.76
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml10_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.64
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml10_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.66
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml1_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.46
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml1_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.5
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml2.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.6
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml2_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.58
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml2_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.62
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml4.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.71
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml4_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.64
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml4_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.71
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml5.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.66
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml5_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.74
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml5_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.77
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_g(llama7b-2-ep2)_v(llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2).csv:
--------------------------------------------------------------------------------
1 | n_solution,acc
2 | 5,0.25
3 | 10,0.41
4 | 15,0.56
5 | 20,0.63
6 | 25,0.69
7 | 30,0.73
8 | 35,0.78
9 | 40,0.81
10 | 45,0.86
11 | 50,0.87
12 | 55,0.89
13 | 60,0.9
14 | 65,0.9
15 | 70,0.9
16 | 75,0.92
17 | 80,0.92
18 | 85,0.92
19 | 90,0.93
20 | 95,0.94
21 | 100,0.95
22 | 105,0.95
23 | 110,0.95
24 | 115,0.95
25 | 120,0.95
26 | 125,0.95
27 | 130,0.95
28 | 135,0.95
29 | 140,0.95
30 | 145,0.95
31 | 150,0.95
32 | 155,0.95
33 | 160,0.97
34 | 165,0.97
35 | 170,0.97
36 | 175,0.97
37 | 180,0.97
38 | 185,0.97
39 | 190,0.97
40 | 195,0.98
41 | 200,0.98
42 | 205,0.98
43 | 210,0.98
44 | 215,0.98
45 | 220,0.98
46 | 225,0.98
47 | 230,0.98
48 | 235,0.98
49 | 240,0.98
50 | 245,0.99
51 | 250,0.99
52 | 255,0.99
53 | 260,0.99
54 | 265,0.99
55 | 270,0.99
56 | 275,0.99
57 | 280,0.99
58 | 285,0.99
59 | 290,0.99
60 | 295,0.99
61 | 300,0.99
62 |
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml1.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.6
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml10.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.66
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml10_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.64
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml10_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.68
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml1_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.54
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml1_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.61
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml2.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.66
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml2_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.67
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml2_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.73
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml4.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.73
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml4_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.76
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml4_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.72
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml5.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.74
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml5_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.77
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml5_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.75
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_g(llama7b-2-ep2)_v(llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4).csv:
--------------------------------------------------------------------------------
1 | n_solution,acc
2 | 5,0.25
3 | 10,0.41
4 | 15,0.56
5 | 20,0.63
6 | 25,0.69
7 | 30,0.73
8 | 35,0.78
9 | 40,0.81
10 | 45,0.86
11 | 50,0.87
12 | 55,0.89
13 | 60,0.9
14 | 65,0.9
15 | 70,0.9
16 | 75,0.92
17 | 80,0.92
18 | 85,0.92
19 | 90,0.93
20 | 95,0.94
21 | 100,0.95
22 | 105,0.95
23 | 110,0.95
24 | 115,0.95
25 | 120,0.95
26 | 125,0.95
27 | 130,0.95
28 | 135,0.95
29 | 140,0.95
30 | 145,0.95
31 | 150,0.95
32 | 155,0.95
33 | 160,0.97
34 | 165,0.97
35 | 170,0.97
36 | 175,0.97
37 | 180,0.97
38 | 185,0.97
39 | 190,0.97
40 | 195,0.98
41 | 200,0.98
42 | 205,0.98
43 | 210,0.98
44 | 215,0.98
45 | 220,0.98
46 | 225,0.98
47 | 230,0.98
48 | 235,0.98
49 | 240,0.98
50 | 245,0.99
51 | 250,0.99
52 | 255,0.99
53 | 260,0.99
54 | 265,0.99
55 | 270,0.99
56 | 275,0.99
57 | 280,0.99
58 | 285,0.99
59 | 290,0.99
60 | 295,0.99
61 | 300,0.99
62 |
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml1.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.68
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml10.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.74
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml10_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.72
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml10_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.71
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml1_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.65
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml1_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.69
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml2.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.77
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml2_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.78
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml2_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.76
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml4.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.78
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml4_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.81
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml4_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.77
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml5.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.79
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml5_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.76
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml5_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.78
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_g(llama7b-2-ep2)_v(llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6).csv:
--------------------------------------------------------------------------------
1 | n_solution,acc
2 | 5,0.25
3 | 10,0.41
4 | 15,0.56
5 | 20,0.62
6 | 25,0.68
7 | 30,0.72
8 | 35,0.77
9 | 40,0.8
10 | 45,0.85
11 | 50,0.86
12 | 55,0.88
13 | 60,0.89
14 | 65,0.89
15 | 70,0.89
16 | 75,0.91
17 | 80,0.91
18 | 85,0.91
19 | 90,0.92
20 | 95,0.93
21 | 100,0.94
22 | 105,0.94
23 | 110,0.95
24 | 115,0.95
25 | 120,0.95
26 | 125,0.95
27 | 130,0.95
28 | 135,0.95
29 | 140,0.95
30 | 145,0.95
31 | 150,0.95
32 | 155,0.95
33 | 160,0.97
34 | 165,0.97
35 | 170,0.97
36 | 175,0.97
37 | 180,0.97
38 | 185,0.97
39 | 190,0.97
40 | 195,0.98
41 | 200,0.98
42 | 205,0.98
43 | 210,0.98
44 | 215,0.98
45 | 220,0.98
46 | 225,0.98
47 | 230,0.98
48 | 235,0.98
49 | 240,0.98
50 | 245,0.99
51 | 250,0.99
52 | 255,0.99
53 | 260,0.99
54 | 265,0.99
55 | 270,0.99
56 | 275,0.99
57 | 280,0.99
58 | 285,0.99
59 | 290,0.99
60 | 295,0.99
61 | 300,0.99
62 |
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml1.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.7
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml10.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.7
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml10_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.74
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml10_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.74
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml1_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.72
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml1_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.7
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml2.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.73
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml2_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.7
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml2_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.78
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml4.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.7
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml4_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.78
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml4_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.73
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml5.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.79
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml5_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.78
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml5_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.74
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_g(llama7b-2-ep2)_v(llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8).csv:
--------------------------------------------------------------------------------
1 | n_solution,acc
2 | 5,0.25
3 | 10,0.41
4 | 15,0.56
5 | 20,0.62
6 | 25,0.68
7 | 30,0.72
8 | 35,0.77
9 | 40,0.8
10 | 45,0.85
11 | 50,0.86
12 | 55,0.88
13 | 60,0.89
14 | 65,0.89
15 | 70,0.89
16 | 75,0.91
17 | 80,0.91
18 | 85,0.91
19 | 90,0.92
20 | 95,0.93
21 | 100,0.94
22 | 105,0.94
23 | 110,0.95
24 | 115,0.95
25 | 120,0.95
26 | 125,0.95
27 | 130,0.95
28 | 135,0.95
29 | 140,0.95
30 | 145,0.95
31 | 150,0.95
32 | 155,0.95
33 | 160,0.97
34 | 165,0.97
35 | 170,0.97
36 | 175,0.97
37 | 180,0.97
38 | 185,0.97
39 | 190,0.97
40 | 195,0.98
41 | 200,0.98
42 | 205,0.98
43 | 210,0.98
44 | 215,0.98
45 | 220,0.98
46 | 225,0.98
47 | 230,0.98
48 | 235,0.98
49 | 240,0.98
50 | 245,0.99
51 | 250,0.99
52 | 255,0.99
53 | 260,0.99
54 | 265,0.99
55 | 270,0.99
56 | 275,0.99
57 | 280,0.99
58 | 285,0.99
59 | 290,0.99
60 | 295,0.99
61 | 300,0.99
62 |
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml1.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.13
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml10.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.48
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml10_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.58
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml10_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.48
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml1_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.19
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml1_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.18
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml2.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.23
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml2_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.28
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml2_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.21
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml4.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.34
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml4_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.37
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml4_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.41
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml5.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.45
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml5_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.46
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml5_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.41
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_g(llama7b-2-ep2)_v(easyprocess2-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1).csv:
--------------------------------------------------------------------------------
1 | n_solution,acc
2 | 5,0.25
3 | 10,0.38
4 | 15,0.54
5 | 20,0.6
6 | 25,0.68
7 | 30,0.72
8 | 35,0.76
9 | 40,0.8
10 | 45,0.86
11 | 50,0.87
12 | 55,0.89
13 | 60,0.9
14 | 65,0.9
15 | 70,0.9
16 | 75,0.92
17 | 80,0.92
18 | 85,0.92
19 | 90,0.93
20 | 95,0.94
21 | 100,0.95
22 | 105,0.94
23 | 110,0.94
24 | 115,0.94
25 | 120,0.94
26 | 125,0.94
27 | 130,0.94
28 | 135,0.94
29 | 140,0.94
30 | 145,0.94
31 | 150,0.94
32 | 155,0.94
33 | 160,0.96
34 | 165,0.96
35 | 170,0.96
36 | 175,0.96
37 | 180,0.96
38 | 185,0.96
39 | 190,0.96
40 | 195,0.97
41 | 200,0.97
42 | 205,0.97
43 | 210,0.98
44 | 215,0.98
45 | 220,0.98
46 | 225,0.98
47 | 230,0.98
48 | 235,0.98
49 | 240,0.98
50 | 245,0.99
51 | 250,0.99
52 | 255,0.99
53 | 260,0.99
54 | 265,0.99
55 | 270,0.99
56 | 275,0.99
57 | 280,0.99
58 | 285,0.99
59 | 290,0.99
60 | 295,0.99
61 | 300,0.99
62 |
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml1.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.07
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml10.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.51
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml10_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.58
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml10_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.51
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml1_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.05
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml1_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.06
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml2.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.09
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml2_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.13
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml2_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.07
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml4.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.26
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml4_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.25
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml4_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.21
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml5.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.33
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml5_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.23
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml5_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.27
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_g(llama7b-2-ep2)_v(easyprocess2-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10).csv:
--------------------------------------------------------------------------------
1 | n_solution,acc
2 | 5,0.25
3 | 10,0.37
4 | 15,0.52
5 | 20,0.6
6 | 25,0.68
7 | 30,0.73
8 | 35,0.77
9 | 40,0.8
10 | 45,0.86
11 | 50,0.86
12 | 55,0.88
13 | 60,0.89
14 | 65,0.89
15 | 70,0.89
16 | 75,0.91
17 | 80,0.91
18 | 85,0.92
19 | 90,0.93
20 | 95,0.94
21 | 100,0.95
22 | 105,0.95
23 | 110,0.95
24 | 115,0.95
25 | 120,0.95
26 | 125,0.95
27 | 130,0.95
28 | 135,0.95
29 | 140,0.95
30 | 145,0.95
31 | 150,0.95
32 | 155,0.95
33 | 160,0.97
34 | 165,0.97
35 | 170,0.97
36 | 175,0.97
37 | 180,0.97
38 | 185,0.97
39 | 190,0.97
40 | 195,0.98
41 | 200,0.98
42 | 205,0.98
43 | 210,0.98
44 | 215,0.98
45 | 220,0.98
46 | 225,0.98
47 | 230,0.98
48 | 235,0.98
49 | 240,0.98
50 | 245,0.99
51 | 250,0.99
52 | 255,0.99
53 | 260,0.99
54 | 265,0.99
55 | 270,0.99
56 | 275,0.99
57 | 280,0.99
58 | 285,0.99
59 | 290,0.99
60 | 295,0.99
61 | 300,0.99
62 |
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml1.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.07
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml10.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.51
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml10_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.55
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml10_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.48
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml1_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.1
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml1_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.07
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml2.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.13
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml2_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.16
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml2_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.13
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml4.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.28
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml4_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.26
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml4_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.28
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml5.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.3
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml5_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.35
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml5_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.33
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_g(llama7b-2-ep2)_v(easyprocess2-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2).csv:
--------------------------------------------------------------------------------
1 | n_solution,acc
2 | 5,0.25
3 | 10,0.37
4 | 15,0.52
5 | 20,0.59
6 | 25,0.68
7 | 30,0.72
8 | 35,0.76
9 | 40,0.8
10 | 45,0.85
11 | 50,0.86
12 | 55,0.88
13 | 60,0.89
14 | 65,0.89
15 | 70,0.89
16 | 75,0.92
17 | 80,0.92
18 | 85,0.92
19 | 90,0.93
20 | 95,0.94
21 | 100,0.95
22 | 105,0.94
23 | 110,0.94
24 | 115,0.94
25 | 120,0.94
26 | 125,0.94
27 | 130,0.94
28 | 135,0.94
29 | 140,0.94
30 | 145,0.94
31 | 150,0.94
32 | 155,0.94
33 | 160,0.96
34 | 165,0.96
35 | 170,0.96
36 | 175,0.96
37 | 180,0.96
38 | 185,0.96
39 | 190,0.96
40 | 195,0.97
41 | 200,0.97
42 | 205,0.97
43 | 210,0.97
44 | 215,0.97
45 | 220,0.97
46 | 225,0.97
47 | 230,0.97
48 | 235,0.97
49 | 240,0.97
50 | 245,0.98
51 | 250,0.98
52 | 255,0.98
53 | 260,0.98
54 | 265,0.98
55 | 270,0.98
56 | 275,0.98
57 | 280,0.98
58 | 285,0.98
59 | 290,0.98
60 | 295,0.98
61 | 300,0.98
62 |
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml1.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.09
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml10.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.58
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml10_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.56
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml10_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.54
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml1_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.1
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml1_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.1
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml2.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.17
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml2_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.22
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml2_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.18
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml4.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.29
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml4_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.26
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml4_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.29
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml5.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.37
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml5_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.43
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml5_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.42
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_g(llama7b-2-ep2)_v(easyprocess2-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4).csv:
--------------------------------------------------------------------------------
1 | n_solution,acc
2 | 5,0.25
3 | 10,0.4
4 | 15,0.55
5 | 20,0.62
6 | 25,0.69
7 | 30,0.73
8 | 35,0.78
9 | 40,0.8
10 | 45,0.86
11 | 50,0.86
12 | 55,0.88
13 | 60,0.9
14 | 65,0.9
15 | 70,0.9
16 | 75,0.92
17 | 80,0.92
18 | 85,0.92
19 | 90,0.93
20 | 95,0.94
21 | 100,0.95
22 | 105,0.94
23 | 110,0.94
24 | 115,0.94
25 | 120,0.94
26 | 125,0.94
27 | 130,0.94
28 | 135,0.94
29 | 140,0.94
30 | 145,0.94
31 | 150,0.94
32 | 155,0.94
33 | 160,0.96
34 | 165,0.96
35 | 170,0.96
36 | 175,0.96
37 | 180,0.96
38 | 185,0.96
39 | 190,0.96
40 | 195,0.97
41 | 200,0.97
42 | 205,0.97
43 | 210,0.97
44 | 215,0.97
45 | 220,0.97
46 | 225,0.97
47 | 230,0.97
48 | 235,0.97
49 | 240,0.97
50 | 245,0.98
51 | 250,0.98
52 | 255,0.98
53 | 260,0.98
54 | 265,0.98
55 | 270,0.98
56 | 275,0.98
57 | 280,0.98
58 | 285,0.98
59 | 290,0.98
60 | 295,0.98
61 | 300,0.98
62 |
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml1.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.07
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml10.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.47
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml10_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.43
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml10_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.37
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml1_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.05
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml1_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.08
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml2.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.13
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml2_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.09
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml2_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.14
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml4.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.22
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml4_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.19
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml4_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.21
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml5.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.25
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml5_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.29
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml5_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.29
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_g(llama7b-2-ep2)_v(easyprocess2-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6).csv:
--------------------------------------------------------------------------------
1 | n_solution,acc
2 | 5,0.25
3 | 10,0.38
4 | 15,0.53
5 | 20,0.6
6 | 25,0.68
7 | 30,0.73
8 | 35,0.77
9 | 40,0.8
10 | 45,0.86
11 | 50,0.86
12 | 55,0.88
13 | 60,0.89
14 | 65,0.89
15 | 70,0.89
16 | 75,0.91
17 | 80,0.91
18 | 85,0.92
19 | 90,0.93
20 | 95,0.94
21 | 100,0.95
22 | 105,0.95
23 | 110,0.95
24 | 115,0.95
25 | 120,0.95
26 | 125,0.95
27 | 130,0.95
28 | 135,0.95
29 | 140,0.95
30 | 145,0.95
31 | 150,0.95
32 | 155,0.95
33 | 160,0.97
34 | 165,0.97
35 | 170,0.97
36 | 175,0.97
37 | 180,0.97
38 | 185,0.97
39 | 190,0.97
40 | 195,0.98
41 | 200,0.98
42 | 205,0.98
43 | 210,0.98
44 | 215,0.98
45 | 220,0.98
46 | 225,0.98
47 | 230,0.98
48 | 235,0.98
49 | 240,0.98
50 | 245,0.99
51 | 250,0.99
52 | 255,0.99
53 | 260,0.99
54 | 265,0.99
55 | 270,0.99
56 | 275,0.99
57 | 280,0.99
58 | 285,0.99
59 | 290,0.99
60 | 295,0.99
61 | 300,0.99
62 |
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml1.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.09
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml10.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.55
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml10_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.53
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml10_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.55
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml1_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.08
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml1_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.1
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml2.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.13
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml2_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.14
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml2_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.18
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml4.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.23
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml4_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.34
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml4_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.31
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml5.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.24
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml5_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.36
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml5_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.32
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_g(llama7b-2-ep2)_v(easyprocess2-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8).csv:
--------------------------------------------------------------------------------
1 | n_solution,acc
2 | 5,0.25
3 | 10,0.37
4 | 15,0.52
5 | 20,0.59
6 | 25,0.68
7 | 30,0.73
8 | 35,0.77
9 | 40,0.8
10 | 45,0.86
11 | 50,0.86
12 | 55,0.88
13 | 60,0.89
14 | 65,0.89
15 | 70,0.89
16 | 75,0.91
17 | 80,0.91
18 | 85,0.92
19 | 90,0.93
20 | 95,0.94
21 | 100,0.95
22 | 105,0.95
23 | 110,0.95
24 | 115,0.95
25 | 120,0.95
26 | 125,0.95
27 | 130,0.95
28 | 135,0.95
29 | 140,0.95
30 | 145,0.95
31 | 150,0.95
32 | 155,0.95
33 | 160,0.97
34 | 165,0.97
35 | 170,0.97
36 | 175,0.97
37 | 180,0.97
38 | 185,0.97
39 | 190,0.97
40 | 195,0.98
41 | 200,0.98
42 | 205,0.98
43 | 210,0.98
44 | 215,0.98
45 | 220,0.98
46 | 225,0.98
47 | 230,0.98
48 | 235,0.98
49 | 240,0.98
50 | 245,0.99
51 | 250,0.99
52 | 255,0.99
53 | 260,0.99
54 | 265,0.99
55 | 270,0.99
56 | 275,0.99
57 | 280,0.99
58 | 285,0.99
59 | 290,0.99
60 | 295,0.99
61 | 300,0.99
62 |
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2-all_beaml1.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.54
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2-all_beaml1_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.6
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml1.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.48
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml10.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.67
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml10_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.79
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml10_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.66
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml1_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.49
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml1_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.53
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml2.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.55
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml2_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.59
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml2_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.64
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml4.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.76
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml4_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.65
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml4_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.64
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml5.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.68
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml5_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.63
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_%step20_llama7b-2-ep2_beaml5_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.67
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1/metrics_g(llama7b-2-ep2)_v(easyprocess2progress-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1).csv:
--------------------------------------------------------------------------------
1 | n_solution,acc
2 | 5,0.25
3 | 10,0.38
4 | 15,0.54
5 | 20,0.61
6 | 25,0.68
7 | 30,0.72
8 | 35,0.76
9 | 40,0.8
10 | 45,0.86
11 | 50,0.87
12 | 55,0.89
13 | 60,0.9
14 | 65,0.9
15 | 70,0.9
16 | 75,0.92
17 | 80,0.92
18 | 85,0.92
19 | 90,0.93
20 | 95,0.94
21 | 100,0.94
22 | 105,0.94
23 | 110,0.94
24 | 115,0.94
25 | 120,0.94
26 | 125,0.94
27 | 130,0.94
28 | 135,0.94
29 | 140,0.94
30 | 145,0.94
31 | 150,0.94
32 | 155,0.94
33 | 160,0.96
34 | 165,0.96
35 | 170,0.96
36 | 175,0.96
37 | 180,0.96
38 | 185,0.96
39 | 190,0.96
40 | 195,0.97
41 | 200,0.97
42 | 205,0.97
43 | 210,0.97
44 | 215,0.97
45 | 220,0.97
46 | 225,0.97
47 | 230,0.97
48 | 235,0.97
49 | 240,0.97
50 | 245,0.98
51 | 250,0.98
52 | 255,0.98
53 | 260,0.98
54 | 265,0.98
55 | 270,0.98
56 | 275,0.98
57 | 280,0.98
58 | 285,0.98
59 | 290,0.98
60 | 295,0.98
61 | 300,0.98
62 |
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml1.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.72
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml10.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.68
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml10_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.71
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml10_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.67
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml1_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.72
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml1_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.74
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml2.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.77
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml2_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.77
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml2_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.82
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml4.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.8
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml4_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.79
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml4_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.74
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml5.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.76
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml5_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.71
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_%step20_llama7b-2-ep2_beaml5_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.73
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10/metrics_g(llama7b-2-ep2)_v(easyprocess2progress-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10).csv:
--------------------------------------------------------------------------------
1 | n_solution,acc
2 | 5,0.25
3 | 10,0.39
4 | 15,0.54
5 | 20,0.62
6 | 25,0.69
7 | 30,0.73
8 | 35,0.77
9 | 40,0.8
10 | 45,0.85
11 | 50,0.85
12 | 55,0.87
13 | 60,0.88
14 | 65,0.88
15 | 70,0.88
16 | 75,0.91
17 | 80,0.91
18 | 85,0.92
19 | 90,0.93
20 | 95,0.94
21 | 100,0.95
22 | 105,0.95
23 | 110,0.95
24 | 115,0.95
25 | 120,0.95
26 | 125,0.95
27 | 130,0.95
28 | 135,0.95
29 | 140,0.95
30 | 145,0.95
31 | 150,0.95
32 | 155,0.95
33 | 160,0.97
34 | 165,0.97
35 | 170,0.97
36 | 175,0.97
37 | 180,0.97
38 | 185,0.97
39 | 190,0.97
40 | 195,0.98
41 | 200,0.98
42 | 205,0.98
43 | 210,0.98
44 | 215,0.98
45 | 220,0.98
46 | 225,0.98
47 | 230,0.98
48 | 235,0.98
49 | 240,0.98
50 | 245,0.99
51 | 250,0.99
52 | 255,0.99
53 | 260,0.99
54 | 265,0.99
55 | 270,0.99
56 | 275,0.99
57 | 280,0.99
58 | 285,0.99
59 | 290,0.99
60 | 295,0.99
61 | 300,0.99
62 |
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml1.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.75
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml10.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.75
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml10_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.62
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml10_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.63
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml1_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.7
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml1_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.77
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml2.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.78
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml2_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.72
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml2_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.8
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml4.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.8
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml4_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.81
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml4_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.75
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml5.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.71
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml5_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.7
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_%step20_llama7b-2-ep2_beaml5_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.74
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2/metrics_g(llama7b-2-ep2)_v(easyprocess2progress-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2).csv:
--------------------------------------------------------------------------------
1 | n_solution,acc
2 | 5,0.25
3 | 10,0.37
4 | 15,0.52
5 | 20,0.59
6 | 25,0.68
7 | 30,0.72
8 | 35,0.75
9 | 40,0.78
10 | 45,0.84
11 | 50,0.84
12 | 55,0.86
13 | 60,0.88
14 | 65,0.89
15 | 70,0.89
16 | 75,0.91
17 | 80,0.91
18 | 85,0.91
19 | 90,0.92
20 | 95,0.93
21 | 100,0.94
22 | 105,0.93
23 | 110,0.94
24 | 115,0.94
25 | 120,0.94
26 | 125,0.94
27 | 130,0.94
28 | 135,0.94
29 | 140,0.94
30 | 145,0.94
31 | 150,0.94
32 | 155,0.94
33 | 160,0.96
34 | 165,0.96
35 | 170,0.96
36 | 175,0.96
37 | 180,0.96
38 | 185,0.96
39 | 190,0.96
40 | 195,0.97
41 | 200,0.97
42 | 205,0.97
43 | 210,0.97
44 | 215,0.97
45 | 220,0.97
46 | 225,0.97
47 | 230,0.97
48 | 235,0.97
49 | 240,0.97
50 | 245,0.98
51 | 250,0.98
52 | 255,0.98
53 | 260,0.98
54 | 265,0.98
55 | 270,0.98
56 | 275,0.98
57 | 280,0.98
58 | 285,0.98
59 | 290,0.98
60 | 295,0.98
61 | 300,0.98
62 |
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml1.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.77
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml10.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.66
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml10_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.69
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml10_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.64
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml1_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.79
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml1_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.82
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml2.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.78
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml2_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.77
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml2_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.84
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml4.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.75
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml4_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.77
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml4_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.77
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml5.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.74
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml5_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.73
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_%step20_llama7b-2-ep2_beaml5_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.73
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4/metrics_g(llama7b-2-ep2)_v(easyprocess2progress-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4).csv:
--------------------------------------------------------------------------------
1 | n_solution,acc
2 | 5,0.25
3 | 10,0.38
4 | 15,0.53
5 | 20,0.6
6 | 25,0.69
7 | 30,0.73
8 | 35,0.77
9 | 40,0.8
10 | 45,0.85
11 | 50,0.85
12 | 55,0.87
13 | 60,0.88
14 | 65,0.88
15 | 70,0.88
16 | 75,0.91
17 | 80,0.91
18 | 85,0.92
19 | 90,0.93
20 | 95,0.94
21 | 100,0.95
22 | 105,0.95
23 | 110,0.95
24 | 115,0.95
25 | 120,0.95
26 | 125,0.95
27 | 130,0.95
28 | 135,0.95
29 | 140,0.95
30 | 145,0.95
31 | 150,0.95
32 | 155,0.95
33 | 160,0.97
34 | 165,0.97
35 | 170,0.97
36 | 175,0.97
37 | 180,0.97
38 | 185,0.97
39 | 190,0.97
40 | 195,0.98
41 | 200,0.98
42 | 205,0.98
43 | 210,0.98
44 | 215,0.98
45 | 220,0.98
46 | 225,0.98
47 | 230,0.98
48 | 235,0.98
49 | 240,0.98
50 | 245,0.99
51 | 250,0.99
52 | 255,0.99
53 | 260,0.99
54 | 265,0.99
55 | 270,0.99
56 | 275,0.99
57 | 280,0.99
58 | 285,0.99
59 | 290,0.99
60 | 295,0.99
61 | 300,0.99
62 |
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml1.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.79
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml10.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.75
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml10_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.65
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml10_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.69
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml1_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.72
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml1_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.78
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml2.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.78
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml2_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.85
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml2_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.76
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml4.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.79
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml4_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.79
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml4_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.8
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml5.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.74
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml5_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.81
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_%step20_llama7b-2-ep2_beaml5_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.86
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6/metrics_g(llama7b-2-ep2)_v(easyprocess2progress-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6).csv:
--------------------------------------------------------------------------------
1 | n_solution,acc
2 | 5,0.25
3 | 10,0.39
4 | 15,0.54
5 | 20,0.62
6 | 25,0.69
7 | 30,0.73
8 | 35,0.77
9 | 40,0.8
10 | 45,0.85
11 | 50,0.86
12 | 55,0.88
13 | 60,0.89
14 | 65,0.89
15 | 70,0.89
16 | 75,0.92
17 | 80,0.92
18 | 85,0.92
19 | 90,0.93
20 | 95,0.94
21 | 100,0.95
22 | 105,0.95
23 | 110,0.95
24 | 115,0.95
25 | 120,0.95
26 | 125,0.95
27 | 130,0.95
28 | 135,0.95
29 | 140,0.95
30 | 145,0.95
31 | 150,0.95
32 | 155,0.95
33 | 160,0.97
34 | 165,0.97
35 | 170,0.97
36 | 175,0.97
37 | 180,0.97
38 | 185,0.97
39 | 190,0.97
40 | 195,0.98
41 | 200,0.98
42 | 205,0.98
43 | 210,0.98
44 | 215,0.98
45 | 220,0.98
46 | 225,0.98
47 | 230,0.98
48 | 235,0.98
49 | 240,0.98
50 | 245,0.99
51 | 250,0.99
52 | 255,0.99
53 | 260,0.99
54 | 265,0.99
55 | 270,0.99
56 | 275,0.99
57 | 280,0.99
58 | 285,0.99
59 | 290,0.99
60 | 295,0.99
61 | 300,0.99
62 |
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml1.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.73
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml10.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.66
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml10_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.66
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml10_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.56
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml1_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.75
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml1_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.77
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml2.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.78
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml2_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.77
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml2_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.82
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml4.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.71
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml4_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.75
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml4_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.77
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml5.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.73
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml5_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.78
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_%step20_llama7b-2-ep2_beaml5_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.76
3 | }
--------------------------------------------------------------------------------
/eval_results/game24/generator_with_verifier/mid/prm-o-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8/metrics_g(llama7b-2-ep2)_v(easyprocess2progress-llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8).csv:
--------------------------------------------------------------------------------
1 | n_solution,acc
2 | 5,0.25
3 | 10,0.38
4 | 15,0.54
5 | 20,0.61
6 | 25,0.68
7 | 30,0.73
8 | 35,0.76
9 | 40,0.79
10 | 45,0.85
11 | 50,0.85
12 | 55,0.87
13 | 60,0.88
14 | 65,0.88
15 | 70,0.88
16 | 75,0.91
17 | 80,0.91
18 | 85,0.92
19 | 90,0.93
20 | 95,0.94
21 | 100,0.95
22 | 105,0.95
23 | 110,0.95
24 | 115,0.95
25 | 120,0.95
26 | 125,0.95
27 | 130,0.95
28 | 135,0.95
29 | 140,0.95
30 | 145,0.95
31 | 150,0.95
32 | 155,0.95
33 | 160,0.97
34 | 165,0.97
35 | 170,0.97
36 | 175,0.97
37 | 180,0.97
38 | 185,0.97
39 | 190,0.97
40 | 195,0.98
41 | 200,0.98
42 | 205,0.98
43 | 210,0.98
44 | 215,0.98
45 | 220,0.98
46 | 225,0.98
47 | 230,0.98
48 | 235,0.98
49 | 240,0.98
50 | 245,0.99
51 | 250,0.99
52 | 255,0.99
53 | 260,0.99
54 | 265,0.99
55 | 270,0.99
56 | 275,0.99
57 | 280,0.99
58 | 285,0.99
59 | 290,0.99
60 | 295,0.99
61 | 300,0.99
62 |
--------------------------------------------------------------------------------
/eval_results/game24/verifier/mid/metrics_v(llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep1)_g(llama7b-2-ep2).json:
--------------------------------------------------------------------------------
1 | {
2 | "#question": 100,
3 | "#solution_per_problem": 300,
4 | "#total_solutions": 30000,
5 | "accuracy": 0.9986666666666667,
6 | "mpk": {
7 | "mp1": {
8 | "mpk": 0.99,
9 | "mpk_adapt": 1.0,
10 | "filter_mpk": 1.0,
11 | "n_example_filter": 99
12 | },
13 | "mp3": {
14 | "mpk": 0.9733333333333334,
15 | "mpk_adapt": 1.0,
16 | "filter_mpk": 1.0,
17 | "n_example_filter": 96
18 | },
19 | "mp5": {
20 | "mpk": 0.96,
21 | "mpk_adapt": 0.997979797979798,
22 | "filter_mpk": 0.9978723404255319,
23 | "n_example_filter": 94
24 | },
25 | "mp10": {
26 | "mpk": 0.888,
27 | "mpk_adapt": 0.9961840628507295,
28 | "filter_mpk": 0.9986111111111112,
29 | "n_example_filter": 72
30 | }
31 | }
32 | }
--------------------------------------------------------------------------------
/eval_results/game24/verifier/mid/metrics_v(llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep10)_g(llama7b-2-ep2).json:
--------------------------------------------------------------------------------
1 | {
2 | "#question": 100,
3 | "#solution_per_problem": 300,
4 | "#total_solutions": 30000,
5 | "accuracy": 0.9993,
6 | "mpk": {
7 | "mp1": {
8 | "mpk": 0.99,
9 | "mpk_adapt": 1.0,
10 | "filter_mpk": 1.0,
11 | "n_example_filter": 99
12 | },
13 | "mp3": {
14 | "mpk": 0.9733333333333334,
15 | "mpk_adapt": 1.0,
16 | "filter_mpk": 1.0,
17 | "n_example_filter": 96
18 | },
19 | "mp5": {
20 | "mpk": 0.9620000000000001,
21 | "mpk_adapt": 1.0,
22 | "filter_mpk": 1.0,
23 | "n_example_filter": 94
24 | },
25 | "mp10": {
26 | "mpk": 0.8889999999999999,
27 | "mpk_adapt": 0.9988776655443322,
28 | "filter_mpk": 1.0,
29 | "n_example_filter": 72
30 | }
31 | }
32 | }
--------------------------------------------------------------------------------
/eval_results/game24/verifier/mid/metrics_v(llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep2)_g(llama7b-2-ep2).json:
--------------------------------------------------------------------------------
1 | {
2 | "#question": 100,
3 | "#solution_per_problem": 300,
4 | "#total_solutions": 30000,
5 | "accuracy": 0.9986666666666667,
6 | "mpk": {
7 | "mp1": {
8 | "mpk": 0.99,
9 | "mpk_adapt": 1.0,
10 | "filter_mpk": 1.0,
11 | "n_example_filter": 99
12 | },
13 | "mp3": {
14 | "mpk": 0.9733333333333334,
15 | "mpk_adapt": 1.0,
16 | "filter_mpk": 1.0,
17 | "n_example_filter": 96
18 | },
19 | "mp5": {
20 | "mpk": 0.9620000000000001,
21 | "mpk_adapt": 1.0,
22 | "filter_mpk": 1.0,
23 | "n_example_filter": 94
24 | },
25 | "mp10": {
26 | "mpk": 0.8889999999999999,
27 | "mpk_adapt": 0.9988776655443322,
28 | "filter_mpk": 1.0,
29 | "n_example_filter": 72
30 | }
31 | }
32 | }
--------------------------------------------------------------------------------
/eval_results/game24/verifier/mid/metrics_v(llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep4)_g(llama7b-2-ep2).json:
--------------------------------------------------------------------------------
1 | {
2 | "#question": 100,
3 | "#solution_per_problem": 300,
4 | "#total_solutions": 30000,
5 | "accuracy": 0.9989,
6 | "mpk": {
7 | "mp1": {
8 | "mpk": 0.99,
9 | "mpk_adapt": 1.0,
10 | "filter_mpk": 1.0,
11 | "n_example_filter": 99
12 | },
13 | "mp3": {
14 | "mpk": 0.9733333333333334,
15 | "mpk_adapt": 1.0,
16 | "filter_mpk": 1.0,
17 | "n_example_filter": 96
18 | },
19 | "mp5": {
20 | "mpk": 0.9620000000000001,
21 | "mpk_adapt": 1.0,
22 | "filter_mpk": 1.0,
23 | "n_example_filter": 94
24 | },
25 | "mp10": {
26 | "mpk": 0.8889999999999999,
27 | "mpk_adapt": 0.9971941638608306,
28 | "filter_mpk": 1.0,
29 | "n_example_filter": 72
30 | }
31 | }
32 | }
--------------------------------------------------------------------------------
/eval_results/game24/verifier/mid/metrics_v(llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep6)_g(llama7b-2-ep2).json:
--------------------------------------------------------------------------------
1 | {
2 | "#question": 100,
3 | "#solution_per_problem": 300,
4 | "#total_solutions": 30000,
5 | "accuracy": 0.9991333333333333,
6 | "mpk": {
7 | "mp1": {
8 | "mpk": 0.99,
9 | "mpk_adapt": 1.0,
10 | "filter_mpk": 1.0,
11 | "n_example_filter": 99
12 | },
13 | "mp3": {
14 | "mpk": 0.9733333333333334,
15 | "mpk_adapt": 1.0,
16 | "filter_mpk": 1.0,
17 | "n_example_filter": 96
18 | },
19 | "mp5": {
20 | "mpk": 0.96,
21 | "mpk_adapt": 0.997979797979798,
22 | "filter_mpk": 0.9978723404255319,
23 | "n_example_filter": 94
24 | },
25 | "mp10": {
26 | "mpk": 0.888,
27 | "mpk_adapt": 0.9961840628507295,
28 | "filter_mpk": 0.9986111111111112,
29 | "n_example_filter": 72
30 | }
31 | }
32 | }
--------------------------------------------------------------------------------
/eval_results/game24/verifier/mid/metrics_v(llama7b-2-ep2-n100-scahead-mse-lm-token-bsz128-ep8)_g(llama7b-2-ep2).json:
--------------------------------------------------------------------------------
1 | {
2 | "#question": 100,
3 | "#solution_per_problem": 300,
4 | "#total_solutions": 30000,
5 | "accuracy": 0.9993,
6 | "mpk": {
7 | "mp1": {
8 | "mpk": 0.99,
9 | "mpk_adapt": 1.0,
10 | "filter_mpk": 1.0,
11 | "n_example_filter": 99
12 | },
13 | "mp3": {
14 | "mpk": 0.97,
15 | "mpk_adapt": 0.9966329966329966,
16 | "filter_mpk": 0.9965277777777778,
17 | "n_example_filter": 96
18 | },
19 | "mp5": {
20 | "mpk": 0.96,
21 | "mpk_adapt": 0.997979797979798,
22 | "filter_mpk": 0.9978723404255319,
23 | "n_example_filter": 94
24 | },
25 | "mp10": {
26 | "mpk": 0.8889999999999999,
27 | "mpk_adapt": 0.9973063973063974,
28 | "filter_mpk": 0.9986111111111112,
29 | "n_example_filter": 72
30 | }
31 | }
32 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator/test/all_metrics_n100_llama7b-2-ep2.csv:
--------------------------------------------------------------------------------
1 | n_solution,pass@k,pass@majority
2 | 5,0.6292645943896892,0.41546626231993933
3 | 10,0.7247915087187263,0.4890068233510235
4 | 15,0.7922668688400303,0.514783927217589
5 | 20,0.8233510235026535,0.5291887793783169
6 | 25,0.844579226686884,0.5413191811978771
7 | 30,0.8597422289613343,0.5511751326762699
8 | 35,0.8688400303260045,0.558756633813495
9 | 40,0.8832448824867324,0.5640636846095527
10 | 45,0.88855193328279,0.5678544351781653
11 | 50,0.8961334344200151,0.5595147839272175
12 | 55,0.9044730856709629,0.5625473843821076
13 | 60,0.910538286580743,0.5678544351781653
14 | 65,0.9173616376042456,0.5701288855193328
15 | 70,0.9226686884003032,0.5724033358605004
16 | 75,0.9249431387414708,0.574677786201668
17 | 80,0.9302501895375285,0.5769522365428355
18 | 85,0.931008339651251,0.5769522365428355
19 | 90,0.9347990902198635,0.5799848369977255
20 | 95,0.9370735405610311,0.5716451857467779
21 | 100,0.9438968915845337,0.5731614859742229
22 |
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator/test/all_metrics_n300_mistral7b-ep2.csv:
--------------------------------------------------------------------------------
1 | n_solution,pass@k,pass@majority
2 | 5,0.7907505686125853,0.6322971948445792
3 | 10,0.8673237300985596,0.6679302501895376
4 | 15,0.8991660348749052,0.6853677028051555
5 | 20,0.9166034874905231,0.6974981046247157
6 | 25,0.9287338893100834,0.6952236542835482
7 | 30,0.9370735405610311,0.7035633055344959
8 | 35,0.9446550416982562,0.7073540561031084
9 | 40,0.9492039423805914,0.711144806671721
10 | 45,0.9514783927217589,0.7088703563305534
11 | 50,0.956027293404094,0.7156937073540561
12 | 55,0.959059893858984,0.7179681576952237
13 | 60,0.9628506444275967,0.7194844579226687
14 | 65,0.9666413949962093,0.7240333586050038
15 | 70,0.9673995451099318,0.7225170583775588
16 | 75,0.9681576952236542,0.7156937073540561
17 | 80,0.9704321455648218,0.7194844579226687
18 | 85,0.9719484457922669,0.7240333586050038
19 | 90,0.9727065959059894,0.7255496588324488
20 | 95,0.9742228961334344,0.7232752084912812
21 | 100,0.976497346474602,0.7270659590598939
22 | 105,0.9772554965883244,0.7255496588324488
23 | 110,0.978013646702047,0.7247915087187263
24 | 115,0.978013646702047,0.7263078089461713
25 | 120,0.978013646702047,0.7285822592873389
26 | 125,0.979529946929492,0.7308567096285065
27 | 130,0.9818043972706596,0.7308567096285065
28 | 135,0.9818043972706596,0.733131159969674
29 | 140,0.9833206974981046,0.7293404094010614
30 | 145,0.9833206974981046,0.7293404094010614
31 | 150,0.9840788476118272,0.7278241091736164
32 | 155,0.9848369977255497,0.7285822592873389
33 | 160,0.9848369977255497,0.730098559514784
34 | 165,0.9848369977255497,0.730098559514784
35 | 170,0.9848369977255497,0.730098559514784
36 | 175,0.9848369977255497,0.731614859742229
37 | 180,0.9848369977255497,0.7308567096285065
38 | 185,0.9855951478392722,0.7308567096285065
39 | 190,0.9855951478392722,0.7293404094010614
40 | 195,0.9871114480667172,0.730098559514784
41 | 200,0.9878695981804397,0.7308567096285065
42 | 205,0.9878695981804397,0.730098559514784
43 | 210,0.9886277482941622,0.7308567096285065
44 | 215,0.9893858984078847,0.733131159969674
45 | 220,0.9893858984078847,0.7338893100833965
46 | 225,0.9901440485216073,0.7338893100833965
47 | 230,0.9901440485216073,0.7338893100833965
48 | 235,0.9901440485216073,0.7323730098559514
49 | 240,0.9901440485216073,0.7323730098559514
50 | 245,0.9901440485216073,0.7293404094010614
51 | 250,0.9901440485216073,0.7285822592873389
52 | 255,0.9901440485216073,0.7263078089461713
53 | 260,0.9901440485216073,0.7270659590598939
54 | 265,0.9901440485216073,0.7285822592873389
55 | 270,0.9909021986353298,0.730098559514784
56 | 275,0.9909021986353298,0.7285822592873389
57 | 280,0.9916603487490523,0.7285822592873389
58 | 285,0.9916603487490523,0.7278241091736164
59 | 290,0.9916603487490523,0.7285822592873389
60 | 295,0.9916603487490523,0.7285822592873389
61 | 300,0.9916603487490523,0.7270659590598939
62 |
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator/train/all_metrics_n100_llama7b-2-ep2.csv:
--------------------------------------------------------------------------------
1 | n_solution,pass@k,pass@majority
2 | 5,0.6716178241670012,0.48467817476247826
3 | 10,0.7707747892412685,0.5517195236183594
4 | 15,0.8170748026227753,0.5727284892278871
5 | 20,0.8455774120165931,0.5878495918640438
6 | 25,0.8676568981667335,0.598420982202596
7 | 30,0.8823765556001606,0.6002943931486685
8 | 35,0.8954904322226682,0.6048441054462732
9 | 40,0.9039207814799947,0.6112672286899505
10 | 45,0.9114144252642847,0.6143449752442125
11 | 50,0.918506623845845,0.6144787903117891
12 | 55,0.9235915964137562,0.6176903519336278
13 | 60,0.9296132744547037,0.6200990231500066
14 | 65,0.9336277264820019,0.6191623176769704
15 | 70,0.9383112538471832,0.6215709888933494
16 | 75,0.9413890004014452,0.6213033587581962
17 | 80,0.9444667469557072,0.6213033587581962
18 | 85,0.9467416031045096,0.6246487354476113
19 | 90,0.9488826441857353,0.6235782149069985
20 | 95,0.9512913154021143,0.6246487354476113
21 | 100,0.9527632811454569,0.6245149203800348
22 |
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator/train/all_metrics_n100_mistral7b-ep2.csv:
--------------------------------------------------------------------------------
1 | n_solution,pass@k,pass@majority
2 | 5,0.8731433159373746,0.7490967482938579
3 | 10,0.9211829251973772,0.7903117891074535
4 | 15,0.9408537401311388,0.8030242205272313
5 | 20,0.9542352468887997,0.8085106382978723
6 | 25,0.9606583701324769,0.8157366519470093
7 | 30,0.9656095276328115,0.8194834738391543
8 | 35,0.9689549043222266,0.8222935902582631
9 | 40,0.9720326508764887,0.8237655560016057
10 | 45,0.9741736919577144,0.8243008162719122
11 | 50,0.9760471029037869,0.8272447477585976
12 | 55,0.9785895891877425,0.8293857888398234
13 | 60,0.9803291850662385,0.8267094874882912
14 | 65,0.9811320754716981,0.8283152682992105
15 | 70,0.9827378562826175,0.8299210491101298
16 | 75,0.9838083768232303,0.8305901244480128
17 | 80,0.9848788973638432,0.8313930148534725
18 | 85,0.9860832329720326,0.8320620901913556
19 | 90,0.9863508631071859,0.8321959052589322
20 | 95,0.9867523083099157,0.833266425799545
21 | 100,0.9871537535126456,0.8338016860698515
22 |
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step100_llama7b-2-ep2_beam1.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.6702047005307051
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step100_llama7b-2-ep2_beam10.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.714177407126611
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step100_llama7b-2-ep2_beam10_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.7043214556482184
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step100_llama7b-2-ep2_beam10_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.7119029567854435
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step100_llama7b-2-ep2_beam1_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.6777862016679302
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step100_llama7b-2-ep2_beam1_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.6868840030326004
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step100_llama7b-2-ep2_beam2.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.6944655041698257
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step100_llama7b-2-ep2_beam20.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.7164518574677786
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step100_llama7b-2-ep2_beam20_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.7179681576952237
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step100_llama7b-2-ep2_beam20_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.7278241091736164
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step100_llama7b-2-ep2_beam25.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.7232752084912812
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step100_llama7b-2-ep2_beam25_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.7323730098559514
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step100_llama7b-2-ep2_beam25_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.7361637604245641
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step100_llama7b-2-ep2_beam2_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.6830932524639879
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step100_llama7b-2-ep2_beam2_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.6921910538286581
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step100_llama7b-2-ep2_beam4.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.7012888551933283
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step100_llama7b-2-ep2_beam4_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.7020470053070508
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step100_llama7b-2-ep2_beam4_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.7156937073540561
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step100_llama7b-2-ep2_beam5.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.7012888551933283
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step100_llama7b-2-ep2_beam50.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.7422289613343442
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step100_llama7b-2-ep2_beam50_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.7376800606520091
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step100_llama7b-2-ep2_beam50_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.7323730098559514
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step100_llama7b-2-ep2_beam5_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.6974981046247157
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step100_llama7b-2-ep2_beam5_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.7035633055344959
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step20_llama7b-2-ep2_beam1.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.6664139499620925
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step20_llama7b-2-ep2_beam10.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.6952236542835482
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step20_llama7b-2-ep2_beam10_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.6846095526914329
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step20_llama7b-2-ep2_beam10_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.690674753601213
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step20_llama7b-2-ep2_beam1_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.6664139499620925
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step20_llama7b-2-ep2_beam1_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.6580742987111448
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step20_llama7b-2-ep2_beam2.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.6702047005307051
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step20_llama7b-2-ep2_beam2_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.6785443517816527
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step20_llama7b-2-ep2_beam2_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.6755117513267627
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step20_llama7b-2-ep2_beam4.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.6937073540561031
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step20_llama7b-2-ep2_beam4_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.686125852918878
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step20_llama7b-2-ep2_beam4_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.689158453373768
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step20_llama7b-2-ep2_beam5.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.6899166034874905
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step20_llama7b-2-ep2_beam5_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.6876421531463229
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_%step20_llama7b-2-ep2_beam5_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.6868840030326004
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step100_mistral7b-ep2_beam1.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.8112206216830933
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step100_mistral7b-ep2_beam10.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.8385140257771039
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step100_mistral7b-ep2_beam10_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.8483699772554966
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step100_mistral7b-ep2_beam10_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.8392721758908264
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step100_mistral7b-ep2_beam1_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.8180439727065959
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step100_mistral7b-ep2_beam1_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.8074298711144806
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step100_mistral7b-ep2_beam2.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.8218347232752085
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step100_mistral7b-ep2_beam20.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.8430629264594389
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step100_mistral7b-ep2_beam20_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.8438210765731615
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step100_mistral7b-ep2_beam20_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.8415466262319939
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step100_mistral7b-ep2_beam25.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.8476118271417741
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step100_mistral7b-ep2_beam25_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.8506444275966641
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step100_mistral7b-ep2_beam25_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.8438210765731615
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step100_mistral7b-ep2_beam2_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.819560272934041
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step100_mistral7b-ep2_beam2_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.8188021228203184
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step100_mistral7b-ep2_beam5.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.8332069749810462
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step100_mistral7b-ep2_beam50.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.8521607278241091
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step100_mistral7b-ep2_beam50_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.8415466262319939
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step100_mistral7b-ep2_beam50_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.8468536770280516
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step100_mistral7b-ep2_beam5_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.824109173616376
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step100_mistral7b-ep2_beam5_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.8332069749810462
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step20_mistral7b-ep2_beam1.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.8119787717968158
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step20_mistral7b-ep2_beam10.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.8248673237300985
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step20_mistral7b-ep2_beam10_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.8256254738438211
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step20_mistral7b-ep2_beam10_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.8278999241849886
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step20_mistral7b-ep2_beam1_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.796057619408643
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step20_mistral7b-ep2_beam1_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.7945413191811979
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step20_mistral7b-ep2_beam2.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.819560272934041
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step20_mistral7b-ep2_beam2_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.8119787717968158
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step20_mistral7b-ep2_beam2_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.8188021228203184
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step20_mistral7b-ep2_beam4.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.819560272934041
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step20_mistral7b-ep2_beam4_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.8248673237300985
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step20_mistral7b-ep2_beam4_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.8188021228203184
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step20_mistral7b-ep2_beam5.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.824109173616376
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step20_mistral7b-ep2_beam5_41.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.8233510235026535
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_%step20_mistral7b-ep2_beam5_43.json:
--------------------------------------------------------------------------------
1 | {
2 | "accuracy": 0.8150113722517058
3 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/test/mistral7b-ep2-n100-scahead-mse-lm-token/metrics_g(mistral7b-ep2)_v(mistral7b-ep2-n100-scahead-mse-lm-token).csv:
--------------------------------------------------------------------------------
1 | n_solution,acc
2 | 5,0.7407126611068992
3 | 10,0.7922668688400303
4 | 15,0.8157695223654283
5 | 20,0.821076573161486
6 | 25,0.8294162244124337
7 | 30,0.8332069749810462
8 | 35,0.8316906747536013
9 | 40,0.8347232752084913
10 | 45,0.8415466262319939
11 | 50,0.8430629264594389
12 | 55,0.8423047763457164
13 | 60,0.846095526914329
14 | 65,0.8476118271417741
15 | 70,0.8514025777103866
16 | 75,0.8514025777103866
17 | 80,0.849128127369219
18 | 85,0.8476118271417741
19 | 90,0.8468536770280516
20 | 95,0.8468536770280516
21 | 100,0.846095526914329
22 | 105,0.8453373768006065
23 | 110,0.8453373768006065
24 | 115,0.8468536770280516
25 | 120,0.846095526914329
26 | 125,0.849128127369219
27 | 130,0.8483699772554966
28 | 135,0.8483699772554966
29 | 140,0.8468536770280516
30 | 145,0.846095526914329
31 | 150,0.8483699772554966
32 | 155,0.8498862774829417
33 | 160,0.8498862774829417
34 | 165,0.8498862774829417
35 | 170,0.8506444275966641
36 | 175,0.8498862774829417
37 | 180,0.8514025777103866
38 | 185,0.8506444275966641
39 | 190,0.8506444275966641
40 | 195,0.849128127369219
41 | 200,0.849128127369219
42 | 205,0.8498862774829417
43 | 210,0.8498862774829417
44 | 215,0.8506444275966641
45 | 220,0.8514025777103866
46 | 225,0.8514025777103866
47 | 230,0.8529188779378317
48 | 235,0.8544351781652767
49 | 240,0.8567096285064443
50 | 245,0.8559514783927218
51 | 250,0.8559514783927218
52 | 255,0.8559514783927218
53 | 260,0.8559514783927218
54 | 265,0.8559514783927218
55 | 270,0.8551933282789992
56 | 275,0.8544351781652767
57 | 280,0.8559514783927218
58 | 285,0.8559514783927218
59 | 290,0.8559514783927218
60 | 295,0.8567096285064443
61 | 300,0.8567096285064443
62 |
--------------------------------------------------------------------------------
/eval_results/gsm8k/generator_with_verifier/train/llama7b-2-ep2-n100-scahead-mse-lm-token/metrics_g(llama7b-2-ep2)_v(llama7b-2-ep2-n100-scahead-mse-lm-token).csv:
--------------------------------------------------------------------------------
1 | n_solution,acc
2 | 5,0.63575538605647
3 | 10,0.7090860430884518
4 | 15,0.7371872072795397
5 | 20,0.7539140907266159
6 | 25,0.7664927070788171
7 | 30,0.7758597618091797
8 | 35,0.7825505151880102
9 | 40,0.7876354877559213
10 | 45,0.7928542753914091
11 | 50,0.7964672822159775
12 | 55,0.8003479191756991
13 | 60,0.8032918506623846
14 | 65,0.80623578214907
15 | 70,0.808644453365449
16 | 75,0.8094473437709087
17 | 80,0.809313528703332
18 | 85,0.8106516793790981
19 | 90,0.8105178643115215
20 | 95,0.8111869396494045
21 | 100,0.8127927204603238
22 |
--------------------------------------------------------------------------------
/eval_results/gsm8k/verifier/test/metrics_v(llama7b-2-ep2-n100-scahead-mse-lm-token)_g(llama7b-2-ep2).json:
--------------------------------------------------------------------------------
1 | {
2 | "#question": 1319,
3 | "#solution_per_problem": 500,
4 | "#total_solutions": 659500,
5 | "accuracy": 0.9066353297952995,
6 | "mp1": 0.7293404094010614
7 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/verifier/test/metrics_v(mistral7b-ep2-n100-scahead-mse-lm-token)_g(mistral7b-ep2).json:
--------------------------------------------------------------------------------
1 | {
2 | "#question": 1319,
3 | "#solution_per_problem": 300,
4 | "#total_solutions": 395700,
5 | "accuracy": 0.9070710133939853,
6 | "mp1": 0.8567096285064443
7 | }
--------------------------------------------------------------------------------
/eval_results/gsm8k/verifier/train/metrics_v(llama7b-2-ep2-n100-scahead-mse-lm-token)_g(llama7b-2-ep2).json:
--------------------------------------------------------------------------------
1 | {
2 | "#question": 7473,
3 | "#solution_per_problem": 100,
4 | "#total_solutions": 747300,
5 | "accuracy": 0.9572862304295464,
6 | "mp1": 0.8127927204603238
7 | }
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | torch==1.13.1
2 | deepspeed==0.10.0
3 | accelerate==0.20.3
4 | transformers==4.34.1
5 | wandb==0.15.4
--------------------------------------------------------------------------------
/scripts/game24/eval_step_beam.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Launch eval_generator_by_step.py on the game24 "mid" set with --inference_mode beam.
3 | generator_id=llama7b-2-ep2
4 | verifier_id=n100-scahead-mse-lm-token-bsz128-ep10
5 | # Values forwarded to --n_beam and --n_sampling_steps below.
6 | n_beam=4
7 | n_sampling_steps=20
8 |
9 | # Checkpoint paths. The leading ~ stays unquoted so the shell expands it at assignment time.
10 | model_name_or_path=~/models/game24/generators/${generator_id}
11 | verifier_model_name_or_path=~/models/game24/verifiers/${generator_id}-${verifier_id}
12 |
13 | # Expansions are quoted so a path containing spaces or glob characters stays a single argument.
14 | accelerate launch \
15 |     --main_process_port=20659 \
16 |     eval_generator_by_step.py \
17 |     --model_name_or_path "${model_name_or_path}" \
18 |     --verifier_model_name_or_path "${verifier_model_name_or_path}" \
19 |     --dataset game24 \
20 |     --data_dir data/game24 \
21 |     --output_dir eval_results/game24/generator_with_verifier \
22 |     --target_set mid \
23 |     --inference_mode beam \
24 |     --batch_size 30 \
25 |     --vs_batch_size 64 \
26 |     --n_beam "${n_beam}" \
27 |     --n_sampling_steps "${n_sampling_steps}" \
28 |     --max_n_step 10 \
29 |     --max_step_length 100 \
30 |     --dedup_mode 1 \
31 |     --do_sample True \
32 |     --temperature 0.7 \
33 |     --top_k 50 \
34 |     --top_p 1.0 \
35 |     --max_new_tokens 400 \
36 |     --seed 42
37 |
38 |
39 |
--------------------------------------------------------------------------------
/scripts/game24/eval_with_verifier.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Launch eval_with_verifier.py for the game24 "mid" set using the verifier checkpoint below.
3 | generator_id=llama7b-2-ep2
4 | verifier_id=n100-scahead-mse-lm-token-bsz128-ep10
5 | verifier_model_name_or_path=~/models/game24/verifiers/${generator_id}-${verifier_id}
6 | # Expansions are quoted so a path containing spaces or glob characters stays a single argument.
7 | accelerate launch \
8 |     --main_process_port=29510 \
9 |     eval_with_verifier.py \
10 |     --model_name_or_path "${verifier_model_name_or_path}" \
11 |     --data_dir data/game24/model_generation \
12 |     --verifier_output_dir eval_results/game24/verifier \
13 |     --generator_metric_dir eval_results/game24/generator_with_verifier \
14 |     --generator_id "${generator_id}" \
15 |     --target_set mid \
16 |     --batch_size 64 \
17 |     --seed 42
18 |
19 |
20 |
--------------------------------------------------------------------------------
/scripts/game24/generate.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Launch generate_paths_and_eval.py with sampling enabled on the game24 train set.
3 | generator_id=llama7b-2-ep2
4 | n_solutions=100
5 | # Generator checkpoint path. The leading ~ stays unquoted so the shell expands it at assignment time.
6 | model_name_or_path=~/models/game24/generators/${generator_id}
7 | # Expansions are quoted so a path containing spaces or glob characters stays a single argument.
8 | accelerate launch \
9 |     --main_process_port=20658 \
10 |     generate_paths_and_eval.py \
11 |     --model_name_or_path "${model_name_or_path}" \
12 |     --dataset game24 \
13 |     --data_dir data/game24 \
14 |     --output_dir data/game24/model_generation \
15 |     --metric_output_dir eval_results/game24/generator \
16 |     --target_set train \
17 |     --n_solutions "${n_solutions}" \
18 |     --batch_size 16 \
19 |     --do_sample True \
20 |     --temperature 0.7 \
21 |     --top_k 50 \
22 |     --top_p 1.0 \
23 |     --max_new_tokens 400
24 |
--------------------------------------------------------------------------------
/scripts/game24/greedy_eval.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Launches generate_paths_and_eval.py with sampling disabled (--do_sample False)
# on the game24 "mid" split.

generator_id=llama7b-2-ep2
# The assignment stays unquoted so the leading ~ is expanded here.
model_name_or_path=~/models/game24/generators/${generator_id}

# Expansions are quoted so a path containing whitespace stays a single argument.
accelerate launch \
  --main_process_port=20659 \
  generate_paths_and_eval.py \
  --model_name_or_path "${model_name_or_path}" \
  --dataset game24 \
  --data_dir data/game24 \
  --output_dir eval_results/game24/generator \
  --metric_output_dir eval_results/game24/generator \
  --target_set mid \
  --batch_size 32 \
  --do_sample False \
  --max_new_tokens 400 \
  --seed 42
19 |
--------------------------------------------------------------------------------
/scripts/game24/train_generator.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Launches train_generator.py through `accelerate launch` (configs/zero1.yaml)
# on the game24 "train" split.

# Weights & Biases settings; the key and entity are left empty in this script.
export WANDB_API_KEY=
export WANDB_PROJECT=GAME24-Generator-Finetune
export WANDB_ENTITY=

model_name_or_path=meta-llama/Llama-2-7b-hf
save_generator_id=llama7b-2-ep2

# The assignment stays unquoted so the leading ~ is expanded here.
save_dir=~/models/game24/generators/${save_generator_id}/
export WANDB_NAME="${save_generator_id}"

# Expansions are quoted so a path containing whitespace stays a single argument.
accelerate launch \
  --config_file ./configs/zero1.yaml \
  --main_process_port=20650 \
  train_generator.py \
  --model_name_or_path "${model_name_or_path}" \
  --dataset game24 \
  --data_dir data/game24 \
  --target_set train \
  --save_dir "${save_dir}" \
  --num_train_epoches 2 \
  --eval_steps 200 \
  --per_device_train_batch_size 16 \
  --per_device_eval_batch_size 4 \
  --gradient_accumulation_steps 1 \
  --gradient_checkpointing True \
  --learning_rate 1e-5 \
  --weight_decay 0 \
  --lr_scheduler_type "linear" \
  --warmup_steps 0 \
  --save_steps 200 \
  --save_best False \
  --save_total_limit 0 \
  --logging_dir ./wandb \
  --logging_steps 8 \
  --seed 42
42 |
43 |
--------------------------------------------------------------------------------
/scripts/game24/train_verifier.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Launches train_verifier.py through `accelerate launch` (configs/zero1.yaml),
# starting from the generator checkpoint and reading data/game24/model_generation.

# Weights & Biases settings; the key and entity are left empty in this script.
export WANDB_API_KEY=
export WANDB_PROJECT=GAME24-Verifier
export WANDB_ENTITY=

n_solution=100
generator_id=llama7b-2-ep2
save_verifier_id=n${n_solution}-scahead-mse-lm-token-bsz128-ep10

# Assignments stay unquoted so the leading ~ is expanded here.
checkpoint_dir=~/models/game24/generators/${generator_id}

final_id=${generator_id}-${save_verifier_id}
save_dir=~/models/game24/verifiers/${final_id}
export WANDB_NAME="${final_id}"

# Expansions are quoted so a path containing whitespace stays a single argument.
accelerate launch \
  --config_file ./configs/zero1.yaml \
  --main_process_port=20104 \
  train_verifier.py \
  --model_name_or_path "${checkpoint_dir}" \
  --data_dir data/game24/model_generation \
  --target_set train \
  --save_dir "${save_dir}" \
  --generator_id "${generator_id}" \
  --dedup True \
  --per_problem_sampling_solution "${n_solution}" \
  --loss_level token \
  --loss_on_llm True \
  --num_train_epoches 10 \
  --eval_steps 1000 \
  --per_device_train_batch_size 16 \
  --per_device_eval_batch_size 64 \
  --gradient_accumulation_steps 1 \
  --gradient_checkpointing True \
  --learning_rate 1e-5 \
  --weight_decay 0 \
  --lr_scheduler_type "linear" \
  --warmup_steps 0 \
  --save_epoches 2 \
  --save_best False \
  --save_total_limit 10 \
  --logging_dir ./wandb \
  --logging_steps 20 \
  --seed 42
50 |
51 |
--------------------------------------------------------------------------------
/scripts/game24/train_verifier_process.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Launches train_verifier.py with `--process True`; the save id carries a
# "prm-" prefix so it does not collide with the run from train_verifier.sh.

# Weights & Biases settings; the key and entity are left empty in this script.
export WANDB_API_KEY=
export WANDB_PROJECT=GAME24-Verifier
export WANDB_ENTITY=

n_solution=100
generator_id=llama7b-2-ep2
save_verifier_id=n${n_solution}-scahead-mse-lm-token-bsz128-ep10

# Assignments stay unquoted so the leading ~ is expanded here.
checkpoint_dir=~/models/game24/generators/${generator_id}

final_id=prm-${generator_id}-${save_verifier_id}
save_dir=~/models/game24/verifiers/${final_id}
export WANDB_NAME="${final_id}"

# Expansions are quoted so a path containing whitespace stays a single argument.
accelerate launch \
  --config_file ./configs/zero1.yaml \
  --main_process_port=20104 \
  train_verifier.py \
  --model_name_or_path "${checkpoint_dir}" \
  --data_dir data/game24/model_generation \
  --target_set train \
  --save_dir "${save_dir}" \
  --generator_id "${generator_id}" \
  --dedup True \
  --per_problem_sampling_solution "${n_solution}" \
  --loss_level token \
  --loss_on_llm True \
  --process True \
  --num_train_epoches 10 \
  --eval_steps 1000 \
  --per_device_train_batch_size 16 \
  --per_device_eval_batch_size 64 \
  --gradient_accumulation_steps 1 \
  --gradient_checkpointing True \
  --learning_rate 1e-5 \
  --weight_decay 0 \
  --lr_scheduler_type "linear" \
  --warmup_steps 0 \
  --save_epoches 2 \
  --save_best False \
  --save_total_limit 10 \
  --logging_dir ./wandb \
  --logging_steps 20 \
  --seed 42
51 |
52 |
--------------------------------------------------------------------------------
/scripts/gsm8k/eval_step_beam.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Launches eval_generator_by_step.py in `beam` inference mode on the gsm8k
# "test" split, passing both a generator and a verifier checkpoint.

generator_id=llama7b-2-ep2
verifier_id=n100-scahead-mse-lm-token

n_beam=4
n_sampling_steps=20

# Assignments stay unquoted so the leading ~ is expanded here.
model_name_or_path=~/models/gsm8k/generators/${generator_id}
verifier_model_name_or_path=~/models/gsm8k/verifiers/${generator_id}-${verifier_id}

# Expansions are quoted so a path containing whitespace stays a single argument.
accelerate launch \
  --main_process_port=20659 \
  eval_generator_by_step.py \
  --model_name_or_path "${model_name_or_path}" \
  --verifier_model_name_or_path "${verifier_model_name_or_path}" \
  --dataset gsm8k \
  --data_dir data/gsm8k \
  --output_dir eval_results/gsm8k/generator_with_verifier \
  --target_set test \
  --inference_mode beam \
  --batch_size 26 \
  --vs_batch_size 64 \
  --n_beam "${n_beam}" \
  --n_sampling_steps "${n_sampling_steps}" \
  --max_n_step 10 \
  --max_step_length 100 \
  --dedup_mode 0 \
  --do_sample True \
  --temperature 0.7 \
  --top_k 50 \
  --top_p 1.0 \
  --max_new_tokens 400 \
  --seed 42
37 |
--------------------------------------------------------------------------------
/scripts/gsm8k/eval_with_verifier.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Launches eval_with_verifier.py through `accelerate launch` on the gsm8k "test" split.

generator_id=llama7b-2-ep2
verifier_id=n100-scahead-mse-lm-token
# The assignment stays unquoted so the leading ~ is expanded here.
verifier_model_name_or_path=~/models/gsm8k/verifiers/${generator_id}-${verifier_id}

# Expansions are quoted so a path containing whitespace stays a single argument.
accelerate launch \
  --main_process_port=29510 \
  eval_with_verifier.py \
  --model_name_or_path "${verifier_model_name_or_path}" \
  --data_dir data/gsm8k/model_generation \
  --verifier_output_dir eval_results/gsm8k/verifier \
  --generator_metric_dir eval_results/gsm8k/generator_with_verifier \
  --generator_id "${generator_id}" \
  --target_set test \
  --batch_size 64 \
  --seed 42
19 |
20 |
--------------------------------------------------------------------------------
/scripts/gsm8k/generate.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Launches generate_paths_and_eval.py through `accelerate launch` on the gsm8k
# "train" split, sampling ${n_solutions} solutions per problem.

generator_id=llama7b-2-ep2
n_solutions=100

# The assignment stays unquoted so the leading ~ is expanded here.
model_name_or_path=~/models/gsm8k/generators/${generator_id}

# Expansions are quoted so a path containing whitespace stays a single argument.
accelerate launch \
  --main_process_port=20658 \
  generate_paths_and_eval.py \
  --model_name_or_path "${model_name_or_path}" \
  --dataset gsm8k \
  --data_dir data/gsm8k \
  --output_dir data/gsm8k/model_generation \
  --metric_output_dir eval_results/gsm8k/generator \
  --target_set train \
  --n_solutions "${n_solutions}" \
  --batch_size 16 \
  --do_sample True \
  --temperature 0.7 \
  --top_k 50 \
  --top_p 1.0 \
  --max_new_tokens 400
24 |
--------------------------------------------------------------------------------
/scripts/gsm8k/greedy_eval.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Launches generate_paths_and_eval.py with sampling disabled (--do_sample False)
# on the gsm8k "test" split.

generator_id=llama7b-2-ep2
# The assignment stays unquoted so the leading ~ is expanded here.
model_name_or_path=~/models/gsm8k/generators/${generator_id}

# Expansions are quoted so a path containing whitespace stays a single argument.
accelerate launch \
  --main_process_port=20659 \
  generate_paths_and_eval.py \
  --model_name_or_path "${model_name_or_path}" \
  --dataset gsm8k \
  --data_dir data/gsm8k \
  --output_dir eval_results/gsm8k/generator \
  --metric_output_dir eval_results/gsm8k/generator \
  --target_set test \
  --batch_size 32 \
  --do_sample False \
  --max_new_tokens 400 \
  --seed 42
19 |
--------------------------------------------------------------------------------
/scripts/gsm8k/train_generator.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Launches train_generator.py through `accelerate launch` (configs/zero1.yaml)
# on the gsm8k "train" split.

# Weights & Biases settings; the key and entity are left empty in this script.
export WANDB_API_KEY=
export WANDB_PROJECT=GSM8K-Generator-Finetune
export WANDB_ENTITY=

model_name_or_path=meta-llama/Llama-2-7b-hf
save_generator_id=llama7b-2-ep2

# The assignment stays unquoted so the leading ~ is expanded here.
save_dir=~/models/gsm8k/generators/${save_generator_id}/
export WANDB_NAME="${save_generator_id}"

# Expansions are quoted so a path containing whitespace stays a single argument.
accelerate launch \
  --config_file ./configs/zero1.yaml \
  --main_process_port=20650 \
  train_generator.py \
  --model_name_or_path "${model_name_or_path}" \
  --dataset gsm8k \
  --data_dir data/gsm8k \
  --target_set train \
  --save_dir "${save_dir}" \
  --num_train_epoches 2 \
  --eval_steps 200 \
  --per_device_train_batch_size 16 \
  --per_device_eval_batch_size 4 \
  --gradient_accumulation_steps 1 \
  --gradient_checkpointing True \
  --learning_rate 1e-5 \
  --weight_decay 0 \
  --lr_scheduler_type "linear" \
  --warmup_steps 0 \
  --save_steps 200 \
  --save_best False \
  --save_total_limit 0 \
  --logging_dir ./wandb \
  --logging_steps 8 \
  --seed 42
42 |
43 |
--------------------------------------------------------------------------------
/scripts/gsm8k/train_verifier.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Launches train_verifier.py through `accelerate launch` (configs/zero1.yaml),
# starting from the generator checkpoint and reading data/gsm8k/model_generation.

# Weights & Biases settings; the key and entity are left empty in this script.
export WANDB_API_KEY=
export WANDB_PROJECT=GSM8K-Verifier
export WANDB_ENTITY=

n_solution=100
generator_id=llama7b-2-ep2
save_verifier_id=n${n_solution}-scahead-mse-lm-token

# Assignments stay unquoted so the leading ~ is expanded here.
checkpoint_dir=~/models/gsm8k/generators/${generator_id}

# The save directory and run name are derived from final_id. The previous
# version referenced an undefined ${experimentID}, which expanded to nothing
# and produced ".../verifiers/<generator_id>-" instead of the
# "<generator_id>-<verifier_id>" path that the gsm8k eval scripts load.
final_id=${generator_id}-${save_verifier_id}
save_dir=~/models/gsm8k/verifiers/${final_id}
export WANDB_NAME="${final_id}"

# Expansions are quoted so a path containing whitespace stays a single argument.
accelerate launch \
  --config_file ./configs/zero1.yaml \
  --main_process_port=20104 \
  train_verifier.py \
  --model_name_or_path "${checkpoint_dir}" \
  --data_dir data/gsm8k/model_generation \
  --target_set train \
  --save_dir "${save_dir}" \
  --generator_id "${generator_id}" \
  --dedup True \
  --per_problem_sampling_solution "${n_solution}" \
  --loss_level token \
  --loss_on_llm True \
  --num_train_epoches 1 \
  --eval_steps 1000 \
  --per_device_train_batch_size 64 \
  --per_device_eval_batch_size 64 \
  --gradient_accumulation_steps 1 \
  --gradient_checkpointing True \
  --learning_rate 1e-5 \
  --weight_decay 0 \
  --lr_scheduler_type "linear" \
  --warmup_steps 0 \
  --save_epoches 1 \
  --save_best False \
  --save_total_limit 0 \
  --logging_dir ./wandb \
  --logging_steps 20 \
  --seed 42
50 |
51 |
--------------------------------------------------------------------------------
/utils/constants.py:
--------------------------------------------------------------------------------
# presumably the label value skipped by the loss — confirm at the use sites
IGNORE_INDEX = -100

# NOTE(review): all four default special tokens are empty strings here. This may
# be an extraction artifact (angle-bracket tokens stripped) — confirm against
# the upstream file before relying on these values.
DEFAULT_PAD_TOKEN = ""
DEFAULT_BOS_TOKEN = ""
DEFAULT_EOS_TOKEN = ""
DEFAULT_UNK_TOKEN = ""

# Token ids, annotated with the text pieces given in the original inline comments.
LLAMA_EQUALS_TOKENS = {353, 3892, 29922, 10457}  # _=, )=, =, =-
LLAMA_LEFTMARK_TOKENS = {3532, 9314}  # <<, _<<
LLAMA_RIGHTMARK_TOKEN = 6778  # >>
LLAMA_NEWLINE_TOKEN = 13  # \n
11 |
12 |
--------------------------------------------------------------------------------
/utils/flash_attn_monkey_patch.py:
--------------------------------------------------------------------------------
1 | # refer to https://github.com/lm-sys/FastChat/blob/main/fastchat/train/llama2_flash_attn_monkey_patch.py
2 |
3 | import warnings
4 | from typing import Optional, Tuple
5 |
6 | import torch
7 | from flash_attn import __version__ as flash_attn_version
8 | from flash_attn.bert_padding import pad_input, unpad_input
9 | from flash_attn.flash_attn_interface import (
10 | flash_attn_func,
11 | flash_attn_varlen_kvpacked_func,
12 | )
13 | from transformers.models.llama.modeling_llama import (
14 | LlamaAttention,
15 | LlamaModel,
16 | rotate_half,
17 | )
18 | from transformers.models.mistral.modeling_mistral import (
19 | MistralAttention,
20 | MistralModel,
21 | rotate_half,
22 | )
23 |
24 |
25 |
def apply_rotary_pos_emb(q, k, cos_sin, position_ids):
    """Apply rotary position embeddings to ``q`` and ``k``.

    ``cos_sin`` is a pair of tables (cos, sin). Each table is transposed on
    dims 1/2, repeated along the batch dimension and gathered at
    ``position_ids`` before being combined with ``q``/``k`` and their
    ``rotate_half`` counterparts.
    """
    reference = cos_sin[0]
    # [bsz, seq_len, 1, 1] expanded to the gathered table's trailing dimensions
    index = position_ids[:, :, None, None].repeat(
        1, 1, reference.shape[1], reference.shape[3]
    )
    batch = index.shape[0]

    def _select(table):
        return torch.gather(table.transpose(1, 2).repeat(batch, 1, 1, 1), 1, index)

    cos, sin = [_select(table) for table in cos_sin]
    rotated_q = (q * cos) + (rotate_half(q) * sin)
    rotated_k = (k * cos) + (rotate_half(k) * sin)
    return rotated_q, rotated_k
38 |
39 |
def forward(
    self,
    hidden_states: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.Tensor] = None,
    past_key_value: Optional[Tuple[torch.Tensor]] = None,
    output_attentions: bool = False,
    use_cache: bool = False,
    padding_mask: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
    """Flash-attention replacement for the attention module's ``forward``.

    Args:
        hidden_states: input of size (bsz, q_len, hidden).
        attention_mask: optional mask used to unpad the sequences; when
            ``None`` the dense ``flash_attn_func`` path is taken.
        position_ids: positions used to gather the rotary tables.
        past_key_value: optional cached (key, value) pair; index 2 of each
            tensor is read as the cached sequence length.
        output_attentions: not supported; a warning is emitted when set.
        use_cache: when true, the updated (key, value) pair is returned.
        padding_mask: accepted for signature compatibility; unused here.

    Returns:
        ``(output, None, past_key_value)`` where the attention weights slot is
        always ``None``.
    """
    if output_attentions:
        warnings.warn(
            "Output attentions is not supported for patched `LlamaAttention`, returning `None` instead."
        )

    bsz, q_len, _ = hidden_states.size()
    kv_heads = getattr(self, "num_key_value_heads", self.num_heads)

    q, k, v = (
        op(hidden_states).view(bsz, q_len, nh, self.head_dim)
        for op, nh in (
            (self.q_proj, self.num_heads),
            (self.k_proj, kv_heads),
            (self.v_proj, kv_heads),
        )
    )
    # shape: (b, s, num_heads, head_dim)

    kv_seq_len = k.shape[1]
    past_kv_len = 0
    if past_key_value is not None:
        past_kv_len = past_key_value[0].shape[2]
        kv_seq_len += past_kv_len

    cos_sin = self.rotary_emb(v, seq_len=kv_seq_len)
    q, k = apply_rotary_pos_emb(q, k, cos_sin, position_ids)

    if past_key_value is not None:
        # Compare versions numerically: a plain string comparison orders
        # "10.0.0" before "2.1.0" and would reject a newer release.
        installed = tuple(int(part) for part in flash_attn_version.split(".")[:2])
        assert installed >= (
            2,
            1,
        ), "past_key_value support requires flash-attn >= 2.1.0"
        # reuse k, v
        k = torch.cat([past_key_value[0].transpose(1, 2), k], dim=1)
        v = torch.cat([past_key_value[1].transpose(1, 2), v], dim=1)

    # The cache is handed back with dims 1 and 2 swapped relative to q/k/v here.
    past_key_value = (k.transpose(1, 2), v.transpose(1, 2)) if use_cache else None

    if attention_mask is None:
        output = flash_attn_func(q, k, v, 0.0, softmax_scale=None, causal=True).view(
            bsz, q_len, -1
        )
    else:
        q, indices, cu_q_lens, max_s = unpad_input(q, attention_mask[:, -q_len:])
        # We can skip concat and call unpad twice but seems better to call unpad only once.
        kv, _, cu_k_lens, max_k = unpad_input(
            torch.stack((k, v), dim=2), attention_mask
        )
        output_unpad = flash_attn_varlen_kvpacked_func(
            q,
            kv,
            cu_q_lens,
            cu_k_lens,
            max_s,
            max_k,
            0.0,
            softmax_scale=None,
            causal=True,
        )
        output_unpad = output_unpad.reshape(-1, self.num_heads * self.head_dim)
        output = pad_input(output_unpad, indices, bsz, q_len)

    return self.o_proj(output), None, past_key_value
112 |
113 |
114 | # Disable the transformation of the attention mask in LlamaModel as flash attention
115 | # takes a boolean key_padding_mask. Fills in the past kv length for use in forward.
116 | def _prepare_decoder_attention_mask(
117 | self, attention_mask, input_shape, inputs_embeds, past_key_values_length
118 | ):
119 |
120 | if attention_mask is not None and torch.all(attention_mask):
121 | return None # This uses the faster call when training with full samples
122 |
123 | return attention_mask
124 |
125 |
def replace_llama_attn_with_flash_attn():
    """Monkey-patch ``LlamaModel``/``LlamaAttention`` with the flash-attention versions.

    Emits a warning (but still patches) when the current CUDA device reports a
    major compute capability below 8.
    """
    major, _minor = torch.cuda.get_device_capability()
    too_old = major < 8
    if too_old:
        warnings.warn(
            "Flash attention is only supported on A100 or H100 GPU during training due to head dim > 64 backward."
            "ref: https://github.com/HazyResearch/flash-attention/issues/190#issuecomment-1523359593"
        )

    LlamaModel._prepare_decoder_attention_mask = _prepare_decoder_attention_mask
    LlamaAttention.forward = forward
136 |
137 |
--------------------------------------------------------------------------------
/utils/game24/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FreedomIntelligence/OVM/e0a68c253568666dea61ef6dac910f9baa59ebbb/utils/game24/__init__.py
--------------------------------------------------------------------------------
/utils/game24/decoding.py:
--------------------------------------------------------------------------------
1 | import re
2 | import sympy
3 | from typing import List
4 |
5 |
def extract_expression(response: str):
    """Pull the final expression out of a completion.

    Takes the last line of the stripped response, lower-cases it, keeps the
    text after the last ``'the answer is '`` and before the first ``'='``,
    and strips surrounding whitespace.
    """
    last_line = response.strip().rpartition('\n')[2]
    after_marker = last_line.lower().rpartition('the answer is ')[2]
    left_side = after_marker.partition('=')[0]
    return left_side.strip()
8 |
9 |
def extract_expressions(responses: List[str]):
    """Apply ``extract_expression`` to every response and return the results as a list."""
    return list(map(extract_expression, responses))
12 |
13 |
# refer to https://github.com/princeton-nlp/tree-of-thought-llm/blob/master/src/tot/tasks/game24.py
def get_answer_label(expression: str, question: str):
    """Check whether ``expression`` uses exactly the question's numbers and equals 24.

    Returns False when the sorted digit runs of the two strings differ, or
    when sympy raises while simplifying the expression.
    """
    used = sorted(re.findall(r'\d+', expression))
    given = sorted(re.findall(r'\d+', question))
    if used == given:
        try:
            return sympy.simplify(expression) == 24
        except Exception:
            return False
    return False
26 |
27 |
28 |
29 |
--------------------------------------------------------------------------------
/utils/game24/metrics.py:
--------------------------------------------------------------------------------
1 | import os
2 | import torch
3 | import numpy as np
4 | from typing import Optional, List, Dict, Set, Any, Union
5 | import torch.distributed as dist
6 | import re
7 | from utils.game24.decoding import extract_expressions, get_answer_label
8 |
9 |
10 |
class GeneratorAnswerAcc:
    """Accumulates per-sample correctness of Game24 completions and reports accuracy."""

    def __init__(self, n_data: int):
        # Number of samples kept after gathering; extra entries are truncated.
        self.n_data = n_data

        self.world_size = dist.get_world_size() if dist.is_initialized() else 1

        # One list of correctness flags per call until get_metric flattens them.
        self.corrs = []
        # True once self.corrs has been gathered/flattened by get_metric.
        self.gather = False

    @torch.inference_mode(mode=True)
    def __call__(self, completions: List[str], questions: List[str]):
        """Score one batch of completions against their questions."""
        expressions = extract_expressions(completions)

        # float(...) wraps the whole comparison, matching the gsm8k metric, so
        # the stored flags are floats rather than bools.
        corrs = [
            float(get_answer_label(expression, question) == True)
            for expression, question in zip(expressions, questions)
        ]

        self.corrs.append(corrs)

    def get_metric(self, reset=True):
        """Return the mean correctness over the collected samples.

        Args:
            reset: when true, clear the accumulated state afterwards.
        """
        if not self.gather:
            if self.world_size != 1:
                gathered_corrs = [None] * self.world_size
                dist.all_gather_object(gathered_corrs, self.corrs)

                # Interleave the ranks batch by batch.
                flatten_corrs = []
                for corrs_gpus in zip(*gathered_corrs):
                    for corrs in corrs_gpus:
                        flatten_corrs.extend(corrs)
            else:
                flatten_corrs = [item for sublist in self.corrs for item in sublist]

            self.corrs = flatten_corrs[:self.n_data]
            self.gather = True

        acc = (sum(self.corrs) / len(self.corrs))

        if reset:
            self.corrs = []
            self.gather = False
        return acc
54 |
55 |
class MultiSamplingAnswerAcc:
    """Collects extracted expressions over several sampling rounds and reports
    pass@k and majority-vote accuracy."""

    def __init__(self, n_data: int = None):
        # Number of samples kept per round after gathering (None keeps all).
        self.n_data = n_data

        self.world_size = dist.get_world_size() if dist.is_initialized() else 1

        # One entry per finished round: a flat list over the questions.
        self.answers = []
        self.questions = []

    def start_new_sol_epoch(self):
        """Reset the per-round buffers before a new sampling round."""
        self.cur_answers = []
        self.cur_questions = []

    def end_the_sol_epoch(self):
        """Gather the current round (across ranks if needed) and store it."""
        if self.world_size != 1:
            gathered_answers = [None] * self.world_size
            gathered_questions = [None] * self.world_size
            dist.all_gather_object(gathered_answers, self.cur_answers)
            dist.all_gather_object(gathered_questions, self.cur_questions)

            # Interleave the ranks batch by batch.
            flatten_answers, flatten_questions = [], []
            for answers_gpus, questions_gpus in zip(zip(*gathered_answers), zip(*gathered_questions)):
                for answers, questions in zip(answers_gpus, questions_gpus):
                    flatten_answers.extend(answers)
                    flatten_questions.extend(questions)
        else:
            flatten_answers = [item for sublist in self.cur_answers for item in sublist]
            flatten_questions = [item for sublist in self.cur_questions for item in sublist]

        self.answers.append(flatten_answers[:self.n_data])
        self.questions.append(flatten_questions[:self.n_data])

    @torch.inference_mode(mode=True)
    def __call__(self, completions: List[str], questions: List[str]):
        """Record one batch of completions for the current round."""
        expressions = extract_expressions(completions)

        self.cur_answers.append(expressions)
        self.cur_questions.append(questions)

    def get_metric(self, n_solution: int=3, reset=True):
        """Return ``(pass_k, acc_majority)`` over the first ``n_solution`` rounds.

        Args:
            n_solution: number of sampling rounds to take into account.
            reset: when true, clear the stored rounds afterwards.
        """
        # self.answers is stored as [n_solution, n_question]; transpose it so
        # each entry holds the n_solution candidates of ONE question. Without
        # this, a whole round of answers was paired with a single question.
        answers = list(zip(*self.answers[:n_solution]))
        # [n_question]
        questions = self.questions[:n_solution][0]

        pass_k = np.mean([is_passk(expressions, question) for expressions, question in zip(answers, questions)])
        acc_majority = np.mean([is_majority(expressions, question) for expressions, question in zip(answers, questions)])

        if reset:
            self.answers = []
            self.questions = []
        return pass_k, acc_majority
113 |
114 |
def is_passk(expressions, question):
    """Return True if at least one expression is labelled correct for the question."""
    for candidate in expressions:
        if get_answer_label(candidate, question):
            return True
    return False
117 |
118 |
def is_majority(expressions, question):
    """Label the expression whose semantic signature occurs most often.

    Ties are resolved in favour of the signature that appears first.
    """
    signatures = list(map(get_semantics, expressions))
    winner = max(signatures, key=signatures.count)
    chosen = expressions[signatures.index(winner)]
    return get_answer_label(chosen, question)
124 |
125 |
def get_semantics(expression):
    """Build an order-insensitive signature of an arithmetic expression.

    The signature is a tuple of the sorted digit runs, the sorted operator
    characters and a ``value=...`` entry holding the evaluated result
    (``None`` when evaluation fails; a trailing ``.0`` is dropped).
    """
    operands = sorted(re.findall(r'\d+', expression))
    operators = sorted(re.findall(r'[+\-\*\/]', expression))

    # NOTE(review): eval() runs on text extracted from model output; consider a
    # restricted arithmetic evaluator if this input is not trusted.
    try:
        result = eval(expression)
    except BaseException:
        result = None

    rendered = str(result)
    if rendered.endswith('.0'):
        rendered = rendered[:-2]
    return tuple(operands + operators + [f'value={rendered}'])
139 |
140 |
141 |
--------------------------------------------------------------------------------
/utils/gsm8k/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FreedomIntelligence/OVM/e0a68c253568666dea61ef6dac910f9baa59ebbb/utils/gsm8k/__init__.py
--------------------------------------------------------------------------------
/utils/gsm8k/decoding.py:
--------------------------------------------------------------------------------
1 | from contextlib import contextmanager
2 | import signal
3 | import torch
4 | import json
5 | import os
6 | import re
7 |
8 |
9 | # ANS_RE = re.compile(r"#### (\-?[0-9\.\,]+)")
10 | ANS_RE = re.compile(r"The answer is (\-?[0-9\.\,]+)")
11 | INVALID_ANS = "[invalid]"
12 |
13 |
14 |
def extract_answer(completion):
    """Extract the standardized numeric answer string from a completion.

    Returns ``INVALID_ANS`` when ``ANS_RE`` does not match or when the
    standardized string cannot be evaluated.
    """
    found = ANS_RE.search(completion)
    if not found:
        return INVALID_ANS

    candidate = standardize_value_str(found.group(1).strip())
    # eval() is used only as a validity check; ANS_RE limits the captured text
    # to digits, '.', ',' and a leading '-'.
    try:
        eval(candidate)
    except BaseException:
        return INVALID_ANS
    return candidate
23 |
def extract_answers(completions):
    """Apply ``extract_answer`` to every completion and return the results as a list."""
    return list(map(extract_answer, completions))
26 |
def standardize_value_str(x):
    """Standardize numerical values

    Removes thousands separators, trims trailing zeros after a decimal point,
    adds a leading zero to values starting with '.', and converts a trailing
    percent sign into a fraction. Returns ``INVALID_ANS`` for an empty result.
    """
    text = x.replace(",", "")
    if '.' in text:
        text = text.rstrip('0')
        if text.endswith('.'):
            text = text[:-1]
    if len(text) == 0:
        return INVALID_ANS
    if text.startswith('.'):
        text = '0' + text
    if text.endswith('%'):
        text = str(eval(text[:-1]) / 100)
    return text.rstrip('.')
41 |
def get_answer_label(response_answer, gt):
    """Return ``INVALID_ANS`` for an invalid answer, else whether it equals ``gt``."""
    return INVALID_ANS if response_answer == INVALID_ANS else response_answer == gt
46 |
47 |
48 |
# taken from
# https://stackoverflow.com/questions/492519/timeout-on-a-function-call
@contextmanager
def timeout(duration, formula):
    """Raise an exception if the managed block runs longer than ``duration`` seconds.

    Installs a SIGALRM handler and arms ``signal.alarm(duration)``. The alarm
    is cancelled when the block exits, including when it exits by raising.

    Args:
        duration: timeout in whole seconds.
        formula: text included in the timeout error message.
    """
    def timeout_handler(signum, frame):
        raise Exception(f"'{formula}': timed out after {duration} seconds")

    signal.signal(signal.SIGALRM, timeout_handler)
    signal.alarm(duration)
    try:
        yield
    finally:
        # Always disarm: without this, an exception in the block left the alarm
        # pending and it could fire later in unrelated code.
        signal.alarm(0)
60 |
61 |
def eval_with_timeout(formula, max_time=3):
    """Evaluate ``formula`` under a time limit and round the result to 4 digits.

    Returns None (after printing a warning and cancelling the alarm) when the
    evaluation raises or times out.
    """
    try:
        with timeout(max_time, formula):
            return round(eval(formula), ndigits=4)
    except Exception as err:
        signal.alarm(0)
        message = f"Warning: Failed to eval {formula}, exception: {err}"
        print(message)
    return None
70 |
71 |
# refer to https://github.com/openai/grade-school-math/blob/master/grade_school_math/calculator.py
def use_calculator(sample):
    """Evaluate the left-hand side of an unfinished ``<<lhs=`` annotation.

    Returns None when there is no ``<<``, when the last annotation is already
    closed with ``>>``, when it has no ``=``, or when the left-hand side
    contains characters other than digits, ``*+-/.()``. If the text ends with
    ``-`` and the result is negative, the result's sign is flipped.
    """
    if "<<" not in sample:
        return None

    tail = sample.split("<<")[-1]
    if ">>" in tail or "=" not in tail:
        return None

    expr = tail.split("=")[0].replace(",", "")
    allowed = "0123456789*+-/.()"
    if not all(ch in allowed for ch in expr):
        return None

    result = eval_with_timeout(expr)
    if tail[-1] == '-' and result is not None and result < 0:
        result = -result
    return result
91 |
92 |
93 |
94 |
95 |
96 |
97 |
--------------------------------------------------------------------------------
/utils/gsm8k/metrics.py:
--------------------------------------------------------------------------------
1 | import os
2 | import torch
3 | import numpy as np
4 | from typing import Optional, List, Dict, Set, Any, Union
5 | import torch.distributed as dist
6 | from utils.gsm8k.decoding import INVALID_ANS, extract_answers, get_answer_label
7 |
8 |
9 |
class GeneratorAnswerAcc:
    """Accumulates per-sample correctness of GSM8K completions and reports accuracy."""

    def __init__(self, n_data: int):
        self.n_data = n_data
        self.world_size = dist.get_world_size() if dist.is_initialized() else 1
        # One list of correctness flags per call until get_metric flattens them.
        self.corrs = []
        # True once self.corrs has been gathered/flattened by get_metric.
        self.gather = False

    @torch.inference_mode(mode=True)
    def __call__(self, completions: List[str], gts: List[str]):
        """Score one batch of completions against the ground-truth answers."""
        predicted = extract_answers(completions)
        batch_flags = []
        for answer, gt in zip(predicted, gts):
            # An invalid-answer label is not True, so it scores 0.0.
            batch_flags.append(float(get_answer_label(answer, gt) == True))
        self.corrs.append(batch_flags)

    def get_metric(self, reset=True):
        """Return the mean correctness; clear the state afterwards when ``reset``."""
        if not self.gather:
            if self.world_size == 1:
                merged = []
                for batch in self.corrs:
                    merged.extend(batch)
            else:
                per_rank = [None] * self.world_size
                dist.all_gather_object(per_rank, self.corrs)

                # Interleave the ranks batch by batch.
                merged = []
                for step_batches in zip(*per_rank):
                    for batch in step_batches:
                        merged.extend(batch)

            self.corrs = merged[:self.n_data]
            self.gather = True

        acc = sum(self.corrs) / len(self.corrs)

        if reset:
            self.corrs = []
            self.gather = False
        return acc
53 |
54 |
class MultiSamplingAnswerAcc:
    """Collects numeric answers over several sampling rounds and reports
    pass@k and majority-vote accuracy."""

    def __init__(self, n_data: int = None):
        self.n_data = n_data
        self.world_size = dist.get_world_size() if dist.is_initialized() else 1
        # One entry per finished round: a flat list over the questions.
        self.answers = []
        self.gts = []

    def start_new_sol_epoch(self):
        """Reset the per-round buffers before a new sampling round."""
        self.cur_answers = []
        self.cur_gts = []

    def end_the_sol_epoch(self):
        """Gather the current round (across ranks if needed) and store it."""
        if self.world_size == 1:
            round_answers = [value for batch in self.cur_answers for value in batch]
            round_gts = [value for batch in self.cur_gts for value in batch]
        else:
            rank_answers = [None] * self.world_size
            rank_gts = [None] * self.world_size
            dist.all_gather_object(rank_answers, self.cur_answers)
            dist.all_gather_object(rank_gts, self.cur_gts)

            # Interleave the ranks batch by batch.
            round_answers, round_gts = [], []
            for answer_step, gt_step in zip(zip(*rank_answers), zip(*rank_gts)):
                for answer_batch, gt_batch in zip(answer_step, gt_step):
                    round_answers.extend(answer_batch)
                    round_gts.extend(gt_batch)

        self.answers.append(round_answers[:self.n_data])
        self.gts.append(round_gts[:self.n_data])

    @torch.inference_mode(mode=True)
    def __call__(self, completions: List[str], gts: List[str]):
        """Record one batch; invalid answers are stored as NaN."""
        extracted = extract_answers(completions)

        numeric = []
        for item in extracted:
            numeric.append(float(item) if item != INVALID_ANS else float('nan'))
        targets = [float(gt) for gt in gts]

        self.cur_answers.append(numeric)
        self.cur_gts.append(targets)

    def get_metric(self, n_solution: int=3, reset=True):
        """Return ``(pass_k, acc_majority)`` over the first ``n_solution`` rounds."""
        # Every round must have seen the same ground truths in the same order.
        assert all(x == self.gts[0] for x in self.gts)

        # [n_question]
        targets = np.array(self.gts[0])
        # [n_question, n_solution]
        candidates = np.stack(self.answers[:n_solution], axis=1)

        hits = candidates == targets.reshape((-1, 1))
        pass_k = hits.any(1).mean(0)
        votes = [is_majority(a, gt, ignore=float('nan')) for a, gt in zip(candidates, targets)]
        acc_majority = np.mean(votes)

        if reset:
            self.gts = []
            self.answers = []
        return pass_k, acc_majority
120 |
121 |
122 |
def is_passk(answers, gt):
    """Return whether the ground truth ``gt`` appears among ``answers``."""
    found = gt in answers
    return found
125 |
def is_majority(answers, gt, ignore = INVALID_ANS):
    """Return whether the most frequent non-ignored answer equals ``gt``.

    Args:
        answers: candidate answers for one question.
        gt: ground-truth answer.
        ignore: sentinel marking answers to drop before voting. A NaN sentinel
            is supported: NaN never compares equal to anything, so NaN
            candidates are detected by self-inequality instead of ``!=``.

    Returns:
        False when every candidate is ignored (there is nothing to vote on);
        otherwise the result of comparing the winning answer with ``gt``. Ties
        go to the answer that appears first.
    """
    ignore_is_nan = ignore != ignore

    def _keep(value):
        if ignore_is_nan:
            # value != value is true only for NaN
            return not (value != value)
        return value != ignore

    filter_answers = [x for x in answers if _keep(x)]
    if not filter_answers:
        return False
    final_answer = max(filter_answers, key=filter_answers.count)
    return final_answer == gt
130 |
131 |
132 |
--------------------------------------------------------------------------------
/utils/optim.py:
--------------------------------------------------------------------------------
1 | from transformers import AdamW
2 | from transformers import get_scheduler
3 | import transformers
4 | from typing import Optional, List, Dict, Set, Any, Union
5 | from dataclasses import dataclass
6 | import os
7 |
def get_optimizers(model: transformers.AutoModelForCausalLM, training_args: dataclass) -> Dict:
    """Build the AdamW optimizer and learning-rate scheduler for ``model``.

    Parameters whose name contains "bias" or "LayerNorm.weight" get a weight
    decay of 0.0; all others use ``training_args.weight_decay``.

    NOTE(review): the annotation says ``Dict`` but an ``(optimizer, scheduler)``
    tuple is returned.
    """
    no_decay = ("bias", "LayerNorm.weight")

    decayed, undecayed = [], []
    for name, param in model.named_parameters():
        exempt = any(marker in name for marker in no_decay)
        (undecayed if exempt else decayed).append(param)

    param_groups = [
        {"params": decayed, "weight_decay": training_args.weight_decay},
        {"params": undecayed, "weight_decay": 0.0},
    ]

    optim = AdamW(param_groups, lr=training_args.learning_rate)
    lr_scheduler = get_scheduler(
        training_args.lr_scheduler_type,
        optimizer=optim,
        num_warmup_steps=training_args.num_updating_warmup_steps,
        num_training_steps=training_args.num_updating_steps,
    )
    return optim, lr_scheduler
35 |
36 |
37 |
38 |
--------------------------------------------------------------------------------
/utils/states.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import os
3 | import json
4 | from dataclasses import dataclass
5 | import random
6 | import math
7 | import numpy as np
8 | from accelerate import Accelerator
9 |
10 |
def set_deepspeed_config(accelerator: Accelerator, training_args: dataclass):
    """Write the batch-size entries into the accelerator's DeepSpeed config.

    Sets ``train_micro_batch_size_per_gpu`` to the per-device train batch
    size and ``train_batch_size`` to that value multiplied by the
    ``WORLD_SIZE`` environment variable (default 1) and by
    ``accelerator.gradient_accumulation_steps``.
    """
    num_processes = int(os.environ.get("WORLD_SIZE", 1))
    ds_config = accelerator.state.deepspeed_plugin.deepspeed_config
    micro_batch = training_args.per_device_train_batch_size

    ds_config['train_micro_batch_size_per_gpu'] = micro_batch
    ds_config['train_batch_size'] = micro_batch * num_processes * accelerator.gradient_accumulation_steps
15 |
16 |
def set_training_states(data_module: dict, training_args: dataclass):
    """Populate every derived step counter on ``training_args``.

    The per-epoch counters are computed first from ``data_module``; the
    remaining setters then run in a fixed order because later ones read
    attributes written by earlier ones (total steps -> updating steps ->
    warmup steps).
    """
    set_num_steps_per_epoch(data_module, training_args)

    ordered_setters = (
        set_num_training_steps,
        set_num_updating_steps,
        set_num_eval_steps,
        set_per_eval_steps,
        set_num_warmup_steps,
        set_num_logging_steps,
        set_per_save_steps,
    )
    for apply_setter in ordered_setters:
        apply_setter(training_args)

    print(f"+ [Training States] There are {training_args.num_training_steps} steps in total.")
29 |
30 |
def set_num_steps_per_epoch(data_module: dict, training_args: dataclass):
    """Compute per-epoch step counts for one device and store them.

    The dataset lengths are split across ``WORLD_SIZE`` processes (default 1)
    and divided by the per-device batch sizes, rounding up. Optimizer-update
    steps are the training steps floor-divided by the gradient accumulation
    steps. When ``data_module["val_dataset"]`` is ``None`` the evaluation
    step count is stored as ``None``.
    """
    world_size = int(os.environ.get("WORLD_SIZE", 1))

    train_samples = math.ceil(len(data_module["train_dataset"]) / world_size)
    train_steps = math.ceil(train_samples / training_args.per_device_train_batch_size)
    update_steps = train_steps // training_args.gradient_accumulation_steps

    val_set = data_module["val_dataset"]
    if val_set is None:
        eval_steps = None
    else:
        eval_samples = math.ceil(len(val_set) / world_size)
        eval_steps = math.ceil(eval_samples / training_args.per_device_eval_batch_size)

    training_args.num_training_steps_per_epoch = train_steps
    training_args.num_updating_steps_per_epoch = update_steps
    training_args.num_eval_steps_per_epoch = eval_steps
44 |
def set_num_training_steps(training_args: dataclass):
    """Store the total number of training steps, per device and aggregated.

    ``max_steps`` is used when it is not -1; otherwise the total is the
    per-epoch step count times ``num_train_epoches`` (which must not be -1).
    The aggregated value multiplies the total by ``WORLD_SIZE`` (default 1).
    """
    total_steps = training_args.max_steps
    if total_steps == -1:
        # No explicit step budget: derive it from the epoch count.
        assert training_args.num_train_epoches != -1
        total_steps = training_args.num_training_steps_per_epoch * training_args.num_train_epoches

    world_size = int(os.environ.get("WORLD_SIZE", 1))
    training_args.num_training_steps = total_steps
    training_args.num_training_steps_aggr_devices = total_steps * world_size
55 |
def set_num_updating_steps(training_args: dataclass):
    """Store the number of optimizer updates, per device and aggregated.

    The update count is ``num_training_steps`` floor-divided by
    ``gradient_accumulation_steps``; the aggregated value multiplies it by
    ``WORLD_SIZE`` (default 1).
    """
    accumulation = training_args.gradient_accumulation_steps
    updates = training_args.num_training_steps // accumulation
    world_size = int(os.environ.get("WORLD_SIZE", 1))

    training_args.num_updating_steps = updates
    training_args.num_updating_steps_aggr_devices = updates * world_size
62 |
63 |
def set_num_eval_steps(training_args: dataclass):
    """Copy the per-epoch evaluation step count into ``num_eval_steps``."""
    steps_per_eval_pass = training_args.num_eval_steps_per_epoch
    training_args.num_eval_steps = steps_per_eval_pass
66 |
def set_per_eval_steps(training_args: dataclass):
    """Store the evaluation interval, in training steps, as ``per_eval_steps``.

    ``eval_steps`` is used when it is not -1; otherwise the interval is the
    per-epoch training step count times ``eval_epoches`` (which must not
    be -1).
    """
    interval = training_args.eval_steps
    if interval == -1:
        assert training_args.eval_epoches != -1
        interval = training_args.num_training_steps_per_epoch * training_args.eval_epoches

    training_args.per_eval_steps = interval
75 |
def set_num_warmup_steps(training_args: dataclass):
    """Store the warmup length in optimizer updates, per device and aggregated.

    ``warmup_steps`` is taken as-is when it is not -1; otherwise the warmup is
    ``int(num_updating_steps * warmup_ratio)`` (``warmup_ratio`` must not
    be -1). The aggregated value multiplies the result by ``WORLD_SIZE``
    (default 1).
    """
    # Warmup is counted directly in optimizer updates; an earlier variant
    # counted forward steps and divided by gradient_accumulation_steps.
    warmup_updates = training_args.warmup_steps
    if warmup_updates == -1:
        assert training_args.warmup_ratio != -1
        warmup_updates = int(training_args.num_updating_steps * training_args.warmup_ratio)

    world_size = int(os.environ.get("WORLD_SIZE", 1))
    training_args.num_updating_warmup_steps = warmup_updates
    training_args.num_updating_warmup_steps_aggr_devices = warmup_updates * world_size
93 |
def set_num_logging_steps(training_args: dataclass):
    """Store the logging interval, in training steps, as ``num_logging_steps``.

    ``logging_steps`` is used when it is not -1; otherwise the interval is the
    per-epoch training step count times ``logging_epoches`` (which must not
    be -1).
    """
    interval = training_args.logging_steps
    if interval == -1:
        assert training_args.logging_epoches != -1
        interval = training_args.num_training_steps_per_epoch * training_args.logging_epoches

    training_args.num_logging_steps = interval
102 |
def set_per_save_steps(training_args: dataclass):
    """Store the checkpoint interval, in training steps, as ``per_save_steps``.

    ``save_steps`` is used when it is not -1; otherwise the interval is the
    per-epoch training step count times ``save_epoches`` (which must not
    be -1).
    """
    interval = training_args.save_steps
    if interval == -1:
        assert training_args.save_epoches != -1
        interval = training_args.num_training_steps_per_epoch * training_args.save_epoches

    training_args.per_save_steps = interval
111 |
112 |
def set_random_seed(seed: int):
    """Seed Python's ``random``, NumPy and PyTorch (CPU and CUDA) generators.

    Args:
        seed: Value passed unchanged to every seeding function.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
        torch.cuda.manual_seed,
    )
    for seed_fn in seeders:
        seed_fn(seed)
119 |
120 |
121 |
--------------------------------------------------------------------------------