├── .gitignore ├── LICENSE ├── README.md ├── README_zh.md ├── URLs ├── dispatcher.py ├── gunicorn_conf.py ├── start_gunicorn.sh ├── transformers_url.py ├── transformers_url_m.py ├── vllm_url.py └── vllm_url_m.py ├── __init__.py ├── configs ├── make_config.py └── show_datasets.py ├── data_process.py ├── datasets ├── __init__.py ├── afqmc │ ├── config │ │ ├── afqmc_gen.json │ │ └── afqmc_ppl.json │ ├── make_dataset.py │ ├── transform_gen_v0.py │ ├── transform_gen_v1.py │ ├── transform_ppl_v0.py │ └── transform_ppl_v1.py ├── agieval │ ├── config │ │ ├── aqua-rat_gen.json │ │ ├── aqua-rat_ppl.json │ │ ├── gaokao-biology_gen.json │ │ ├── gaokao-biology_ppl.json │ │ ├── gaokao-chemistry_gen.json │ │ ├── gaokao-chemistry_ppl.json │ │ ├── gaokao-chinese_gen.json │ │ ├── gaokao-chinese_ppl.json │ │ ├── gaokao-english_gen.json │ │ ├── gaokao-english_ppl.json │ │ ├── gaokao-geography_gen.json │ │ ├── gaokao-geography_ppl.json │ │ ├── gaokao-history_gen.json │ │ ├── gaokao-history_ppl.json │ │ ├── gaokao-mathcloze_gen.json │ │ ├── gaokao-mathqa_gen.json │ │ ├── gaokao-mathqa_ppl.json │ │ ├── gaokao-physics_gen.json │ │ ├── jec-qa-ca_gen.json │ │ ├── jec-qa-kd_gen.json │ │ ├── logiqa-en_gen.json │ │ ├── logiqa-en_ppl.json │ │ ├── logiqa-zh_gen.json │ │ ├── logiqa-zh_ppl.json │ │ ├── lsat-ar_gen.json │ │ ├── lsat-ar_ppl.json │ │ ├── lsat-lr_gen.json │ │ ├── lsat-lr_ppl.json │ │ ├── lsat-rc_gen.json │ │ ├── lsat-rc_ppl.json │ │ ├── math_gen.json │ │ ├── sat-en-without-passage_gen.json │ │ ├── sat-en-without-passage_ppl.json │ │ ├── sat-en_gen.json │ │ ├── sat-en_ppl.json │ │ ├── sat-math_gen.json │ │ └── sat-math_ppl.json │ ├── make_dataset.py │ ├── transform_gen_v0.py │ └── transform_ppl_v0.py ├── arc-c │ ├── config │ │ ├── arc-c_gen.json │ │ └── arc-c_ppl.json │ ├── make_dataset.py │ ├── transform_gen_v0.py │ ├── transform_gen_v1.py │ ├── transform_ppl_v0.py │ └── transform_ppl_v1.py ├── arc-e │ ├── config │ │ ├── arc-e_gen.json │ │ └── arc-e_ppl.json │ ├── make_dataset.py │ 
├── transform_gen_v0.py │ ├── transform_gen_v1.py │ ├── transform_ppl_v0.py │ └── transform_ppl_v1.py ├── ax-b │ ├── config │ │ ├── ax-b_gen.json │ │ └── ax-b_ppl.json │ ├── make_dataset.py │ ├── transform_gen_v0.py │ ├── transform_gen_v1.py │ ├── transform_ppl_v0.py │ └── transform_ppl_v1.py ├── ax-g │ ├── config │ │ ├── ax-g_gen.json │ │ └── ax-g_ppl.json │ ├── make_dataset.py │ ├── transform_gen_v0.py │ ├── transform_gen_v1.py │ ├── transform_ppl_v0.py │ └── transform_ppl_v1.py ├── bbh-cot │ ├── config │ │ ├── boolean-expressions_gen.json │ │ ├── causal-judgement_gen.json │ │ ├── date-understanding_gen.json │ │ ├── disambiguation-qa_gen.json │ │ ├── dyck-languages_gen.json │ │ ├── formal-fallacies_gen.json │ │ ├── geometric-shapes_gen.json │ │ ├── hyperbaton_gen.json │ │ ├── logical-deduction-five-objects_gen.json │ │ ├── logical-deduction-seven-objects_gen.json │ │ ├── logical-deduction-three-objects_gen.json │ │ ├── movie-recommendation_gen.json │ │ ├── multistep-arithmetic-two_gen.json │ │ ├── navigate_gen.json │ │ ├── object-counting_gen.json │ │ ├── penguins-in-a-table_gen.json │ │ ├── reasoning-about-colored-objects_gen.json │ │ ├── ruin-names_gen.json │ │ ├── salient-translation-error-detection_gen.json │ │ ├── snarks_gen.json │ │ ├── sports-understanding_gen.json │ │ ├── temporal-sequences_gen.json │ │ ├── tracking-shuffled-objects-five-objects_gen.json │ │ ├── tracking-shuffled-objects-seven-objects_gen.json │ │ ├── tracking-shuffled-objects-three-objects_gen.json │ │ ├── web-of-lies_gen.json │ │ └── word-sorting_gen.json │ ├── cot-prompts │ │ ├── boolean_expressions.txt │ │ ├── causal_judgement.txt │ │ ├── date_understanding.txt │ │ ├── disambiguation_qa.txt │ │ ├── dyck_languages.txt │ │ ├── formal_fallacies.txt │ │ ├── geometric_shapes.txt │ │ ├── hyperbaton.txt │ │ ├── logical_deduction_five_objects.txt │ │ ├── logical_deduction_seven_objects.txt │ │ ├── logical_deduction_three_objects.txt │ │ ├── movie_recommendation.txt │ │ ├── 
multistep_arithmetic_two.txt │ │ ├── navigate.txt │ │ ├── object_counting.txt │ │ ├── penguins_in_a_table.txt │ │ ├── reasoning_about_colored_objects.txt │ │ ├── ruin_names.txt │ │ ├── salient_translation_error_detection.txt │ │ ├── snarks.txt │ │ ├── sports_understanding.txt │ │ ├── temporal_sequences.txt │ │ ├── tracking_shuffled_objects_five_objects.txt │ │ ├── tracking_shuffled_objects_seven_objects.txt │ │ ├── tracking_shuffled_objects_three_objects.txt │ │ ├── web_of_lies.txt │ │ └── word_sorting.txt │ ├── make_dataset.py │ └── transform_gen_cot.py ├── bbh │ ├── config │ │ ├── boolean-expressions_gen.json │ │ ├── causal-judgement_gen.json │ │ ├── date-understanding_gen.json │ │ ├── disambiguation-qa_gen.json │ │ ├── dyck-languages_gen.json │ │ ├── formal-fallacies_gen.json │ │ ├── geometric-shapes_gen.json │ │ ├── hyperbaton_gen.json │ │ ├── logical-deduction-five-objects_gen.json │ │ ├── logical-deduction-seven-objects_gen.json │ │ ├── logical-deduction-three-objects_gen.json │ │ ├── movie-recommendation_gen.json │ │ ├── multistep-arithmetic-two_gen.json │ │ ├── navigate_gen.json │ │ ├── object-counting_gen.json │ │ ├── penguins-in-a-table_gen.json │ │ ├── reasoning-about-colored-objects_gen.json │ │ ├── ruin-names_gen.json │ │ ├── salient-translation-error-detection_gen.json │ │ ├── snarks_gen.json │ │ ├── sports-understanding_gen.json │ │ ├── temporal-sequences_gen.json │ │ ├── tracking-shuffled-objects-five-objects_gen.json │ │ ├── tracking-shuffled-objects-seven-objects_gen.json │ │ ├── tracking-shuffled-objects-three-objects_gen.json │ │ ├── web-of-lies_gen.json │ │ └── word-sorting_gen.json │ ├── make_dataset.py │ └── transform_gen_v0.py ├── boolq │ ├── config │ │ ├── boolq_gen.json │ │ └── boolq_ppl.json │ ├── make_dataset.py │ ├── transform_gen_v0.py │ ├── transform_gen_v1.py │ ├── transform_ppl_v0.py │ └── transform_ppl_v1.py ├── bustm │ ├── config │ │ ├── bustm_gen.json │ │ └── bustm_ppl.json │ ├── make_dataset.py │ ├── transform_gen_v0.py │ ├── 
transform_gen_v1.py │ ├── transform_ppl_v0.py │ └── transform_ppl_v1.py ├── c3 │ ├── config │ │ ├── dialog_gen.json │ │ ├── dialog_ppl.json │ │ ├── mixed_gen.json │ │ └── mixed_ppl.json │ ├── make_dataset.py │ ├── transform_gen_v0.py │ ├── transform_gen_v1.py │ ├── transform_ppl_v0.py │ └── transform_ppl_v1.py ├── cb │ ├── config │ │ ├── cb_gen.json │ │ └── cb_ppl.json │ ├── make_dataset.py │ ├── transform_gen_v0.py │ ├── transform_gen_v1.py │ ├── transform_ppl_v0.py │ └── transform_ppl_v1.py ├── ceval │ ├── config │ │ ├── accountant_gen.json │ │ ├── advanced-mathematics_gen.json │ │ ├── art-studies_gen.json │ │ ├── basic-medicine_gen.json │ │ ├── business-administration_gen.json │ │ ├── chinese-language-and-literature_gen.json │ │ ├── civil-servant_gen.json │ │ ├── clinical-medicine_gen.json │ │ ├── college-chemistry_gen.json │ │ ├── college-economics_gen.json │ │ ├── college-physics_gen.json │ │ ├── college-programming_gen.json │ │ ├── computer-architecture_gen.json │ │ ├── computer-network_gen.json │ │ ├── discrete-mathematics_gen.json │ │ ├── education-science_gen.json │ │ ├── electrical-engineer_gen.json │ │ ├── environmental-impact-assessment-engineer_gen.json │ │ ├── fire-engineer_gen.json │ │ ├── high-school-biology_gen.json │ │ ├── high-school-chemistry_gen.json │ │ ├── high-school-chinese_gen.json │ │ ├── high-school-geography_gen.json │ │ ├── high-school-history_gen.json │ │ ├── high-school-mathematics_gen.json │ │ ├── high-school-physics_gen.json │ │ ├── high-school-politics_gen.json │ │ ├── ideological-and-moral-cultivation_gen.json │ │ ├── law_gen.json │ │ ├── legal-professional_gen.json │ │ ├── logic_gen.json │ │ ├── mao-zedong-thought_gen.json │ │ ├── marxism_gen.json │ │ ├── metrology-engineer_gen.json │ │ ├── middle-school-biology_gen.json │ │ ├── middle-school-chemistry_gen.json │ │ ├── middle-school-geography_gen.json │ │ ├── middle-school-history_gen.json │ │ ├── middle-school-mathematics_gen.json │ │ ├── middle-school-physics_gen.json │ │ ├── 
middle-school-politics_gen.json │ │ ├── modern-chinese-history_gen.json │ │ ├── operating-system_gen.json │ │ ├── physician_gen.json │ │ ├── plant-protection_gen.json │ │ ├── probability-and-statistics_gen.json │ │ ├── professional-tour-guide_gen.json │ │ ├── sports-science_gen.json │ │ ├── tax-accountant_gen.json │ │ ├── teacher-qualification_gen.json │ │ ├── urban-and-rural-planner_gen.json │ │ └── veterinary-medicine_gen.json │ ├── make_dataset.py │ ├── transform_gen_v0.py │ └── transform_gen_v1.py ├── chid │ ├── config │ │ ├── chid_gen.json │ │ └── chid_ppl.json │ ├── make_dataset.py │ ├── transform_gen_v0.py │ ├── transform_gen_v1.py │ ├── transform_ppl_v0.py │ └── transform_ppl_v1.py ├── cluewsc │ ├── config │ │ ├── cluewsc_gen.json │ │ └── cluewsc_ppl.json │ ├── make_dataset.py │ ├── transform_gen_v0.py │ ├── transform_gen_v1.py │ ├── transform_ppl_v0.py │ └── transform_ppl_v1.py ├── cmmlu │ ├── config │ │ ├── agronomy_gen.json │ │ ├── anatomy_gen.json │ │ ├── ancient-chinese_gen.json │ │ ├── arts_gen.json │ │ ├── astronomy_gen.json │ │ ├── business-ethics_gen.json │ │ ├── chinese-civil-service-exam_gen.json │ │ ├── chinese-driving-rule_gen.json │ │ ├── chinese-food-culture_gen.json │ │ ├── chinese-foreign-policy_gen.json │ │ ├── chinese-history_gen.json │ │ ├── chinese-literature_gen.json │ │ ├── chinese-teacher-qualification_gen.json │ │ ├── clinical-knowledge_gen.json │ │ ├── college-actuarial-science_gen.json │ │ ├── college-education_gen.json │ │ ├── college-engineering-hydrology_gen.json │ │ ├── college-law_gen.json │ │ ├── college-mathematics_gen.json │ │ ├── college-medical-statistics_gen.json │ │ ├── college-medicine_gen.json │ │ ├── computer-science_gen.json │ │ ├── computer-security_gen.json │ │ ├── conceptual-physics_gen.json │ │ ├── construction-project-management_gen.json │ │ ├── economics_gen.json │ │ ├── education_gen.json │ │ ├── electrical-engineering_gen.json │ │ ├── elementary-chinese_gen.json │ │ ├── elementary-commonsense_gen.json │ │ 
├── elementary-information-and-technology_gen.json │ │ ├── elementary-mathematics_gen.json │ │ ├── ethnology_gen.json │ │ ├── food-science_gen.json │ │ ├── genetics_gen.json │ │ ├── global-facts_gen.json │ │ ├── high-school-biology_gen.json │ │ ├── high-school-chemistry_gen.json │ │ ├── high-school-geography_gen.json │ │ ├── high-school-mathematics_gen.json │ │ ├── high-school-physics_gen.json │ │ ├── high-school-politics_gen.json │ │ ├── human-sexuality_gen.json │ │ ├── international-law_gen.json │ │ ├── journalism_gen.json │ │ ├── jurisprudence_gen.json │ │ ├── legal-and-moral-basis_gen.json │ │ ├── logical_gen.json │ │ ├── machine-learning_gen.json │ │ ├── management_gen.json │ │ ├── marketing_gen.json │ │ ├── marxist-theory_gen.json │ │ ├── modern-chinese_gen.json │ │ ├── nutrition_gen.json │ │ ├── philosophy_gen.json │ │ ├── professional-accounting_gen.json │ │ ├── professional-law_gen.json │ │ ├── professional-medicine_gen.json │ │ ├── professional-psychology_gen.json │ │ ├── public-relations_gen.json │ │ ├── security-study_gen.json │ │ ├── sociology_gen.json │ │ ├── sports-science_gen.json │ │ ├── traditional-chinese-medicine_gen.json │ │ ├── virology_gen.json │ │ ├── world-history_gen.json │ │ └── world-religions_gen.json │ ├── make_dataset.py │ ├── transform_gen_v0.py │ └── transform_gen_v1.py ├── cmnli │ ├── config │ │ ├── cmnli_gen.json │ │ └── cmnli_ppl.json │ ├── make_dataset.py │ ├── transform_gen_v0.py │ ├── transform_gen_v1.py │ ├── transform_ppl_v0.py │ └── transform_ppl_v1.py ├── commonsenseqa │ ├── config │ │ ├── commonsenseqa_gen.json │ │ └── commonsenseqa_ppl.json │ ├── make_dataset.py │ ├── transform_gen_v0.py │ ├── transform_gen_v1.py │ ├── transform_ppl_v0.py │ └── transform_ppl_v1.py ├── copa │ ├── config │ │ ├── copa_gen.json │ │ └── copa_ppl.json │ ├── make_dataset.py │ ├── transform_gen_v0.py │ ├── transform_gen_v1.py │ ├── transform_ppl_v0.py │ └── transform_ppl_v1.py ├── drcd │ ├── config │ │ └── drcd_gen.json │ ├── make_dataset.py │ 
├── transform_gen_v0.py │ └── transform_gen_v1.py ├── eprstmt │ ├── config │ │ ├── eprstmt_gen.json │ │ └── eprstmt_ppl.json │ ├── make_dataset.py │ ├── transform_gen_v0.py │ ├── transform_gen_v1.py │ ├── transform_ppl_v0.py │ └── transform_ppl_v1.py ├── flores │ ├── config │ │ └── flores_gen.json │ ├── make_dataset.py │ ├── transform_gen_v0.py │ └── transform_gen_v1.py ├── gaokaobench │ ├── config │ │ ├── Biology-MCQs_gen.json │ │ ├── Chemistry-MCQs_gen.json │ │ ├── Chinese-Lang-and-Usage-MCQs_gen.json │ │ ├── Chinese-Modern-Lit_gen.json │ │ ├── English-Cloze-Test_gen.json │ │ ├── English-Fill-in-Blanks_gen.json │ │ ├── English-MCQs_gen.json │ │ ├── English-Reading-Comp_gen.json │ │ ├── Geography-MCQs_gen.json │ │ ├── History-MCQs_gen.json │ │ ├── Math-I-MCQs_gen.json │ │ ├── Math-II-MCQs_gen.json │ │ ├── Physics-MCQs_gen.json │ │ └── Political-Science-MCQs_gen.json │ ├── make_dataset.py │ └── transform_gen_v0.py ├── gsm8k │ ├── config │ │ └── gsm8k_gen.json │ ├── make_dataset.py │ ├── transform_gen_v0.py │ └── transform_gen_v1.py ├── hellaswag │ ├── config │ │ ├── hellaswag_gen.json │ │ └── hellaswag_ppl.json │ ├── make_dataset.py │ ├── transform_gen_v0.py │ ├── transform_gen_v1.py │ ├── transform_ppl_v0.py │ └── transform_ppl_v1.py ├── humaneval │ ├── config │ │ └── humaneval_gen.json │ ├── make_dataset.py │ └── transform_gen_v0.py ├── jecqa │ ├── config │ │ ├── jecqa_gen.json │ │ └── jecqa_ppl.json │ ├── make_dataset.py │ ├── transform_gen_v0.py │ ├── transform_gen_v1.py │ ├── transform_ppl_v0.py │ └── transform_ppl_v1.py ├── lambada │ ├── config │ │ └── lambada_gen.json │ ├── make_dataset.py │ ├── transform_gen_v0.py │ └── transform_gen_v1.py ├── math │ ├── config │ │ ├── algebra_gen.json │ │ ├── counting-and-probability_gen.json │ │ ├── geometry_gen.json │ │ ├── intermediate-algebra_gen.json │ │ ├── number-theory_gen.json │ │ ├── prealgebra_gen.json │ │ └── precalculus_gen.json │ ├── make_dataset.py │ ├── transform_gen_v0.py │ └── transform_gen_v1.py ├── 
mbpp-427 │ ├── config │ │ └── mbpp_gen.json │ ├── make_dataset.py │ └── transform_gen_v0.py ├── mbpp │ ├── config │ │ └── mbpp_gen.json │ ├── make_dataset.py │ └── transform_gen_v0.py ├── medmcqa │ ├── config │ │ ├── medmcqa_gen.json │ │ └── medmcqa_ppl.json │ ├── make_dataset.py │ ├── transform_gen_v0.py │ ├── transform_gen_v1.py │ ├── transform_ppl_v0.py │ └── transform_ppl_v1.py ├── medqa-mcmle │ ├── config │ │ ├── medqa-mcmle_gen.json │ │ └── medqa-mcmle_ppl.json │ ├── make_dataset.py │ ├── transform_gen_v0.py │ ├── transform_gen_v1.py │ ├── transform_ppl_v0.py │ └── transform_ppl_v1.py ├── medqa-usmle │ ├── config │ │ ├── medqa-usmle_gen.json │ │ └── medqa-usmle_ppl.json │ ├── make_dataset.py │ ├── transform_gen_v0.py │ ├── transform_gen_v1.py │ ├── transform_ppl_v0.py │ └── transform_ppl_v1.py ├── mmlu │ ├── config │ │ ├── abstract-algebra_gen.json │ │ ├── anatomy_gen.json │ │ ├── astronomy_gen.json │ │ ├── business-ethics_gen.json │ │ ├── clinical-knowledge_gen.json │ │ ├── college-biology_gen.json │ │ ├── college-chemistry_gen.json │ │ ├── college-computer-science_gen.json │ │ ├── college-mathematics_gen.json │ │ ├── college-medicine_gen.json │ │ ├── college-physics_gen.json │ │ ├── computer-security_gen.json │ │ ├── conceptual-physics_gen.json │ │ ├── econometrics_gen.json │ │ ├── electrical-engineering_gen.json │ │ ├── elementary-mathematics_gen.json │ │ ├── formal-logic_gen.json │ │ ├── global-facts_gen.json │ │ ├── high-school-biology_gen.json │ │ ├── high-school-chemistry_gen.json │ │ ├── high-school-computer-science_gen.json │ │ ├── high-school-european-history_gen.json │ │ ├── high-school-geography_gen.json │ │ ├── high-school-government-and-politics_gen.json │ │ ├── high-school-macroeconomics_gen.json │ │ ├── high-school-mathematics_gen.json │ │ ├── high-school-microeconomics_gen.json │ │ ├── high-school-physics_gen.json │ │ ├── high-school-psychology_gen.json │ │ ├── high-school-statistics_gen.json │ │ ├── high-school-us-history_gen.json │ │ ├── 
high-school-world-history_gen.json │ │ ├── human-aging_gen.json │ │ ├── human-sexuality_gen.json │ │ ├── international-law_gen.json │ │ ├── jurisprudence_gen.json │ │ ├── logical-fallacies_gen.json │ │ ├── machine-learning_gen.json │ │ ├── management_gen.json │ │ ├── marketing_gen.json │ │ ├── medical-genetics_gen.json │ │ ├── miscellaneous_gen.json │ │ ├── moral-disputes_gen.json │ │ ├── moral-scenarios_gen.json │ │ ├── nutrition_gen.json │ │ ├── philosophy_gen.json │ │ ├── prehistory_gen.json │ │ ├── professional-accounting_gen.json │ │ ├── professional-law_gen.json │ │ ├── professional-medicine_gen.json │ │ ├── professional-psychology_gen.json │ │ ├── public-relations_gen.json │ │ ├── security-studies_gen.json │ │ ├── sociology_gen.json │ │ ├── us-foreign-policy_gen.json │ │ ├── virology_gen.json │ │ └── world-religions_gen.json │ ├── make_dataset.py │ ├── transform_gen_v0.py │ └── transform_gen_v1.py ├── multirc │ ├── config │ │ ├── multirc_gen.json │ │ └── multirc_ppl.json │ ├── make_dataset.py │ ├── transform_gen_v0.py │ ├── transform_gen_v1.py │ ├── transform_ppl_v0.py │ └── transform_ppl_v1.py ├── nq-open │ ├── config │ │ └── nq-open_gen.json │ ├── make_dataset.py │ ├── transform_gen_v0.py │ └── transform_gen_v1.py ├── ocnli-fc │ ├── config │ │ ├── ocnli-fc_gen.json │ │ └── ocnli-fc_ppl.json │ ├── make_dataset.py │ ├── transform_gen_v0.py │ ├── transform_gen_v1.py │ ├── transform_ppl_v0.py │ └── transform_ppl_v1.py ├── ocnli │ ├── config │ │ ├── ocnli_gen.json │ │ └── ocnli_ppl.json │ ├── make_dataset.py │ ├── transform_gen_v0.py │ ├── transform_gen_v1.py │ ├── transform_ppl_v0.py │ └── transform_ppl_v1.py ├── openbookqa │ ├── config │ │ ├── openbookqa_gen.json │ │ └── openbookqa_ppl.json │ ├── make_dataset.py │ ├── transform_gen_v0.py │ ├── transform_gen_v1.py │ ├── transform_ppl_v0.py │ └── transform_ppl_v1.py ├── piqa │ ├── config │ │ ├── piqa_gen.json │ │ └── piqa_ppl.json │ ├── make_dataset.py │ ├── transform_gen_v0.py │ ├── transform_gen_v1.py │ ├── 
transform_ppl_v0.py │ └── transform_ppl_v1.py ├── quac │ ├── config │ │ └── quac_gen.json │ ├── make_dataset.py │ ├── transform_gen_v0.py │ └── transform_gen_v1.py ├── race │ ├── config │ │ ├── high_gen.json │ │ ├── high_ppl.json │ │ ├── middle_gen.json │ │ └── middle_ppl.json │ ├── make_dataset.py │ ├── transform_gen_v0.py │ ├── transform_gen_v1.py │ ├── transform_ppl_v0.py │ └── transform_ppl_v1.py ├── record │ ├── config │ │ └── record_gen.json │ ├── make_dataset.py │ ├── transform_gen_v0.py │ └── transform_gen_v1.py ├── rte │ ├── config │ │ ├── rte_gen.json │ │ └── rte_ppl.json │ ├── make_dataset.py │ ├── transform_gen_v0.py │ ├── transform_gen_v1.py │ ├── transform_ppl_v0.py │ └── transform_ppl_v1.py ├── siqa │ ├── config │ │ ├── siqa_gen.json │ │ └── siqa_ppl.json │ ├── make_dataset.py │ ├── transform_gen_v0.py │ ├── transform_gen_v1.py │ ├── transform_ppl_v0.py │ └── transform_ppl_v1.py ├── squad │ ├── config │ │ └── squad_gen.json │ ├── make_dataset.py │ ├── transform_gen_v0.py │ └── transform_gen_v1.py ├── storycloze │ ├── config │ │ ├── storycloze_gen.json │ │ └── storycloze_ppl.json │ ├── make_dataset.py │ ├── transform_gen_v0.py │ ├── transform_gen_v1.py │ ├── transform_ppl_v0.py │ └── transform_ppl_v1.py ├── strategyqa │ ├── config │ │ ├── strategyqa_gen.json │ │ └── strategyqa_ppl.json │ ├── make_dataset.py │ ├── transform_gen_v0.py │ ├── transform_gen_v1.py │ ├── transform_ppl_v0.py │ └── transform_ppl_v1.py ├── summedits │ ├── config │ │ ├── billsum_gen.json │ │ ├── billsum_ppl.json │ │ ├── ectsum_gen.json │ │ ├── ectsum_ppl.json │ │ ├── news_gen.json │ │ ├── news_ppl.json │ │ ├── podcast_gen.json │ │ ├── podcast_ppl.json │ │ ├── qmsumm_gen.json │ │ ├── qmsumm_ppl.json │ │ ├── sales-call_gen.json │ │ ├── sales-call_ppl.json │ │ ├── sales-email_gen.json │ │ ├── sales-email_ppl.json │ │ ├── samsum_gen.json │ │ ├── samsum_ppl.json │ │ ├── scitldr_gen.json │ │ ├── scitldr_ppl.json │ │ ├── shakespeare_gen.json │ │ └── shakespeare_ppl.json │ ├── 
make_dataset.py │ ├── transform_gen_v0.py │ ├── transform_gen_v1.py │ ├── transform_ppl_v0.py │ └── transform_ppl_v1.py ├── theoremqa │ ├── config │ │ └── theoremqa_gen.json │ ├── make_dataset.py │ ├── transform_gen_v0.py │ └── transform_gen_v1.py ├── tnews │ ├── config │ │ ├── tnews_gen.json │ │ └── tnews_ppl.json │ ├── make_dataset.py │ ├── transform_gen_v0.py │ ├── transform_gen_v1.py │ ├── transform_ppl_v0.py │ └── transform_ppl_v1.py ├── triviaqa │ ├── config │ │ ├── web_gen.json │ │ └── wikipedia_gen.json │ ├── make_dataset.py │ ├── transform_gen_v0.py │ └── transform_gen_v1.py ├── truthfulqa │ ├── config │ │ ├── mc1_ppl.json │ │ └── mc2_ppl.json │ ├── make_dataset.py │ ├── transform_ppl_v0.py │ └── transform_ppl_v1.py ├── tydiqa │ ├── config │ │ └── tydiqa_gen.json │ ├── make_dataset.py │ ├── transform_gen_v0.py │ └── transform_gen_v1.py ├── wic │ ├── config │ │ ├── wic_gen.json │ │ └── wic_ppl.json │ ├── make_dataset.py │ ├── transform_gen_v0.py │ ├── transform_gen_v1.py │ ├── transform_ppl_v0.py │ └── transform_ppl_v1.py ├── winogender │ ├── config │ │ ├── female_gen.json │ │ ├── female_ppl.json │ │ ├── male_gen.json │ │ ├── male_ppl.json │ │ ├── neutral_gen.json │ │ └── neutral_ppl.json │ ├── make_dataset.py │ ├── transform_gen_v0.py │ ├── transform_gen_v1.py │ ├── transform_ppl_v0.py │ └── transform_ppl_v1.py ├── winogrande │ ├── config │ │ ├── winogrande_gen.json │ │ └── winogrande_ppl.json │ ├── make_dataset.py │ ├── transform_gen_v0.py │ ├── transform_gen_v1.py │ ├── transform_ppl_v0.py │ └── transform_ppl_v1.py ├── wmt20-en-zh │ ├── config │ │ ├── news_gen.json │ │ └── suites_gen.json │ ├── make_dataset.py │ ├── transform_gen_v0.py │ └── transform_gen_v1.py ├── wmt20-zh-en │ ├── config │ │ ├── news_gen.json │ │ └── suites_gen.json │ ├── make_dataset.py │ ├── transform_gen_v0.py │ └── transform_gen_v1.py └── wsc │ ├── config │ ├── wsc_gen.json │ └── wsc_ppl.json │ ├── make_dataset.py │ ├── transform_gen_v0.py │ ├── transform_gen_v1.py │ ├── 
transform_ppl_v0.py │ └── transform_ppl_v1.py ├── docs ├── pics │ ├── ultraeval_logo_white.jpg │ └── ultraeval_pipeline_white.png └── tutorials │ ├── en │ ├── configuration_file │ │ ├── config.md │ │ ├── make_dataset.md │ │ ├── metric.md │ │ ├── model_params.md │ │ ├── postprocess.md │ │ └── transform.md │ ├── customization │ │ ├── individual_models.md │ │ ├── new_config.md │ │ ├── new_dataset.md │ │ ├── new_metric.md │ │ └── new_postprocess.md │ ├── deployment_model │ │ ├── acceleration.md │ │ ├── deployment.md │ │ └── model_download.md │ ├── evaluation │ │ ├── model_instantiation.md │ │ └── task_instantiation.md │ └── ultraeval.md │ └── zh │ ├── configuration_file │ ├── config.md │ ├── make_dataset.md │ ├── metric.md │ ├── model_params.md │ ├── postprocess.md │ └── transform.md │ ├── customization │ ├── individual_models.md │ ├── new_config.md │ ├── new_dataset.md │ ├── new_metric.md │ └── new_postprocess.md │ ├── deployment_model │ ├── acceleration.md │ ├── deployment.md │ └── model_download.md │ ├── evaluation │ ├── model_instantiation.md │ └── task_instantiation.md │ └── ultraeval.md ├── leaderboard ├── Duxiaoman-DI │ └── XuanYuan-70B │ │ ├── afqmc │ │ └── afqmc.json │ │ ├── arc-c │ │ └── arc-c.json │ │ ├── arc-e │ │ └── arc-e.json │ │ ├── ax-b │ │ └── ax-b.json │ │ ├── ax-g │ │ └── ax-g.json │ │ ├── bbh │ │ ├── boolean-expressions.json │ │ ├── causal-judgement.json │ │ ├── date-understanding.json │ │ ├── disambiguation-qa.json │ │ ├── dyck-languages.json │ │ ├── formal-fallacies.json │ │ ├── geometric-shapes.json │ │ ├── hyperbaton.json │ │ ├── logical-deduction-five-objects.json │ │ ├── logical-deduction-seven-objects.json │ │ ├── logical-deduction-three-objects.json │ │ ├── movie-recommendation.json │ │ ├── multistep-arithmetic-two.json │ │ ├── navigate.json │ │ ├── object-counting.json │ │ ├── penguins-in-a-table.json │ │ ├── reasoning-about-colored-objects.json │ │ ├── ruin-names.json │ │ ├── salient-translation-error-detection.json │ │ ├── snarks.json │ 
│ ├── sports-understanding.json │ │ ├── temporal-sequences.json │ │ ├── tracking-shuffled-objects-five-objects.json │ │ ├── tracking-shuffled-objects-seven-objects.json │ │ ├── tracking-shuffled-objects-three-objects.json │ │ ├── web-of-lies.json │ │ └── word-sorting.json │ │ ├── boolq │ │ └── boolq.json │ │ ├── c3 │ │ ├── dialog.json │ │ └── mixed.json │ │ ├── ceval │ │ ├── accountant.json │ │ ├── advanced-mathematics.json │ │ ├── art-studies.json │ │ ├── basic-medicine.json │ │ ├── business-administration.json │ │ ├── chinese-language-and-literature.json │ │ ├── civil-servant.json │ │ ├── clinical-medicine.json │ │ ├── college-chemistry.json │ │ ├── college-economics.json │ │ ├── college-physics.json │ │ ├── college-programming.json │ │ ├── computer-architecture.json │ │ ├── computer-network.json │ │ ├── discrete-mathematics.json │ │ ├── education-science.json │ │ ├── electrical-engineer.json │ │ ├── environmental-impact-assessment-engineer.json │ │ ├── fire-engineer.json │ │ ├── high-school-biology.json │ │ ├── high-school-chemistry.json │ │ ├── high-school-chinese.json │ │ ├── high-school-geography.json │ │ ├── high-school-history.json │ │ ├── high-school-mathematics.json │ │ ├── high-school-physics.json │ │ ├── high-school-politics.json │ │ ├── ideological-and-moral-cultivation.json │ │ ├── law.json │ │ ├── legal-professional.json │ │ ├── logic.json │ │ ├── mao-zedong-thought.json │ │ ├── marxism.json │ │ ├── metrology-engineer.json │ │ ├── middle-school-biology.json │ │ ├── middle-school-chemistry.json │ │ ├── middle-school-geography.json │ │ ├── middle-school-history.json │ │ ├── middle-school-mathematics.json │ │ ├── middle-school-physics.json │ │ ├── middle-school-politics.json │ │ ├── modern-chinese-history.json │ │ ├── operating-system.json │ │ ├── physician.json │ │ ├── plant-protection.json │ │ ├── probability-and-statistics.json │ │ ├── professional-tour-guide.json │ │ ├── sports-science.json │ │ ├── tax-accountant.json │ │ ├── 
teacher-qualification.json │ │ ├── urban-and-rural-planner.json │ │ └── veterinary-medicine.json │ │ ├── chid │ │ └── chid.json │ │ ├── cmmlu │ │ ├── agronomy.json │ │ ├── anatomy.json │ │ ├── ancient-chinese.json │ │ ├── arts.json │ │ ├── astronomy.json │ │ ├── business-ethics.json │ │ ├── chinese-civil-service-exam.json │ │ ├── chinese-driving-rule.json │ │ ├── chinese-food-culture.json │ │ ├── chinese-foreign-policy.json │ │ ├── chinese-history.json │ │ ├── chinese-literature.json │ │ ├── chinese-teacher-qualification.json │ │ ├── clinical-knowledge.json │ │ ├── college-actuarial-science.json │ │ ├── college-education.json │ │ ├── college-engineering-hydrology.json │ │ ├── college-law.json │ │ ├── college-mathematics.json │ │ ├── college-medical-statistics.json │ │ ├── college-medicine.json │ │ ├── computer-science.json │ │ ├── computer-security.json │ │ ├── conceptual-physics.json │ │ ├── construction-project-management.json │ │ ├── economics.json │ │ ├── education.json │ │ ├── electrical-engineering.json │ │ ├── elementary-chinese.json │ │ ├── elementary-commonsense.json │ │ ├── elementary-information-and-technology.json │ │ ├── elementary-mathematics.json │ │ ├── ethnology.json │ │ ├── food-science.json │ │ ├── genetics.json │ │ ├── global-facts.json │ │ ├── high-school-biology.json │ │ ├── high-school-chemistry.json │ │ ├── high-school-geography.json │ │ ├── high-school-mathematics.json │ │ ├── high-school-physics.json │ │ ├── high-school-politics.json │ │ ├── human-sexuality.json │ │ ├── international-law.json │ │ ├── journalism.json │ │ ├── jurisprudence.json │ │ ├── legal-and-moral-basis.json │ │ ├── logical.json │ │ ├── machine-learning.json │ │ ├── management.json │ │ ├── marketing.json │ │ ├── marxist-theory.json │ │ ├── modern-chinese.json │ │ ├── nutrition.json │ │ ├── philosophy.json │ │ ├── professional-accounting.json │ │ ├── professional-law.json │ │ ├── professional-medicine.json │ │ ├── professional-psychology.json │ │ ├── public-relations.json 
│ │ ├── security-study.json │ │ ├── sociology.json │ │ ├── sports-science.json │ │ ├── traditional-chinese-medicine.json │ │ ├── virology.json │ │ ├── world-history.json │ │ └── world-religions.json │ │ ├── copa │ │ └── copa.json │ │ ├── eprstmt │ │ └── eprstmt.json │ │ ├── gsm8k │ │ └── gsm8k.json │ │ ├── hellaswag │ │ └── hellaswag.json │ │ ├── humaneval │ │ └── humaneval.json │ │ ├── lambada │ │ └── lambada.json │ │ ├── math │ │ ├── algebra.json │ │ ├── counting-and-probability.json │ │ ├── geometry.json │ │ ├── intermediate-algebra.json │ │ ├── number-theory.json │ │ ├── prealgebra.json │ │ └── precalculus.json │ │ ├── mbpp │ │ └── mbpp.json │ │ ├── mmlu │ │ ├── abstract-algebra.json │ │ ├── anatomy.json │ │ ├── astronomy.json │ │ ├── business-ethics.json │ │ ├── clinical-knowledge.json │ │ ├── college-biology.json │ │ ├── college-chemistry.json │ │ ├── college-computer-science.json │ │ ├── college-mathematics.json │ │ ├── college-medicine.json │ │ ├── college-physics.json │ │ ├── computer-security.json │ │ ├── conceptual-physics.json │ │ ├── econometrics.json │ │ ├── electrical-engineering.json │ │ ├── elementary-mathematics.json │ │ ├── formal-logic.json │ │ ├── global-facts.json │ │ ├── high-school-biology.json │ │ ├── high-school-chemistry.json │ │ ├── high-school-computer-science.json │ │ ├── high-school-european-history.json │ │ ├── high-school-geography.json │ │ ├── high-school-government-and-politics.json │ │ ├── high-school-macroeconomics.json │ │ ├── high-school-mathematics.json │ │ ├── high-school-microeconomics.json │ │ ├── high-school-physics.json │ │ ├── high-school-psychology.json │ │ ├── high-school-statistics.json │ │ ├── high-school-us-history.json │ │ ├── high-school-world-history.json │ │ ├── human-aging.json │ │ ├── human-sexuality.json │ │ ├── international-law.json │ │ ├── jurisprudence.json │ │ ├── logical-fallacies.json │ │ ├── machine-learning.json │ │ ├── management.json │ │ ├── marketing.json │ │ ├── medical-genetics.json │ │ ├── 
miscellaneous.json │ │ ├── moral-disputes.json │ │ ├── moral-scenarios.json │ │ ├── nutrition.json │ │ ├── philosophy.json │ │ ├── prehistory.json │ │ ├── professional-accounting.json │ │ ├── professional-law.json │ │ ├── professional-medicine.json │ │ ├── professional-psychology.json │ │ ├── public-relations.json │ │ ├── security-studies.json │ │ ├── sociology.json │ │ ├── us-foreign-policy.json │ │ ├── virology.json │ │ └── world-religions.json │ │ ├── ocnli │ │ └── ocnli.json │ │ ├── piqa │ │ └── piqa.json │ │ ├── rte │ │ └── rte.json │ │ ├── tydiqa │ │ └── tydiqa.json │ │ └── wic │ │ └── wic.json ├── OpenAI │ ├── GPT3.5 │ │ ├── afqmc │ │ │ └── afqmc.json │ │ ├── arc-c │ │ │ └── arc-c.json │ │ ├── arc-e │ │ │ └── arc-e.json │ │ ├── ax-b │ │ │ └── ax-b.json │ │ ├── ax-g │ │ │ └── ax-g.json │ │ ├── bbh │ │ │ ├── boolean-expressions.json │ │ │ ├── causal-judgement.json │ │ │ ├── date-understanding.json │ │ │ ├── disambiguation-qa.json │ │ │ ├── dyck-languages.json │ │ │ ├── formal-fallacies.json │ │ │ ├── geometric-shapes.json │ │ │ ├── hyperbaton.json │ │ │ ├── logical-deduction-five-objects.json │ │ │ ├── logical-deduction-seven-objects.json │ │ │ ├── logical-deduction-three-objects.json │ │ │ ├── movie-recommendation.json │ │ │ ├── multistep-arithmetic-two.json │ │ │ ├── navigate.json │ │ │ ├── object-counting.json │ │ │ ├── penguins-in-a-table.json │ │ │ ├── reasoning-about-colored-objects.json │ │ │ ├── ruin-names.json │ │ │ ├── salient-translation-error-detection.json │ │ │ ├── snarks.json │ │ │ ├── sports-understanding.json │ │ │ ├── temporal-sequences.json │ │ │ ├── tracking-shuffled-objects-five-objects.json │ │ │ ├── tracking-shuffled-objects-seven-objects.json │ │ │ ├── tracking-shuffled-objects-three-objects.json │ │ │ ├── web-of-lies.json │ │ │ └── word-sorting.json │ │ ├── boolq │ │ │ └── boolq.json │ │ ├── c3 │ │ │ ├── dialog.json │ │ │ └── mixed.json │ │ ├── ceval │ │ │ ├── accountant.json │ │ │ ├── advanced-mathematics.json │ │ │ ├── 
art-studies.json │ │ │ ├── basic-medicine.json │ │ │ ├── business-administration.json │ │ │ ├── chinese-language-and-literature.json │ │ │ ├── civil-servant.json │ │ │ ├── clinical-medicine.json │ │ │ ├── college-chemistry.json │ │ │ ├── college-economics.json │ │ │ ├── college-physics.json │ │ │ ├── college-programming.json │ │ │ ├── computer-architecture.json │ │ │ ├── computer-network.json │ │ │ ├── discrete-mathematics.json │ │ │ ├── education-science.json │ │ │ ├── electrical-engineer.json │ │ │ ├── environmental-impact-assessment-engineer.json │ │ │ ├── fire-engineer.json │ │ │ ├── high-school-biology.json │ │ │ ├── high-school-chemistry.json │ │ │ ├── high-school-chinese.json │ │ │ ├── high-school-geography.json │ │ │ ├── high-school-history.json │ │ │ ├── high-school-mathematics.json │ │ │ ├── high-school-physics.json │ │ │ ├── high-school-politics.json │ │ │ ├── ideological-and-moral-cultivation.json │ │ │ ├── law.json │ │ │ ├── legal-professional.json │ │ │ ├── logic.json │ │ │ ├── mao-zedong-thought.json │ │ │ ├── marxism.json │ │ │ ├── metrology-engineer.json │ │ │ ├── middle-school-biology.json │ │ │ ├── middle-school-chemistry.json │ │ │ ├── middle-school-geography.json │ │ │ ├── middle-school-history.json │ │ │ ├── middle-school-mathematics.json │ │ │ ├── middle-school-physics.json │ │ │ ├── middle-school-politics.json │ │ │ ├── modern-chinese-history.json │ │ │ ├── operating-system.json │ │ │ ├── physician.json │ │ │ ├── plant-protection.json │ │ │ ├── probability-and-statistics.json │ │ │ ├── professional-tour-guide.json │ │ │ ├── sports-science.json │ │ │ ├── tax-accountant.json │ │ │ ├── teacher-qualification.json │ │ │ ├── urban-and-rural-planner.json │ │ │ └── veterinary-medicine.json │ │ ├── chid │ │ │ └── chid.json │ │ ├── cmmlu │ │ │ ├── agronomy.json │ │ │ ├── anatomy.json │ │ │ ├── ancient-chinese.json │ │ │ ├── arts.json │ │ │ ├── astronomy.json │ │ │ ├── business-ethics.json │ │ │ ├── chinese-civil-service-exam.json │ │ │ ├── 
chinese-driving-rule.json │ │ │ ├── chinese-food-culture.json │ │ │ ├── chinese-foreign-policy.json │ │ │ ├── chinese-history.json │ │ │ ├── chinese-literature.json │ │ │ ├── chinese-teacher-qualification.json │ │ │ ├── clinical-knowledge.json │ │ │ ├── college-actuarial-science.json │ │ │ ├── college-education.json │ │ │ ├── college-engineering-hydrology.json │ │ │ ├── college-law.json │ │ │ ├── college-mathematics.json │ │ │ ├── college-medical-statistics.json │ │ │ ├── college-medicine.json │ │ │ ├── computer-science.json │ │ │ ├── computer-security.json │ │ │ ├── conceptual-physics.json │ │ │ ├── construction-project-management.json │ │ │ ├── economics.json │ │ │ ├── education.json │ │ │ ├── electrical-engineering.json │ │ │ ├── elementary-chinese.json │ │ │ ├── elementary-commonsense.json │ │ │ ├── elementary-information-and-technology.json │ │ │ ├── elementary-mathematics.json │ │ │ ├── ethnology.json │ │ │ ├── food-science.json │ │ │ ├── genetics.json │ │ │ ├── global-facts.json │ │ │ ├── high-school-biology.json │ │ │ ├── high-school-chemistry.json │ │ │ ├── high-school-geography.json │ │ │ ├── high-school-mathematics.json │ │ │ ├── high-school-physics.json │ │ │ ├── high-school-politics.json │ │ │ ├── human-sexuality.json │ │ │ ├── international-law.json │ │ │ ├── journalism.json │ │ │ ├── jurisprudence.json │ │ │ ├── legal-and-moral-basis.json │ │ │ ├── logical.json │ │ │ ├── machine-learning.json │ │ │ ├── management.json │ │ │ ├── marketing.json │ │ │ ├── marxist-theory.json │ │ │ ├── modern-chinese.json │ │ │ ├── nutrition.json │ │ │ ├── philosophy.json │ │ │ ├── professional-accounting.json │ │ │ ├── professional-law.json │ │ │ ├── professional-medicine.json │ │ │ ├── professional-psychology.json │ │ │ ├── public-relations.json │ │ │ ├── security-study.json │ │ │ ├── sociology.json │ │ │ ├── sports-science.json │ │ │ ├── traditional-chinese-medicine.json │ │ │ ├── virology.json │ │ │ ├── world-history.json │ │ │ └── world-religions.json │ │ ├── copa │ 
│ │ └── copa.json │ │ ├── eprstmt │ │ │ └── eprstmt.json │ │ ├── gsm8k │ │ │ └── gsm8k.json │ │ ├── hellaswag │ │ │ └── hellaswag.json │ │ ├── humaneval │ │ │ └── humaneval.json │ │ ├── lambada │ │ │ └── lambada.json │ │ ├── math │ │ │ ├── algebra.json │ │ │ ├── counting-and-probability.json │ │ │ ├── geometry.json │ │ │ ├── intermediate-algebra.json │ │ │ ├── number-theory.json │ │ │ ├── prealgebra.json │ │ │ └── precalculus.json │ │ ├── mbpp │ │ │ └── mbpp.json │ │ ├── mmlu │ │ │ ├── abstract-algebra.json │ │ │ ├── anatomy.json │ │ │ ├── astronomy.json │ │ │ ├── business-ethics.json │ │ │ ├── clinical-knowledge.json │ │ │ ├── college-biology.json │ │ │ ├── college-chemistry.json │ │ │ ├── college-computer-science.json │ │ │ ├── college-mathematics.json │ │ │ ├── college-medicine.json │ │ │ ├── college-physics.json │ │ │ ├── computer-security.json │ │ │ ├── conceptual-physics.json │ │ │ ├── econometrics.json │ │ │ ├── electrical-engineering.json │ │ │ ├── elementary-mathematics.json │ │ │ ├── formal-logic.json │ │ │ ├── global-facts.json │ │ │ ├── high-school-biology.json │ │ │ ├── high-school-chemistry.json │ │ │ ├── high-school-computer-science.json │ │ │ ├── high-school-european-history.json │ │ │ ├── high-school-geography.json │ │ │ ├── high-school-government-and-politics.json │ │ │ ├── high-school-macroeconomics.json │ │ │ ├── high-school-mathematics.json │ │ │ ├── high-school-microeconomics.json │ │ │ ├── high-school-physics.json │ │ │ ├── high-school-psychology.json │ │ │ ├── high-school-statistics.json │ │ │ ├── high-school-us-history.json │ │ │ ├── high-school-world-history.json │ │ │ ├── human-aging.json │ │ │ ├── human-sexuality.json │ │ │ ├── international-law.json │ │ │ ├── jurisprudence.json │ │ │ ├── logical-fallacies.json │ │ │ ├── machine-learning.json │ │ │ ├── management.json │ │ │ ├── marketing.json │ │ │ ├── medical-genetics.json │ │ │ ├── miscellaneous.json │ │ │ ├── moral-disputes.json │ │ │ ├── moral-scenarios.json │ │ │ ├── nutrition.json 
│ │ │ ├── philosophy.json │ │ │ ├── prehistory.json │ │ │ ├── professional-accounting.json │ │ │ ├── professional-law.json │ │ │ ├── professional-medicine.json │ │ │ ├── professional-psychology.json │ │ │ ├── public-relations.json │ │ │ ├── security-studies.json │ │ │ ├── sociology.json │ │ │ ├── us-foreign-policy.json │ │ │ ├── virology.json │ │ │ └── world-religions.json │ │ ├── ocnli │ │ │ └── ocnli.json │ │ ├── piqa │ │ │ └── piqa.json │ │ ├── rte │ │ │ └── rte.json │ │ ├── tydiqa │ │ │ └── tydiqa.json │ │ └── wic │ │ │ └── wic.json │ └── GPT4 │ │ ├── afqmc │ │ └── afqmc.json │ │ ├── arc-c │ │ └── arc-c.json │ │ ├── arc-e │ │ └── arc-e.json │ │ ├── ax-b │ │ └── ax-b.json │ │ ├── ax-g │ │ └── ax-g.json │ │ ├── bbh │ │ ├── boolean-expressions.json │ │ ├── causal-judgement.json │ │ ├── date-understanding.json │ │ ├── disambiguation-qa.json │ │ ├── dyck-languages.json │ │ ├── formal-fallacies.json │ │ ├── geometric-shapes.json │ │ ├── hyperbaton.json │ │ ├── logical-deduction-five-objects.json │ │ ├── logical-deduction-seven-objects.json │ │ ├── logical-deduction-three-objects.json │ │ ├── movie-recommendation.json │ │ ├── multistep-arithmetic-two.json │ │ ├── navigate.json │ │ ├── object-counting.json │ │ ├── penguins-in-a-table.json │ │ ├── reasoning-about-colored-objects.json │ │ ├── ruin-names.json │ │ ├── salient-translation-error-detection.json │ │ ├── snarks.json │ │ ├── sports-understanding.json │ │ ├── temporal-sequences.json │ │ ├── tracking-shuffled-objects-five-objects.json │ │ ├── tracking-shuffled-objects-seven-objects.json │ │ ├── tracking-shuffled-objects-three-objects.json │ │ ├── web-of-lies.json │ │ └── word-sorting.json │ │ ├── boolq │ │ └── boolq.json │ │ ├── c3 │ │ ├── dialog.json │ │ └── mixed.json │ │ ├── ceval │ │ ├── accountant.json │ │ ├── advanced-mathematics.json │ │ ├── art-studies.json │ │ ├── basic-medicine.json │ │ ├── business-administration.json │ │ ├── chinese-language-and-literature.json │ │ ├── civil-servant.json │ │ ├── 
clinical-medicine.json │ │ ├── college-chemistry.json │ │ ├── college-economics.json │ │ ├── college-physics.json │ │ ├── college-programming.json │ │ ├── computer-architecture.json │ │ ├── computer-network.json │ │ ├── discrete-mathematics.json │ │ ├── education-science.json │ │ ├── electrical-engineer.json │ │ ├── environmental-impact-assessment-engineer.json │ │ ├── fire-engineer.json │ │ ├── high-school-biology.json │ │ ├── high-school-chemistry.json │ │ ├── high-school-chinese.json │ │ ├── high-school-geography.json │ │ ├── high-school-history.json │ │ ├── high-school-mathematics.json │ │ ├── high-school-physics.json │ │ ├── high-school-politics.json │ │ ├── ideological-and-moral-cultivation.json │ │ ├── law.json │ │ ├── legal-professional.json │ │ ├── logic.json │ │ ├── mao-zedong-thought.json │ │ ├── marxism.json │ │ ├── metrology-engineer.json │ │ ├── middle-school-biology.json │ │ ├── middle-school-chemistry.json │ │ ├── middle-school-geography.json │ │ ├── middle-school-history.json │ │ ├── middle-school-mathematics.json │ │ ├── middle-school-physics.json │ │ ├── middle-school-politics.json │ │ ├── modern-chinese-history.json │ │ ├── operating-system.json │ │ ├── physician.json │ │ ├── plant-protection.json │ │ ├── probability-and-statistics.json │ │ ├── professional-tour-guide.json │ │ ├── sports-science.json │ │ ├── tax-accountant.json │ │ ├── teacher-qualification.json │ │ ├── urban-and-rural-planner.json │ │ └── veterinary-medicine.json │ │ ├── chid │ │ └── chid.json │ │ ├── cmmlu │ │ ├── agronomy.json │ │ ├── anatomy.json │ │ ├── ancient-chinese.json │ │ ├── arts.json │ │ ├── astronomy.json │ │ ├── business-ethics.json │ │ ├── chinese-civil-service-exam.json │ │ ├── chinese-driving-rule.json │ │ ├── chinese-food-culture.json │ │ ├── chinese-foreign-policy.json │ │ ├── chinese-history.json │ │ ├── chinese-literature.json │ │ ├── chinese-teacher-qualification.json │ │ ├── clinical-knowledge.json │ │ ├── college-actuarial-science.json │ │ ├── 
college-education.json │ │ ├── college-engineering-hydrology.json │ │ ├── college-law.json │ │ ├── college-mathematics.json │ │ ├── college-medical-statistics.json │ │ ├── college-medicine.json │ │ ├── computer-science.json │ │ ├── computer-security.json │ │ ├── conceptual-physics.json │ │ ├── construction-project-management.json │ │ ├── economics.json │ │ ├── education.json │ │ ├── electrical-engineering.json │ │ ├── elementary-chinese.json │ │ ├── elementary-commonsense.json │ │ ├── elementary-information-and-technology.json │ │ ├── elementary-mathematics.json │ │ ├── ethnology.json │ │ ├── food-science.json │ │ ├── genetics.json │ │ ├── global-facts.json │ │ ├── high-school-biology.json │ │ ├── high-school-chemistry.json │ │ ├── high-school-geography.json │ │ ├── high-school-mathematics.json │ │ ├── high-school-physics.json │ │ ├── high-school-politics.json │ │ ├── human-sexuality.json │ │ ├── international-law.json │ │ ├── journalism.json │ │ ├── jurisprudence.json │ │ ├── legal-and-moral-basis.json │ │ ├── logical.json │ │ ├── machine-learning.json │ │ ├── management.json │ │ ├── marketing.json │ │ ├── marxist-theory.json │ │ ├── modern-chinese.json │ │ ├── nutrition.json │ │ ├── philosophy.json │ │ ├── professional-accounting.json │ │ ├── professional-law.json │ │ ├── professional-medicine.json │ │ ├── professional-psychology.json │ │ ├── public-relations.json │ │ ├── security-study.json │ │ ├── sociology.json │ │ ├── sports-science.json │ │ ├── traditional-chinese-medicine.json │ │ ├── virology.json │ │ ├── world-history.json │ │ └── world-religions.json │ │ ├── copa │ │ └── copa.json │ │ ├── eprstmt │ │ └── eprstmt.json │ │ ├── gsm8k │ │ └── gsm8k.json │ │ ├── hellaswag │ │ └── hellaswag.json │ │ ├── humaneval │ │ └── humaneval.json │ │ ├── lambada │ │ └── lambada.json │ │ ├── math │ │ ├── algebra.json │ │ ├── counting-and-probability.json │ │ ├── geometry.json │ │ ├── intermediate-algebra.json │ │ ├── number-theory.json │ │ ├── prealgebra.json │ │ └── 
precalculus.json │ │ ├── mbpp │ │ └── mbpp.json │ │ ├── mmlu │ │ ├── abstract-algebra.json │ │ ├── anatomy.json │ │ ├── astronomy.json │ │ ├── business-ethics.json │ │ ├── clinical-knowledge.json │ │ ├── college-biology.json │ │ ├── college-chemistry.json │ │ ├── college-computer-science.json │ │ ├── college-mathematics.json │ │ ├── college-medicine.json │ │ ├── college-physics.json │ │ ├── computer-security.json │ │ ├── conceptual-physics.json │ │ ├── econometrics.json │ │ ├── electrical-engineering.json │ │ ├── elementary-mathematics.json │ │ ├── formal-logic.json │ │ ├── global-facts.json │ │ ├── high-school-biology.json │ │ ├── high-school-chemistry.json │ │ ├── high-school-computer-science.json │ │ ├── high-school-european-history.json │ │ ├── high-school-geography.json │ │ ├── high-school-government-and-politics.json │ │ ├── high-school-macroeconomics.json │ │ ├── high-school-mathematics.json │ │ ├── high-school-microeconomics.json │ │ ├── high-school-physics.json │ │ ├── high-school-psychology.json │ │ ├── high-school-statistics.json │ │ ├── high-school-us-history.json │ │ ├── high-school-world-history.json │ │ ├── human-aging.json │ │ ├── human-sexuality.json │ │ ├── international-law.json │ │ ├── jurisprudence.json │ │ ├── logical-fallacies.json │ │ ├── machine-learning.json │ │ ├── management.json │ │ ├── marketing.json │ │ ├── medical-genetics.json │ │ ├── miscellaneous.json │ │ ├── moral-disputes.json │ │ ├── moral-scenarios.json │ │ ├── nutrition.json │ │ ├── philosophy.json │ │ ├── prehistory.json │ │ ├── professional-accounting.json │ │ ├── professional-law.json │ │ ├── professional-medicine.json │ │ ├── professional-psychology.json │ │ ├── public-relations.json │ │ ├── security-studies.json │ │ ├── sociology.json │ │ ├── us-foreign-policy.json │ │ ├── virology.json │ │ └── world-religions.json │ │ ├── ocnli │ │ └── ocnli.json │ │ ├── piqa │ │ └── piqa.json │ │ ├── rte │ │ └── rte.json │ │ ├── tydiqa │ │ └── tydiqa.json │ │ └── wic │ │ └── wic.json 
├── THUDM │ └── chatglm2-6b │ │ ├── afqmc │ │ └── afqmc.json │ │ ├── arc-c │ │ └── arc-c.json │ │ ├── arc-e │ │ └── arc-e.json │ │ ├── ax-b │ │ └── ax-b.json │ │ ├── ax-g │ │ └── ax-g.json │ │ ├── bbh │ │ ├── boolean-expressions.json │ │ ├── causal-judgement.json │ │ ├── date-understanding.json │ │ ├── disambiguation-qa.json │ │ ├── dyck-languages.json │ │ ├── formal-fallacies.json │ │ ├── geometric-shapes.json │ │ ├── hyperbaton.json │ │ ├── logical-deduction-five-objects.json │ │ ├── logical-deduction-seven-objects.json │ │ ├── logical-deduction-three-objects.json │ │ ├── movie-recommendation.json │ │ ├── multistep-arithmetic-two.json │ │ ├── navigate.json │ │ ├── object-counting.json │ │ ├── penguins-in-a-table.json │ │ ├── reasoning-about-colored-objects.json │ │ ├── ruin-names.json │ │ ├── salient-translation-error-detection.json │ │ ├── snarks.json │ │ ├── sports-understanding.json │ │ ├── temporal-sequences.json │ │ ├── tracking-shuffled-objects-five-objects.json │ │ ├── tracking-shuffled-objects-seven-objects.json │ │ ├── tracking-shuffled-objects-three-objects.json │ │ ├── web-of-lies.json │ │ └── word-sorting.json │ │ ├── boolq │ │ └── boolq.json │ │ ├── c3 │ │ ├── dialog.json │ │ └── mixed.json │ │ ├── ceval │ │ ├── accountant.json │ │ ├── advanced-mathematics.json │ │ ├── art-studies.json │ │ ├── basic-medicine.json │ │ ├── business-administration.json │ │ ├── chinese-language-and-literature.json │ │ ├── civil-servant.json │ │ ├── clinical-medicine.json │ │ ├── college-chemistry.json │ │ ├── college-economics.json │ │ ├── college-physics.json │ │ ├── college-programming.json │ │ ├── computer-architecture.json │ │ ├── computer-network.json │ │ ├── discrete-mathematics.json │ │ ├── education-science.json │ │ ├── electrical-engineer.json │ │ ├── environmental-impact-assessment-engineer.json │ │ ├── fire-engineer.json │ │ ├── high-school-biology.json │ │ ├── high-school-chemistry.json │ │ ├── high-school-chinese.json │ │ ├── high-school-geography.json │ │ 
├── high-school-history.json │ │ ├── high-school-mathematics.json │ │ ├── high-school-physics.json │ │ ├── high-school-politics.json │ │ ├── ideological-and-moral-cultivation.json │ │ ├── law.json │ │ ├── legal-professional.json │ │ ├── logic.json │ │ ├── mao-zedong-thought.json │ │ ├── marxism.json │ │ ├── metrology-engineer.json │ │ ├── middle-school-biology.json │ │ ├── middle-school-chemistry.json │ │ ├── middle-school-geography.json │ │ ├── middle-school-history.json │ │ ├── middle-school-mathematics.json │ │ ├── middle-school-physics.json │ │ ├── middle-school-politics.json │ │ ├── modern-chinese-history.json │ │ ├── operating-system.json │ │ ├── physician.json │ │ ├── plant-protection.json │ │ ├── probability-and-statistics.json │ │ ├── professional-tour-guide.json │ │ ├── sports-science.json │ │ ├── tax-accountant.json │ │ ├── teacher-qualification.json │ │ ├── urban-and-rural-planner.json │ │ └── veterinary-medicine.json │ │ ├── chid │ │ └── chid.json │ │ ├── cmmlu │ │ ├── agronomy.json │ │ ├── anatomy.json │ │ ├── ancient-chinese.json │ │ ├── arts.json │ │ ├── astronomy.json │ │ ├── business-ethics.json │ │ ├── chinese-civil-service-exam.json │ │ ├── chinese-driving-rule.json │ │ ├── chinese-food-culture.json │ │ ├── chinese-foreign-policy.json │ │ ├── chinese-history.json │ │ ├── chinese-literature.json │ │ ├── chinese-teacher-qualification.json │ │ ├── clinical-knowledge.json │ │ ├── college-actuarial-science.json │ │ ├── college-education.json │ │ ├── college-engineering-hydrology.json │ │ ├── college-law.json │ │ ├── college-mathematics.json │ │ ├── college-medical-statistics.json │ │ ├── college-medicine.json │ │ ├── computer-science.json │ │ ├── computer-security.json │ │ ├── conceptual-physics.json │ │ ├── construction-project-management.json │ │ ├── economics.json │ │ ├── education.json │ │ ├── electrical-engineering.json │ │ ├── elementary-chinese.json │ │ ├── elementary-commonsense.json │ │ ├── elementary-information-and-technology.json │ │ ├── 
elementary-mathematics.json │ │ ├── ethnology.json │ │ ├── food-science.json │ │ ├── genetics.json │ │ ├── global-facts.json │ │ ├── high-school-biology.json │ │ ├── high-school-chemistry.json │ │ ├── high-school-geography.json │ │ ├── high-school-mathematics.json │ │ ├── high-school-physics.json │ │ ├── high-school-politics.json │ │ ├── human-sexuality.json │ │ ├── international-law.json │ │ ├── journalism.json │ │ ├── jurisprudence.json │ │ ├── legal-and-moral-basis.json │ │ ├── logical.json │ │ ├── machine-learning.json │ │ ├── management.json │ │ ├── marketing.json │ │ ├── marxist-theory.json │ │ ├── modern-chinese.json │ │ ├── nutrition.json │ │ ├── philosophy.json │ │ ├── professional-accounting.json │ │ ├── professional-law.json │ │ ├── professional-medicine.json │ │ ├── professional-psychology.json │ │ ├── public-relations.json │ │ ├── security-study.json │ │ ├── sociology.json │ │ ├── sports-science.json │ │ ├── traditional-chinese-medicine.json │ │ ├── virology.json │ │ ├── world-history.json │ │ └── world-religions.json │ │ ├── copa │ │ └── copa.json │ │ ├── eprstmt │ │ └── eprstmt.json │ │ ├── gsm8k │ │ └── gsm8k.json │ │ ├── hellaswag │ │ └── hellaswag.json │ │ ├── humaneval │ │ └── humaneval.json │ │ ├── lambada │ │ └── lambada.json │ │ ├── math │ │ ├── algebra.json │ │ ├── counting-and-probability.json │ │ ├── geometry.json │ │ ├── intermediate-algebra.json │ │ ├── number-theory.json │ │ ├── prealgebra.json │ │ └── precalculus.json │ │ ├── mbpp │ │ └── mbpp.json │ │ ├── mmlu │ │ ├── abstract-algebra.json │ │ ├── anatomy.json │ │ ├── astronomy.json │ │ ├── business-ethics.json │ │ ├── clinical-knowledge.json │ │ ├── college-biology.json │ │ ├── college-chemistry.json │ │ ├── college-computer-science.json │ │ ├── college-mathematics.json │ │ ├── college-medicine.json │ │ ├── college-physics.json │ │ ├── computer-security.json │ │ ├── conceptual-physics.json │ │ ├── econometrics.json │ │ ├── electrical-engineering.json │ │ ├── 
elementary-mathematics.json │ │ ├── formal-logic.json │ │ ├── global-facts.json │ │ ├── high-school-biology.json │ │ ├── high-school-chemistry.json │ │ ├── high-school-computer-science.json │ │ ├── high-school-european-history.json │ │ ├── high-school-geography.json │ │ ├── high-school-government-and-politics.json │ │ ├── high-school-macroeconomics.json │ │ ├── high-school-mathematics.json │ │ ├── high-school-microeconomics.json │ │ ├── high-school-physics.json │ │ ├── high-school-psychology.json │ │ ├── high-school-statistics.json │ │ ├── high-school-us-history.json │ │ ├── high-school-world-history.json │ │ ├── human-aging.json │ │ ├── human-sexuality.json │ │ ├── international-law.json │ │ ├── jurisprudence.json │ │ ├── logical-fallacies.json │ │ ├── machine-learning.json │ │ ├── management.json │ │ ├── marketing.json │ │ ├── medical-genetics.json │ │ ├── miscellaneous.json │ │ ├── moral-disputes.json │ │ ├── moral-scenarios.json │ │ ├── nutrition.json │ │ ├── philosophy.json │ │ ├── prehistory.json │ │ ├── professional-accounting.json │ │ ├── professional-law.json │ │ ├── professional-medicine.json │ │ ├── professional-psychology.json │ │ ├── public-relations.json │ │ ├── security-studies.json │ │ ├── sociology.json │ │ ├── us-foreign-policy.json │ │ ├── virology.json │ │ └── world-religions.json │ │ ├── ocnli │ │ └── ocnli.json │ │ ├── piqa │ │ └── piqa.json │ │ ├── rte │ │ └── rte.json │ │ ├── tydiqa │ │ └── tydiqa.json │ │ └── wic │ │ └── wic.json ├── TigerResearch │ └── tigerbot-13b-base-v1 │ │ ├── afqmc │ │ └── afqmc.json │ │ ├── arc-c │ │ └── arc-c.json │ │ ├── arc-e │ │ └── arc-e.json │ │ ├── ax-b │ │ └── ax-b.json │ │ ├── ax-g │ │ └── ax-g.json │ │ ├── bbh │ │ ├── boolean-expressions.json │ │ ├── causal-judgement.json │ │ ├── date-understanding.json │ │ ├── disambiguation-qa.json │ │ ├── dyck-languages.json │ │ ├── formal-fallacies.json │ │ ├── geometric-shapes.json │ │ ├── hyperbaton.json │ │ ├── logical-deduction-five-objects.json │ │ ├── 
logical-deduction-seven-objects.json │ │ ├── logical-deduction-three-objects.json │ │ ├── movie-recommendation.json │ │ ├── multistep-arithmetic-two.json │ │ ├── navigate.json │ │ ├── object-counting.json │ │ ├── penguins-in-a-table.json │ │ ├── reasoning-about-colored-objects.json │ │ ├── ruin-names.json │ │ ├── salient-translation-error-detection.json │ │ ├── snarks.json │ │ ├── sports-understanding.json │ │ ├── temporal-sequences.json │ │ ├── tracking-shuffled-objects-five-objects.json │ │ ├── tracking-shuffled-objects-seven-objects.json │ │ ├── tracking-shuffled-objects-three-objects.json │ │ ├── web-of-lies.json │ │ └── word-sorting.json │ │ ├── boolq │ │ └── boolq.json │ │ ├── c3 │ │ ├── dialog.json │ │ └── mixed.json │ │ ├── ceval │ │ ├── accountant.json │ │ ├── advanced-mathematics.json │ │ ├── art-studies.json │ │ ├── basic-medicine.json │ │ ├── business-administration.json │ │ ├── chinese-language-and-literature.json │ │ ├── civil-servant.json │ │ ├── clinical-medicine.json │ │ ├── college-chemistry.json │ │ ├── college-economics.json │ │ ├── college-physics.json │ │ ├── college-programming.json │ │ ├── computer-architecture.json │ │ ├── computer-network.json │ │ ├── discrete-mathematics.json │ │ ├── education-science.json │ │ ├── electrical-engineer.json │ │ ├── environmental-impact-assessment-engineer.json │ │ ├── fire-engineer.json │ │ ├── high-school-biology.json │ │ ├── high-school-chemistry.json │ │ ├── high-school-chinese.json │ │ ├── high-school-geography.json │ │ ├── high-school-history.json │ │ ├── high-school-mathematics.json │ │ ├── high-school-physics.json │ │ ├── high-school-politics.json │ │ ├── ideological-and-moral-cultivation.json │ │ ├── law.json │ │ ├── legal-professional.json │ │ ├── logic.json │ │ ├── mao-zedong-thought.json │ │ ├── marxism.json │ │ ├── metrology-engineer.json │ │ ├── middle-school-biology.json │ │ ├── middle-school-chemistry.json │ │ ├── middle-school-geography.json │ │ ├── middle-school-history.json │ │ ├── 
middle-school-mathematics.json │ │ ├── middle-school-physics.json │ │ ├── middle-school-politics.json │ │ ├── modern-chinese-history.json │ │ ├── operating-system.json │ │ ├── physician.json │ │ ├── plant-protection.json │ │ ├── probability-and-statistics.json │ │ ├── professional-tour-guide.json │ │ ├── sports-science.json │ │ ├── tax-accountant.json │ │ ├── teacher-qualification.json │ │ ├── urban-and-rural-planner.json │ │ └── veterinary-medicine.json │ │ ├── chid │ │ └── chid.json │ │ ├── cmmlu │ │ ├── agronomy.json │ │ ├── anatomy.json │ │ ├── ancient-chinese.json │ │ ├── arts.json │ │ ├── astronomy.json │ │ ├── business-ethics.json │ │ ├── chinese-civil-service-exam.json │ │ ├── chinese-driving-rule.json │ │ ├── chinese-food-culture.json │ │ ├── chinese-foreign-policy.json │ │ ├── chinese-history.json │ │ ├── chinese-literature.json │ │ ├── chinese-teacher-qualification.json │ │ ├── clinical-knowledge.json │ │ ├── college-actuarial-science.json │ │ ├── college-education.json │ │ ├── college-engineering-hydrology.json │ │ ├── college-law.json │ │ ├── college-mathematics.json │ │ ├── college-medical-statistics.json │ │ ├── college-medicine.json │ │ ├── computer-science.json │ │ ├── computer-security.json │ │ ├── conceptual-physics.json │ │ ├── construction-project-management.json │ │ ├── economics.json │ │ ├── education.json │ │ ├── electrical-engineering.json │ │ ├── elementary-chinese.json │ │ ├── elementary-commonsense.json │ │ ├── elementary-information-and-technology.json │ │ ├── elementary-mathematics.json │ │ ├── ethnology.json │ │ ├── food-science.json │ │ ├── genetics.json │ │ ├── global-facts.json │ │ ├── high-school-biology.json │ │ ├── high-school-chemistry.json │ │ ├── high-school-geography.json │ │ ├── high-school-mathematics.json │ │ ├── high-school-physics.json │ │ ├── high-school-politics.json │ │ ├── human-sexuality.json │ │ ├── international-law.json │ │ ├── journalism.json │ │ ├── jurisprudence.json │ │ ├── legal-and-moral-basis.json │ │ ├── 
logical.json │ │ ├── machine-learning.json │ │ ├── management.json │ │ ├── marketing.json │ │ ├── marxist-theory.json │ │ ├── modern-chinese.json │ │ ├── nutrition.json │ │ ├── philosophy.json │ │ ├── professional-accounting.json │ │ ├── professional-law.json │ │ ├── professional-medicine.json │ │ ├── professional-psychology.json │ │ ├── public-relations.json │ │ ├── security-study.json │ │ ├── sociology.json │ │ ├── sports-science.json │ │ ├── traditional-chinese-medicine.json │ │ ├── virology.json │ │ ├── world-history.json │ │ └── world-religions.json │ │ ├── copa │ │ └── copa.json │ │ ├── eprstmt │ │ └── eprstmt.json │ │ ├── gsm8k │ │ └── gsm8k.json │ │ ├── hellaswag │ │ └── hellaswag.json │ │ ├── humaneval │ │ └── humaneval.json │ │ ├── lambada │ │ └── lambada.json │ │ ├── math │ │ ├── algebra.json │ │ ├── counting-and-probability.json │ │ ├── geometry.json │ │ ├── intermediate-algebra.json │ │ ├── number-theory.json │ │ ├── prealgebra.json │ │ └── precalculus.json │ │ ├── mbpp │ │ └── mbpp.json │ │ ├── mmlu │ │ ├── abstract-algebra.json │ │ ├── anatomy.json │ │ ├── astronomy.json │ │ ├── business-ethics.json │ │ ├── clinical-knowledge.json │ │ ├── college-biology.json │ │ ├── college-chemistry.json │ │ ├── college-computer-science.json │ │ ├── college-mathematics.json │ │ ├── college-medicine.json │ │ ├── college-physics.json │ │ ├── computer-security.json │ │ ├── conceptual-physics.json │ │ ├── econometrics.json │ │ ├── electrical-engineering.json │ │ ├── elementary-mathematics.json │ │ ├── formal-logic.json │ │ ├── global-facts.json │ │ ├── high-school-biology.json │ │ ├── high-school-chemistry.json │ │ ├── high-school-computer-science.json │ │ ├── high-school-european-history.json │ │ ├── high-school-geography.json │ │ ├── high-school-government-and-politics.json │ │ ├── high-school-macroeconomics.json │ │ ├── high-school-mathematics.json │ │ ├── high-school-microeconomics.json │ │ ├── high-school-physics.json │ │ ├── high-school-psychology.json │ │ ├── 
high-school-statistics.json │ │ ├── high-school-us-history.json │ │ ├── high-school-world-history.json │ │ ├── human-aging.json │ │ ├── human-sexuality.json │ │ ├── international-law.json │ │ ├── jurisprudence.json │ │ ├── logical-fallacies.json │ │ ├── machine-learning.json │ │ ├── management.json │ │ ├── marketing.json │ │ ├── medical-genetics.json │ │ ├── miscellaneous.json │ │ ├── moral-disputes.json │ │ ├── moral-scenarios.json │ │ ├── nutrition.json │ │ ├── philosophy.json │ │ ├── prehistory.json │ │ ├── professional-accounting.json │ │ ├── professional-law.json │ │ ├── professional-medicine.json │ │ ├── professional-psychology.json │ │ ├── public-relations.json │ │ ├── security-studies.json │ │ ├── sociology.json │ │ ├── us-foreign-policy.json │ │ ├── virology.json │ │ └── world-religions.json │ │ ├── ocnli │ │ └── ocnli.json │ │ ├── piqa │ │ └── piqa.json │ │ ├── rte │ │ └── rte.json │ │ ├── tydiqa │ │ └── tydiqa.json │ │ └── wic │ │ └── wic.json ├── baichuan-inc │ └── Baichuan2-13B-Chat │ │ ├── afqmc │ │ └── afqmc.json │ │ ├── arc-c │ │ └── arc-c.json │ │ ├── arc-e │ │ └── arc-e.json │ │ ├── ax-b │ │ └── ax-b.json │ │ ├── ax-g │ │ └── ax-g.json │ │ ├── bbh │ │ ├── boolean-expressions.json │ │ ├── causal-judgement.json │ │ ├── date-understanding.json │ │ ├── disambiguation-qa.json │ │ ├── dyck-languages.json │ │ ├── formal-fallacies.json │ │ ├── geometric-shapes.json │ │ ├── hyperbaton.json │ │ ├── logical-deduction-five-objects.json │ │ ├── logical-deduction-seven-objects.json │ │ ├── logical-deduction-three-objects.json │ │ ├── movie-recommendation.json │ │ ├── multistep-arithmetic-two.json │ │ ├── navigate.json │ │ ├── object-counting.json │ │ ├── penguins-in-a-table.json │ │ ├── reasoning-about-colored-objects.json │ │ ├── ruin-names.json │ │ ├── salient-translation-error-detection.json │ │ ├── snarks.json │ │ ├── sports-understanding.json │ │ ├── temporal-sequences.json │ │ ├── tracking-shuffled-objects-five-objects.json │ │ ├── 
tracking-shuffled-objects-seven-objects.json │ │ ├── tracking-shuffled-objects-three-objects.json │ │ ├── web-of-lies.json │ │ └── word-sorting.json │ │ ├── boolq │ │ └── boolq.json │ │ ├── c3 │ │ ├── dialog.json │ │ └── mixed.json │ │ ├── ceval │ │ ├── accountant.json │ │ ├── advanced-mathematics.json │ │ ├── art-studies.json │ │ ├── basic-medicine.json │ │ ├── business-administration.json │ │ ├── chinese-language-and-literature.json │ │ ├── civil-servant.json │ │ ├── clinical-medicine.json │ │ ├── college-chemistry.json │ │ ├── college-economics.json │ │ ├── college-physics.json │ │ ├── college-programming.json │ │ ├── computer-architecture.json │ │ ├── computer-network.json │ │ ├── discrete-mathematics.json │ │ ├── education-science.json │ │ ├── electrical-engineer.json │ │ ├── environmental-impact-assessment-engineer.json │ │ ├── fire-engineer.json │ │ ├── high-school-biology.json │ │ ├── high-school-chemistry.json │ │ ├── high-school-chinese.json │ │ ├── high-school-geography.json │ │ ├── high-school-history.json │ │ ├── high-school-mathematics.json │ │ ├── high-school-physics.json │ │ ├── high-school-politics.json │ │ ├── ideological-and-moral-cultivation.json │ │ ├── law.json │ │ ├── legal-professional.json │ │ ├── logic.json │ │ ├── mao-zedong-thought.json │ │ ├── marxism.json │ │ ├── metrology-engineer.json │ │ ├── middle-school-biology.json │ │ ├── middle-school-chemistry.json │ │ ├── middle-school-geography.json │ │ ├── middle-school-history.json │ │ ├── middle-school-mathematics.json │ │ ├── middle-school-physics.json │ │ ├── middle-school-politics.json │ │ ├── modern-chinese-history.json │ │ ├── operating-system.json │ │ ├── physician.json │ │ ├── plant-protection.json │ │ ├── probability-and-statistics.json │ │ ├── professional-tour-guide.json │ │ ├── sports-science.json │ │ ├── tax-accountant.json │ │ ├── teacher-qualification.json │ │ ├── urban-and-rural-planner.json │ │ └── veterinary-medicine.json │ │ ├── chid │ │ └── chid.json │ │ ├── cmmlu │ │ 
├── agronomy.json │ │ ├── anatomy.json │ │ ├── ancient-chinese.json │ │ ├── arts.json │ │ ├── astronomy.json │ │ ├── business-ethics.json │ │ ├── chinese-civil-service-exam.json │ │ ├── chinese-driving-rule.json │ │ ├── chinese-food-culture.json │ │ ├── chinese-foreign-policy.json │ │ ├── chinese-history.json │ │ ├── chinese-literature.json │ │ ├── chinese-teacher-qualification.json │ │ ├── clinical-knowledge.json │ │ ├── college-actuarial-science.json │ │ ├── college-education.json │ │ ├── college-engineering-hydrology.json │ │ ├── college-law.json │ │ ├── college-mathematics.json │ │ ├── college-medical-statistics.json │ │ ├── college-medicine.json │ │ ├── computer-science.json │ │ ├── computer-security.json │ │ ├── conceptual-physics.json │ │ ├── construction-project-management.json │ │ ├── economics.json │ │ ├── education.json │ │ ├── electrical-engineering.json │ │ ├── elementary-chinese.json │ │ ├── elementary-commonsense.json │ │ ├── elementary-information-and-technology.json │ │ ├── elementary-mathematics.json │ │ ├── ethnology.json │ │ ├── food-science.json │ │ ├── genetics.json │ │ ├── global-facts.json │ │ ├── high-school-biology.json │ │ ├── high-school-chemistry.json │ │ ├── high-school-geography.json │ │ ├── high-school-mathematics.json │ │ ├── high-school-physics.json │ │ ├── high-school-politics.json │ │ ├── human-sexuality.json │ │ ├── international-law.json │ │ ├── journalism.json │ │ ├── jurisprudence.json │ │ ├── legal-and-moral-basis.json │ │ ├── logical.json │ │ ├── machine-learning.json │ │ ├── management.json │ │ ├── marketing.json │ │ ├── marxist-theory.json │ │ ├── modern-chinese.json │ │ ├── nutrition.json │ │ ├── philosophy.json │ │ ├── professional-accounting.json │ │ ├── professional-law.json │ │ ├── professional-medicine.json │ │ ├── professional-psychology.json │ │ ├── public-relations.json │ │ ├── security-study.json │ │ ├── sociology.json │ │ ├── sports-science.json │ │ ├── traditional-chinese-medicine.json │ │ ├── virology.json │ 
│ ├── world-history.json │ │ └── world-religions.json │ │ ├── copa │ │ └── copa.json │ │ ├── eprstmt │ │ └── eprstmt.json │ │ ├── gsm8k │ │ └── gsm8k.json │ │ ├── hellaswag │ │ └── hellaswag.json │ │ ├── humaneval │ │ └── humaneval.json │ │ ├── lambada │ │ └── lambada.json │ │ ├── math │ │ ├── algebra.json │ │ ├── counting-and-probability.json │ │ ├── geometry.json │ │ ├── intermediate-algebra.json │ │ ├── number-theory.json │ │ ├── prealgebra.json │ │ └── precalculus.json │ │ ├── mbpp │ │ └── mbpp.json │ │ ├── mmlu │ │ ├── abstract-algebra.json │ │ ├── anatomy.json │ │ ├── astronomy.json │ │ ├── business-ethics.json │ │ ├── clinical-knowledge.json │ │ ├── college-biology.json │ │ ├── college-chemistry.json │ │ ├── college-computer-science.json │ │ ├── college-mathematics.json │ │ ├── college-medicine.json │ │ ├── college-physics.json │ │ ├── computer-security.json │ │ ├── conceptual-physics.json │ │ ├── econometrics.json │ │ ├── electrical-engineering.json │ │ ├── elementary-mathematics.json │ │ ├── formal-logic.json │ │ ├── global-facts.json │ │ ├── high-school-biology.json │ │ ├── high-school-chemistry.json │ │ ├── high-school-computer-science.json │ │ ├── high-school-european-history.json │ │ ├── high-school-geography.json │ │ ├── high-school-government-and-politics.json │ │ ├── high-school-macroeconomics.json │ │ ├── high-school-mathematics.json │ │ ├── high-school-microeconomics.json │ │ ├── high-school-physics.json │ │ ├── high-school-psychology.json │ │ ├── high-school-statistics.json │ │ ├── high-school-us-history.json │ │ ├── high-school-world-history.json │ │ ├── human-aging.json │ │ ├── human-sexuality.json │ │ ├── international-law.json │ │ ├── jurisprudence.json │ │ ├── logical-fallacies.json │ │ ├── machine-learning.json │ │ ├── management.json │ │ ├── marketing.json │ │ ├── medical-genetics.json │ │ ├── miscellaneous.json │ │ ├── moral-disputes.json │ │ ├── moral-scenarios.json │ │ ├── nutrition.json │ │ ├── philosophy.json │ │ ├── prehistory.json │ 
│ ├── professional-accounting.json │ │ ├── professional-law.json │ │ ├── professional-medicine.json │ │ ├── professional-psychology.json │ │ ├── public-relations.json │ │ ├── security-studies.json │ │ ├── sociology.json │ │ ├── us-foreign-policy.json │ │ ├── virology.json │ │ └── world-religions.json │ │ ├── ocnli │ │ └── ocnli.json │ │ ├── piqa │ │ └── piqa.json │ │ ├── rte │ │ └── rte.json │ │ ├── tydiqa │ │ └── tydiqa.json │ │ └── wic │ │ └── wic.json ├── internlm │ └── internlm-7b │ │ ├── afqmc │ │ └── afqmc.json │ │ ├── arc-c │ │ └── arc-c.json │ │ ├── arc-e │ │ └── arc-e.json │ │ ├── ax-b │ │ └── ax-b.json │ │ ├── ax-g │ │ └── ax-g.json │ │ ├── bbh │ │ ├── boolean-expressions.json │ │ ├── causal-judgement.json │ │ ├── date-understanding.json │ │ ├── disambiguation-qa.json │ │ ├── dyck-languages.json │ │ ├── formal-fallacies.json │ │ ├── geometric-shapes.json │ │ ├── hyperbaton.json │ │ ├── logical-deduction-five-objects.json │ │ ├── logical-deduction-seven-objects.json │ │ ├── logical-deduction-three-objects.json │ │ ├── movie-recommendation.json │ │ ├── multistep-arithmetic-two.json │ │ ├── navigate.json │ │ ├── object-counting.json │ │ ├── penguins-in-a-table.json │ │ ├── reasoning-about-colored-objects.json │ │ ├── ruin-names.json │ │ ├── salient-translation-error-detection.json │ │ ├── snarks.json │ │ ├── sports-understanding.json │ │ ├── temporal-sequences.json │ │ ├── tracking-shuffled-objects-five-objects.json │ │ ├── tracking-shuffled-objects-seven-objects.json │ │ ├── tracking-shuffled-objects-three-objects.json │ │ ├── web-of-lies.json │ │ └── word-sorting.json │ │ ├── boolq │ │ └── boolq.json │ │ ├── c3 │ │ ├── dialog.json │ │ └── mixed.json │ │ ├── ceval │ │ ├── accountant.json │ │ ├── advanced-mathematics.json │ │ ├── art-studies.json │ │ ├── basic-medicine.json │ │ ├── business-administration.json │ │ ├── chinese-language-and-literature.json │ │ ├── civil-servant.json │ │ ├── clinical-medicine.json │ │ ├── college-chemistry.json │ │ ├── 
college-economics.json │ │ ├── college-physics.json │ │ ├── college-programming.json │ │ ├── computer-architecture.json │ │ ├── computer-network.json │ │ ├── discrete-mathematics.json │ │ ├── education-science.json │ │ ├── electrical-engineer.json │ │ ├── environmental-impact-assessment-engineer.json │ │ ├── fire-engineer.json │ │ ├── high-school-biology.json │ │ ├── high-school-chemistry.json │ │ ├── high-school-chinese.json │ │ ├── high-school-geography.json │ │ ├── high-school-history.json │ │ ├── high-school-mathematics.json │ │ ├── high-school-physics.json │ │ ├── high-school-politics.json │ │ ├── ideological-and-moral-cultivation.json │ │ ├── law.json │ │ ├── legal-professional.json │ │ ├── logic.json │ │ ├── mao-zedong-thought.json │ │ ├── marxism.json │ │ ├── metrology-engineer.json │ │ ├── middle-school-biology.json │ │ ├── middle-school-chemistry.json │ │ ├── middle-school-geography.json │ │ ├── middle-school-history.json │ │ ├── middle-school-mathematics.json │ │ ├── middle-school-physics.json │ │ ├── middle-school-politics.json │ │ ├── modern-chinese-history.json │ │ ├── operating-system.json │ │ ├── physician.json │ │ ├── plant-protection.json │ │ ├── probability-and-statistics.json │ │ ├── professional-tour-guide.json │ │ ├── sports-science.json │ │ ├── tax-accountant.json │ │ ├── teacher-qualification.json │ │ ├── urban-and-rural-planner.json │ │ └── veterinary-medicine.json │ │ ├── chid │ │ └── chid.json │ │ ├── cmmlu │ │ ├── agronomy.json │ │ ├── anatomy.json │ │ ├── ancient-chinese.json │ │ ├── arts.json │ │ ├── astronomy.json │ │ ├── business-ethics.json │ │ ├── chinese-civil-service-exam.json │ │ ├── chinese-driving-rule.json │ │ ├── chinese-food-culture.json │ │ ├── chinese-foreign-policy.json │ │ ├── chinese-history.json │ │ ├── chinese-literature.json │ │ ├── chinese-teacher-qualification.json │ │ ├── clinical-knowledge.json │ │ ├── college-actuarial-science.json │ │ ├── college-education.json │ │ ├── college-engineering-hydrology.json │ │ 
├── college-law.json │ │ ├── college-mathematics.json │ │ ├── college-medical-statistics.json │ │ ├── college-medicine.json │ │ ├── computer-science.json │ │ ├── computer-security.json │ │ ├── conceptual-physics.json │ │ ├── construction-project-management.json │ │ ├── economics.json │ │ ├── education.json │ │ ├── electrical-engineering.json │ │ ├── elementary-chinese.json │ │ ├── elementary-commonsense.json │ │ ├── elementary-information-and-technology.json │ │ ├── elementary-mathematics.json │ │ ├── ethnology.json │ │ ├── food-science.json │ │ ├── genetics.json │ │ ├── global-facts.json │ │ ├── high-school-biology.json │ │ ├── high-school-chemistry.json │ │ ├── high-school-geography.json │ │ ├── high-school-mathematics.json │ │ ├── high-school-physics.json │ │ ├── high-school-politics.json │ │ ├── human-sexuality.json │ │ ├── international-law.json │ │ ├── journalism.json │ │ ├── jurisprudence.json │ │ ├── legal-and-moral-basis.json │ │ ├── logical.json │ │ ├── machine-learning.json │ │ ├── management.json │ │ ├── marketing.json │ │ ├── marxist-theory.json │ │ ├── modern-chinese.json │ │ ├── nutrition.json │ │ ├── philosophy.json │ │ ├── professional-accounting.json │ │ ├── professional-law.json │ │ ├── professional-medicine.json │ │ ├── professional-psychology.json │ │ ├── public-relations.json │ │ ├── security-study.json │ │ ├── sociology.json │ │ ├── sports-science.json │ │ ├── traditional-chinese-medicine.json │ │ ├── virology.json │ │ ├── world-history.json │ │ └── world-religions.json │ │ ├── copa │ │ └── copa.json │ │ ├── eprstmt │ │ └── eprstmt.json │ │ ├── gsm8k │ │ └── gsm8k.json │ │ ├── hellaswag │ │ └── hellaswag.json │ │ ├── humaneval │ │ └── humaneval.json │ │ ├── lambada │ │ └── lambada.json │ │ ├── math │ │ ├── algebra.json │ │ ├── counting-and-probability.json │ │ ├── geometry.json │ │ ├── intermediate-algebra.json │ │ ├── number-theory.json │ │ ├── prealgebra.json │ │ └── precalculus.json │ │ ├── mbpp │ │ └── mbpp.json │ │ ├── mmlu │ │ ├── 
abstract-algebra.json │ │ ├── anatomy.json │ │ ├── astronomy.json │ │ ├── business-ethics.json │ │ ├── clinical-knowledge.json │ │ ├── college-biology.json │ │ ├── college-chemistry.json │ │ ├── college-computer-science.json │ │ ├── college-mathematics.json │ │ ├── college-medicine.json │ │ ├── college-physics.json │ │ ├── computer-security.json │ │ ├── conceptual-physics.json │ │ ├── econometrics.json │ │ ├── electrical-engineering.json │ │ ├── elementary-mathematics.json │ │ ├── formal-logic.json │ │ ├── global-facts.json │ │ ├── high-school-biology.json │ │ ├── high-school-chemistry.json │ │ ├── high-school-computer-science.json │ │ ├── high-school-european-history.json │ │ ├── high-school-geography.json │ │ ├── high-school-government-and-politics.json │ │ ├── high-school-macroeconomics.json │ │ ├── high-school-mathematics.json │ │ ├── high-school-microeconomics.json │ │ ├── high-school-physics.json │ │ ├── high-school-psychology.json │ │ ├── high-school-statistics.json │ │ ├── high-school-us-history.json │ │ ├── high-school-world-history.json │ │ ├── human-aging.json │ │ ├── human-sexuality.json │ │ ├── international-law.json │ │ ├── jurisprudence.json │ │ ├── logical-fallacies.json │ │ ├── machine-learning.json │ │ ├── management.json │ │ ├── marketing.json │ │ ├── medical-genetics.json │ │ ├── miscellaneous.json │ │ ├── moral-disputes.json │ │ ├── moral-scenarios.json │ │ ├── nutrition.json │ │ ├── philosophy.json │ │ ├── prehistory.json │ │ ├── professional-accounting.json │ │ ├── professional-law.json │ │ ├── professional-medicine.json │ │ ├── professional-psychology.json │ │ ├── public-relations.json │ │ ├── security-studies.json │ │ ├── sociology.json │ │ ├── us-foreign-policy.json │ │ ├── virology.json │ │ └── world-religions.json │ │ ├── ocnli │ │ └── ocnli.json │ │ ├── piqa │ │ └── piqa.json │ │ ├── rte │ │ └── rte.json │ │ ├── tydiqa │ │ └── tydiqa.json │ │ └── wic │ │ └── wic.json ├── lmsys │ └── vicuna-13b-v1.5 │ │ ├── afqmc │ │ └── afqmc.json │ │ 
├── arc-c │ │ └── arc-c.json │ │ ├── arc-e │ │ └── arc-e.json │ │ ├── ax-b │ │ └── ax-b.json │ │ ├── ax-g │ │ └── ax-g.json │ │ ├── bbh │ │ ├── boolean-expressions.json │ │ ├── causal-judgement.json │ │ ├── date-understanding.json │ │ ├── disambiguation-qa.json │ │ ├── dyck-languages.json │ │ ├── formal-fallacies.json │ │ ├── geometric-shapes.json │ │ ├── hyperbaton.json │ │ ├── logical-deduction-five-objects.json │ │ ├── logical-deduction-seven-objects.json │ │ ├── logical-deduction-three-objects.json │ │ ├── movie-recommendation.json │ │ ├── multistep-arithmetic-two.json │ │ ├── navigate.json │ │ ├── object-counting.json │ │ ├── penguins-in-a-table.json │ │ ├── reasoning-about-colored-objects.json │ │ ├── ruin-names.json │ │ ├── salient-translation-error-detection.json │ │ ├── snarks.json │ │ ├── sports-understanding.json │ │ ├── temporal-sequences.json │ │ ├── tracking-shuffled-objects-five-objects.json │ │ ├── tracking-shuffled-objects-seven-objects.json │ │ ├── tracking-shuffled-objects-three-objects.json │ │ ├── web-of-lies.json │ │ └── word-sorting.json │ │ ├── boolq │ │ └── boolq.json │ │ ├── c3 │ │ ├── dialog.json │ │ └── mixed.json │ │ ├── ceval │ │ ├── accountant.json │ │ ├── advanced-mathematics.json │ │ ├── art-studies.json │ │ ├── basic-medicine.json │ │ ├── business-administration.json │ │ ├── chinese-language-and-literature.json │ │ ├── civil-servant.json │ │ ├── clinical-medicine.json │ │ ├── college-chemistry.json │ │ ├── college-economics.json │ │ ├── college-physics.json │ │ ├── college-programming.json │ │ ├── computer-architecture.json │ │ ├── computer-network.json │ │ ├── discrete-mathematics.json │ │ ├── education-science.json │ │ ├── electrical-engineer.json │ │ ├── environmental-impact-assessment-engineer.json │ │ ├── fire-engineer.json │ │ ├── high-school-biology.json │ │ ├── high-school-chemistry.json │ │ ├── high-school-chinese.json │ │ ├── high-school-geography.json │ │ ├── high-school-history.json │ │ ├── high-school-mathematics.json 
│ │ ├── high-school-physics.json │ │ ├── high-school-politics.json │ │ ├── ideological-and-moral-cultivation.json │ │ ├── law.json │ │ ├── legal-professional.json │ │ ├── logic.json │ │ ├── mao-zedong-thought.json │ │ ├── marxism.json │ │ ├── metrology-engineer.json │ │ ├── middle-school-biology.json │ │ ├── middle-school-chemistry.json │ │ ├── middle-school-geography.json │ │ ├── middle-school-history.json │ │ ├── middle-school-mathematics.json │ │ ├── middle-school-physics.json │ │ ├── middle-school-politics.json │ │ ├── modern-chinese-history.json │ │ ├── operating-system.json │ │ ├── physician.json │ │ ├── plant-protection.json │ │ ├── probability-and-statistics.json │ │ ├── professional-tour-guide.json │ │ ├── sports-science.json │ │ ├── tax-accountant.json │ │ ├── teacher-qualification.json │ │ ├── urban-and-rural-planner.json │ │ └── veterinary-medicine.json │ │ ├── chid │ │ └── chid.json │ │ ├── cmmlu │ │ ├── agronomy.json │ │ ├── anatomy.json │ │ ├── ancient-chinese.json │ │ ├── arts.json │ │ ├── astronomy.json │ │ ├── business-ethics.json │ │ ├── chinese-civil-service-exam.json │ │ ├── chinese-driving-rule.json │ │ ├── chinese-food-culture.json │ │ ├── chinese-foreign-policy.json │ │ ├── chinese-history.json │ │ ├── chinese-literature.json │ │ ├── chinese-teacher-qualification.json │ │ ├── clinical-knowledge.json │ │ ├── college-actuarial-science.json │ │ ├── college-education.json │ │ ├── college-engineering-hydrology.json │ │ ├── college-law.json │ │ ├── college-mathematics.json │ │ ├── college-medical-statistics.json │ │ ├── college-medicine.json │ │ ├── computer-science.json │ │ ├── computer-security.json │ │ ├── conceptual-physics.json │ │ ├── construction-project-management.json │ │ ├── economics.json │ │ ├── education.json │ │ ├── electrical-engineering.json │ │ ├── elementary-chinese.json │ │ ├── elementary-commonsense.json │ │ ├── elementary-information-and-technology.json │ │ ├── elementary-mathematics.json │ │ ├── ethnology.json │ │ ├── 
food-science.json │ │ ├── genetics.json │ │ ├── global-facts.json │ │ ├── high-school-biology.json │ │ ├── high-school-chemistry.json │ │ ├── high-school-geography.json │ │ ├── high-school-mathematics.json │ │ ├── high-school-physics.json │ │ ├── high-school-politics.json │ │ ├── human-sexuality.json │ │ ├── international-law.json │ │ ├── journalism.json │ │ ├── jurisprudence.json │ │ ├── legal-and-moral-basis.json │ │ ├── logical.json │ │ ├── machine-learning.json │ │ ├── management.json │ │ ├── marketing.json │ │ ├── marxist-theory.json │ │ ├── modern-chinese.json │ │ ├── nutrition.json │ │ ├── philosophy.json │ │ ├── professional-accounting.json │ │ ├── professional-law.json │ │ ├── professional-medicine.json │ │ ├── professional-psychology.json │ │ ├── public-relations.json │ │ ├── security-study.json │ │ ├── sociology.json │ │ ├── sports-science.json │ │ ├── traditional-chinese-medicine.json │ │ ├── virology.json │ │ ├── world-history.json │ │ └── world-religions.json │ │ ├── copa │ │ └── copa.json │ │ ├── eprstmt │ │ └── eprstmt.json │ │ ├── gsm8k │ │ └── gsm8k.json │ │ ├── hellaswag │ │ └── hellaswag.json │ │ ├── humaneval │ │ └── humaneval.json │ │ ├── lambada │ │ └── lambada.json │ │ ├── math │ │ ├── algebra.json │ │ ├── counting-and-probability.json │ │ ├── geometry.json │ │ ├── intermediate-algebra.json │ │ ├── number-theory.json │ │ ├── prealgebra.json │ │ └── precalculus.json │ │ ├── mbpp │ │ └── mbpp.json │ │ ├── mmlu │ │ ├── abstract-algebra.json │ │ ├── anatomy.json │ │ ├── astronomy.json │ │ ├── business-ethics.json │ │ ├── clinical-knowledge.json │ │ ├── college-biology.json │ │ ├── college-chemistry.json │ │ ├── college-computer-science.json │ │ ├── college-mathematics.json │ │ ├── college-medicine.json │ │ ├── college-physics.json │ │ ├── computer-security.json │ │ ├── conceptual-physics.json │ │ ├── econometrics.json │ │ ├── electrical-engineering.json │ │ ├── elementary-mathematics.json │ │ ├── formal-logic.json │ │ ├── global-facts.json │ │ 
├── high-school-biology.json │ │ ├── high-school-chemistry.json │ │ ├── high-school-computer-science.json │ │ ├── high-school-european-history.json │ │ ├── high-school-geography.json │ │ ├── high-school-government-and-politics.json │ │ ├── high-school-macroeconomics.json │ │ ├── high-school-mathematics.json │ │ ├── high-school-microeconomics.json │ │ ├── high-school-physics.json │ │ ├── high-school-psychology.json │ │ ├── high-school-statistics.json │ │ ├── high-school-us-history.json │ │ ├── high-school-world-history.json │ │ ├── human-aging.json │ │ ├── human-sexuality.json │ │ ├── international-law.json │ │ ├── jurisprudence.json │ │ ├── logical-fallacies.json │ │ ├── machine-learning.json │ │ ├── management.json │ │ ├── marketing.json │ │ ├── medical-genetics.json │ │ ├── miscellaneous.json │ │ ├── moral-disputes.json │ │ ├── moral-scenarios.json │ │ ├── nutrition.json │ │ ├── philosophy.json │ │ ├── prehistory.json │ │ ├── professional-accounting.json │ │ ├── professional-law.json │ │ ├── professional-medicine.json │ │ ├── professional-psychology.json │ │ ├── public-relations.json │ │ ├── security-studies.json │ │ ├── sociology.json │ │ ├── us-foreign-policy.json │ │ ├── virology.json │ │ └── world-religions.json │ │ ├── ocnli │ │ └── ocnli.json │ │ ├── piqa │ │ └── piqa.json │ │ ├── rte │ │ └── rte.json │ │ ├── tydiqa │ │ └── tydiqa.json │ │ └── wic │ │ └── wic.json ├── meta-llama │ ├── Llama-2-13b-hf │ │ ├── afqmc │ │ │ └── afqmc.json │ │ ├── arc-c │ │ │ └── arc-c.json │ │ ├── arc-e │ │ │ └── arc-e.json │ │ ├── ax-b │ │ │ └── ax-b.json │ │ ├── ax-g │ │ │ └── ax-g.json │ │ ├── bbh │ │ │ ├── boolean-expressions.json │ │ │ ├── causal-judgement.json │ │ │ ├── date-understanding.json │ │ │ ├── disambiguation-qa.json │ │ │ ├── dyck-languages.json │ │ │ ├── formal-fallacies.json │ │ │ ├── geometric-shapes.json │ │ │ ├── hyperbaton.json │ │ │ ├── logical-deduction-five-objects.json │ │ │ ├── logical-deduction-seven-objects.json │ │ │ ├── 
logical-deduction-three-objects.json │ │ │ ├── movie-recommendation.json │ │ │ ├── multistep-arithmetic-two.json │ │ │ ├── navigate.json │ │ │ ├── object-counting.json │ │ │ ├── penguins-in-a-table.json │ │ │ ├── reasoning-about-colored-objects.json │ │ │ ├── ruin-names.json │ │ │ ├── salient-translation-error-detection.json │ │ │ ├── snarks.json │ │ │ ├── sports-understanding.json │ │ │ ├── temporal-sequences.json │ │ │ ├── tracking-shuffled-objects-five-objects.json │ │ │ ├── tracking-shuffled-objects-seven-objects.json │ │ │ ├── tracking-shuffled-objects-three-objects.json │ │ │ ├── web-of-lies.json │ │ │ └── word-sorting.json │ │ ├── boolq │ │ │ └── boolq.json │ │ ├── c3 │ │ │ ├── dialog.json │ │ │ └── mixed.json │ │ ├── ceval │ │ │ ├── accountant.json │ │ │ ├── advanced-mathematics.json │ │ │ ├── art-studies.json │ │ │ ├── basic-medicine.json │ │ │ ├── business-administration.json │ │ │ ├── chinese-language-and-literature.json │ │ │ ├── civil-servant.json │ │ │ ├── clinical-medicine.json │ │ │ ├── college-chemistry.json │ │ │ ├── college-economics.json │ │ │ ├── college-physics.json │ │ │ ├── college-programming.json │ │ │ ├── computer-architecture.json │ │ │ ├── computer-network.json │ │ │ ├── discrete-mathematics.json │ │ │ ├── education-science.json │ │ │ ├── electrical-engineer.json │ │ │ ├── environmental-impact-assessment-engineer.json │ │ │ ├── fire-engineer.json │ │ │ ├── high-school-biology.json │ │ │ ├── high-school-chemistry.json │ │ │ ├── high-school-chinese.json │ │ │ ├── high-school-geography.json │ │ │ ├── high-school-history.json │ │ │ ├── high-school-mathematics.json │ │ │ ├── high-school-physics.json │ │ │ ├── high-school-politics.json │ │ │ ├── ideological-and-moral-cultivation.json │ │ │ ├── law.json │ │ │ ├── legal-professional.json │ │ │ ├── logic.json │ │ │ ├── mao-zedong-thought.json │ │ │ ├── marxism.json │ │ │ ├── metrology-engineer.json │ │ │ ├── middle-school-biology.json │ │ │ ├── middle-school-chemistry.json │ │ │ ├── 
middle-school-geography.json │ │ │ ├── middle-school-history.json │ │ │ ├── middle-school-mathematics.json │ │ │ ├── middle-school-physics.json │ │ │ ├── middle-school-politics.json │ │ │ ├── modern-chinese-history.json │ │ │ ├── operating-system.json │ │ │ ├── physician.json │ │ │ ├── plant-protection.json │ │ │ ├── probability-and-statistics.json │ │ │ ├── professional-tour-guide.json │ │ │ ├── sports-science.json │ │ │ ├── tax-accountant.json │ │ │ ├── teacher-qualification.json │ │ │ ├── urban-and-rural-planner.json │ │ │ └── veterinary-medicine.json │ │ ├── chid │ │ │ └── chid.json │ │ ├── cmmlu │ │ │ ├── agronomy.json │ │ │ ├── anatomy.json │ │ │ ├── ancient-chinese.json │ │ │ ├── arts.json │ │ │ ├── astronomy.json │ │ │ ├── business-ethics.json │ │ │ ├── chinese-civil-service-exam.json │ │ │ ├── chinese-driving-rule.json │ │ │ ├── chinese-food-culture.json │ │ │ ├── chinese-foreign-policy.json │ │ │ ├── chinese-history.json │ │ │ ├── chinese-literature.json │ │ │ ├── chinese-teacher-qualification.json │ │ │ ├── clinical-knowledge.json │ │ │ ├── college-actuarial-science.json │ │ │ ├── college-education.json │ │ │ ├── college-engineering-hydrology.json │ │ │ ├── college-law.json │ │ │ ├── college-mathematics.json │ │ │ ├── college-medical-statistics.json │ │ │ ├── college-medicine.json │ │ │ ├── computer-science.json │ │ │ ├── computer-security.json │ │ │ ├── conceptual-physics.json │ │ │ ├── construction-project-management.json │ │ │ ├── economics.json │ │ │ ├── education.json │ │ │ ├── electrical-engineering.json │ │ │ ├── elementary-chinese.json │ │ │ ├── elementary-commonsense.json │ │ │ ├── elementary-information-and-technology.json │ │ │ ├── elementary-mathematics.json │ │ │ ├── ethnology.json │ │ │ ├── food-science.json │ │ │ ├── genetics.json │ │ │ ├── global-facts.json │ │ │ ├── high-school-biology.json │ │ │ ├── high-school-chemistry.json │ │ │ ├── high-school-geography.json │ │ │ ├── high-school-mathematics.json │ │ │ ├── high-school-physics.json │ 
│ │ ├── high-school-politics.json │ │ │ ├── human-sexuality.json │ │ │ ├── international-law.json │ │ │ ├── journalism.json │ │ │ ├── jurisprudence.json │ │ │ ├── legal-and-moral-basis.json │ │ │ ├── logical.json │ │ │ ├── machine-learning.json │ │ │ ├── management.json │ │ │ ├── marketing.json │ │ │ ├── marxist-theory.json │ │ │ ├── modern-chinese.json │ │ │ ├── nutrition.json │ │ │ ├── philosophy.json │ │ │ ├── professional-accounting.json │ │ │ ├── professional-law.json │ │ │ ├── professional-medicine.json │ │ │ ├── professional-psychology.json │ │ │ ├── public-relations.json │ │ │ ├── security-study.json │ │ │ ├── sociology.json │ │ │ ├── sports-science.json │ │ │ ├── traditional-chinese-medicine.json │ │ │ ├── virology.json │ │ │ ├── world-history.json │ │ │ └── world-religions.json │ │ ├── copa │ │ │ └── copa.json │ │ ├── eprstmt │ │ │ └── eprstmt.json │ │ ├── gsm8k │ │ │ └── gsm8k.json │ │ ├── hellaswag │ │ │ └── hellaswag.json │ │ ├── humaneval │ │ │ └── humaneval.json │ │ ├── lambada │ │ │ └── lambada.json │ │ ├── math │ │ │ ├── algebra.json │ │ │ ├── counting-and-probability.json │ │ │ ├── geometry.json │ │ │ ├── intermediate-algebra.json │ │ │ ├── number-theory.json │ │ │ ├── prealgebra.json │ │ │ └── precalculus.json │ │ ├── mbpp │ │ │ └── mbpp.json │ │ ├── mmlu │ │ │ ├── abstract-algebra.json │ │ │ ├── anatomy.json │ │ │ ├── astronomy.json │ │ │ ├── business-ethics.json │ │ │ ├── clinical-knowledge.json │ │ │ ├── college-biology.json │ │ │ ├── college-chemistry.json │ │ │ ├── college-computer-science.json │ │ │ ├── college-mathematics.json │ │ │ ├── college-medicine.json │ │ │ ├── college-physics.json │ │ │ ├── computer-security.json │ │ │ ├── conceptual-physics.json │ │ │ ├── econometrics.json │ │ │ ├── electrical-engineering.json │ │ │ ├── elementary-mathematics.json │ │ │ ├── formal-logic.json │ │ │ ├── global-facts.json │ │ │ ├── high-school-biology.json │ │ │ ├── high-school-chemistry.json │ │ │ ├── high-school-computer-science.json │ │ │ ├── 
high-school-european-history.json │ │ │ ├── high-school-geography.json │ │ │ ├── high-school-government-and-politics.json │ │ │ ├── high-school-macroeconomics.json │ │ │ ├── high-school-mathematics.json │ │ │ ├── high-school-microeconomics.json │ │ │ ├── high-school-physics.json │ │ │ ├── high-school-psychology.json │ │ │ ├── high-school-statistics.json │ │ │ ├── high-school-us-history.json │ │ │ ├── high-school-world-history.json │ │ │ ├── human-aging.json │ │ │ ├── human-sexuality.json │ │ │ ├── international-law.json │ │ │ ├── jurisprudence.json │ │ │ ├── logical-fallacies.json │ │ │ ├── machine-learning.json │ │ │ ├── management.json │ │ │ ├── marketing.json │ │ │ ├── medical-genetics.json │ │ │ ├── miscellaneous.json │ │ │ ├── moral-disputes.json │ │ │ ├── moral-scenarios.json │ │ │ ├── nutrition.json │ │ │ ├── philosophy.json │ │ │ ├── prehistory.json │ │ │ ├── professional-accounting.json │ │ │ ├── professional-law.json │ │ │ ├── professional-medicine.json │ │ │ ├── professional-psychology.json │ │ │ ├── public-relations.json │ │ │ ├── security-studies.json │ │ │ ├── sociology.json │ │ │ ├── us-foreign-policy.json │ │ │ ├── virology.json │ │ │ └── world-religions.json │ │ ├── ocnli │ │ │ └── ocnli.json │ │ ├── piqa │ │ │ └── piqa.json │ │ ├── rte │ │ │ └── rte.json │ │ ├── tydiqa │ │ │ └── tydiqa.json │ │ └── wic │ │ │ └── wic.json │ ├── Llama-2-70b-chat-hf │ │ ├── afqmc │ │ │ └── afqmc.json │ │ ├── arc-c │ │ │ └── arc-c.json │ │ ├── arc-e │ │ │ └── arc-e.json │ │ ├── ax-b │ │ │ └── ax-b.json │ │ ├── ax-g │ │ │ └── ax-g.json │ │ ├── bbh │ │ │ ├── boolean-expressions.json │ │ │ ├── causal-judgement.json │ │ │ ├── date-understanding.json │ │ │ ├── disambiguation-qa.json │ │ │ ├── dyck-languages.json │ │ │ ├── formal-fallacies.json │ │ │ ├── geometric-shapes.json │ │ │ ├── hyperbaton.json │ │ │ ├── logical-deduction-five-objects.json │ │ │ ├── logical-deduction-seven-objects.json │ │ │ ├── logical-deduction-three-objects.json │ │ │ ├── movie-recommendation.json 
│ │ │ ├── multistep-arithmetic-two.json │ │ │ ├── navigate.json │ │ │ ├── object-counting.json │ │ │ ├── penguins-in-a-table.json │ │ │ ├── reasoning-about-colored-objects.json │ │ │ ├── ruin-names.json │ │ │ ├── salient-translation-error-detection.json │ │ │ ├── snarks.json │ │ │ ├── sports-understanding.json │ │ │ ├── temporal-sequences.json │ │ │ ├── tracking-shuffled-objects-five-objects.json │ │ │ ├── tracking-shuffled-objects-seven-objects.json │ │ │ ├── tracking-shuffled-objects-three-objects.json │ │ │ ├── web-of-lies.json │ │ │ └── word-sorting.json │ │ ├── boolq │ │ │ └── boolq.json │ │ ├── c3 │ │ │ ├── dialog.json │ │ │ └── mixed.json │ │ ├── ceval │ │ │ ├── accountant.json │ │ │ ├── advanced-mathematics.json │ │ │ ├── art-studies.json │ │ │ ├── basic-medicine.json │ │ │ ├── business-administration.json │ │ │ ├── chinese-language-and-literature.json │ │ │ ├── civil-servant.json │ │ │ ├── clinical-medicine.json │ │ │ ├── college-chemistry.json │ │ │ ├── college-economics.json │ │ │ ├── college-physics.json │ │ │ ├── college-programming.json │ │ │ ├── computer-architecture.json │ │ │ ├── computer-network.json │ │ │ ├── discrete-mathematics.json │ │ │ ├── education-science.json │ │ │ ├── electrical-engineer.json │ │ │ ├── environmental-impact-assessment-engineer.json │ │ │ ├── fire-engineer.json │ │ │ ├── high-school-biology.json │ │ │ ├── high-school-chemistry.json │ │ │ ├── high-school-chinese.json │ │ │ ├── high-school-geography.json │ │ │ ├── high-school-history.json │ │ │ ├── high-school-mathematics.json │ │ │ ├── high-school-physics.json │ │ │ ├── high-school-politics.json │ │ │ ├── ideological-and-moral-cultivation.json │ │ │ ├── law.json │ │ │ ├── legal-professional.json │ │ │ ├── logic.json │ │ │ ├── mao-zedong-thought.json │ │ │ ├── marxism.json │ │ │ ├── metrology-engineer.json │ │ │ ├── middle-school-biology.json │ │ │ ├── middle-school-chemistry.json │ │ │ ├── middle-school-geography.json │ │ │ ├── middle-school-history.json │ │ │ ├── 
middle-school-mathematics.json │ │ │ ├── middle-school-physics.json │ │ │ ├── middle-school-politics.json │ │ │ ├── modern-chinese-history.json │ │ │ ├── operating-system.json │ │ │ ├── physician.json │ │ │ ├── plant-protection.json │ │ │ ├── probability-and-statistics.json │ │ │ ├── professional-tour-guide.json │ │ │ ├── sports-science.json │ │ │ ├── tax-accountant.json │ │ │ ├── teacher-qualification.json │ │ │ ├── urban-and-rural-planner.json │ │ │ └── veterinary-medicine.json │ │ ├── chid │ │ │ └── chid.json │ │ ├── cmmlu │ │ │ ├── agronomy.json │ │ │ ├── anatomy.json │ │ │ ├── ancient-chinese.json │ │ │ ├── arts.json │ │ │ ├── astronomy.json │ │ │ ├── business-ethics.json │ │ │ ├── chinese-civil-service-exam.json │ │ │ ├── chinese-driving-rule.json │ │ │ ├── chinese-food-culture.json │ │ │ ├── chinese-foreign-policy.json │ │ │ ├── chinese-history.json │ │ │ ├── chinese-literature.json │ │ │ ├── chinese-teacher-qualification.json │ │ │ ├── clinical-knowledge.json │ │ │ ├── college-actuarial-science.json │ │ │ ├── college-education.json │ │ │ ├── college-engineering-hydrology.json │ │ │ ├── college-law.json │ │ │ ├── college-mathematics.json │ │ │ ├── college-medical-statistics.json │ │ │ ├── college-medicine.json │ │ │ ├── computer-science.json │ │ │ ├── computer-security.json │ │ │ ├── conceptual-physics.json │ │ │ ├── construction-project-management.json │ │ │ ├── economics.json │ │ │ ├── education.json │ │ │ ├── electrical-engineering.json │ │ │ ├── elementary-chinese.json │ │ │ ├── elementary-commonsense.json │ │ │ ├── elementary-information-and-technology.json │ │ │ ├── elementary-mathematics.json │ │ │ ├── ethnology.json │ │ │ ├── food-science.json │ │ │ ├── genetics.json │ │ │ ├── global-facts.json │ │ │ ├── high-school-biology.json │ │ │ ├── high-school-chemistry.json │ │ │ ├── high-school-geography.json │ │ │ ├── high-school-mathematics.json │ │ │ ├── high-school-physics.json │ │ │ ├── high-school-politics.json │ │ │ ├── human-sexuality.json │ │ │ ├── 
international-law.json │ │ │ ├── journalism.json │ │ │ ├── jurisprudence.json │ │ │ ├── legal-and-moral-basis.json │ │ │ ├── logical.json │ │ │ ├── machine-learning.json │ │ │ ├── management.json │ │ │ ├── marketing.json │ │ │ ├── marxist-theory.json │ │ │ ├── modern-chinese.json │ │ │ ├── nutrition.json │ │ │ ├── philosophy.json │ │ │ ├── professional-accounting.json │ │ │ ├── professional-law.json │ │ │ ├── professional-medicine.json │ │ │ ├── professional-psychology.json │ │ │ ├── public-relations.json │ │ │ ├── security-study.json │ │ │ ├── sociology.json │ │ │ ├── sports-science.json │ │ │ ├── traditional-chinese-medicine.json │ │ │ ├── virology.json │ │ │ ├── world-history.json │ │ │ └── world-religions.json │ │ ├── copa │ │ │ └── copa.json │ │ ├── eprstmt │ │ │ └── eprstmt.json │ │ ├── gsm8k │ │ │ └── gsm8k.json │ │ ├── hellaswag │ │ │ └── hellaswag.json │ │ ├── humaneval │ │ │ └── humaneval.json │ │ ├── lambada │ │ │ └── lambada.json │ │ ├── math │ │ │ ├── algebra.json │ │ │ ├── counting-and-probability.json │ │ │ ├── geometry.json │ │ │ ├── intermediate-algebra.json │ │ │ ├── number-theory.json │ │ │ ├── prealgebra.json │ │ │ └── precalculus.json │ │ ├── mbpp │ │ │ └── mbpp.json │ │ ├── mmlu │ │ │ ├── abstract-algebra.json │ │ │ ├── anatomy.json │ │ │ ├── astronomy.json │ │ │ ├── business-ethics.json │ │ │ ├── clinical-knowledge.json │ │ │ ├── college-biology.json │ │ │ ├── college-chemistry.json │ │ │ ├── college-computer-science.json │ │ │ ├── college-mathematics.json │ │ │ ├── college-medicine.json │ │ │ ├── college-physics.json │ │ │ ├── computer-security.json │ │ │ ├── conceptual-physics.json │ │ │ ├── econometrics.json │ │ │ ├── electrical-engineering.json │ │ │ ├── elementary-mathematics.json │ │ │ ├── formal-logic.json │ │ │ ├── global-facts.json │ │ │ ├── high-school-biology.json │ │ │ ├── high-school-chemistry.json │ │ │ ├── high-school-computer-science.json │ │ │ ├── high-school-european-history.json │ │ │ ├── high-school-geography.json │ │ │ 
├── high-school-government-and-politics.json │ │ │ ├── high-school-macroeconomics.json │ │ │ ├── high-school-mathematics.json │ │ │ ├── high-school-microeconomics.json │ │ │ ├── high-school-physics.json │ │ │ ├── high-school-psychology.json │ │ │ ├── high-school-statistics.json │ │ │ ├── high-school-us-history.json │ │ │ ├── high-school-world-history.json │ │ │ ├── human-aging.json │ │ │ ├── human-sexuality.json │ │ │ ├── international-law.json │ │ │ ├── jurisprudence.json │ │ │ ├── logical-fallacies.json │ │ │ ├── machine-learning.json │ │ │ ├── management.json │ │ │ ├── marketing.json │ │ │ ├── medical-genetics.json │ │ │ ├── miscellaneous.json │ │ │ ├── moral-disputes.json │ │ │ ├── moral-scenarios.json │ │ │ ├── nutrition.json │ │ │ ├── philosophy.json │ │ │ ├── prehistory.json │ │ │ ├── professional-accounting.json │ │ │ ├── professional-law.json │ │ │ ├── professional-medicine.json │ │ │ ├── professional-psychology.json │ │ │ ├── public-relations.json │ │ │ ├── security-studies.json │ │ │ ├── sociology.json │ │ │ ├── us-foreign-policy.json │ │ │ ├── virology.json │ │ │ └── world-religions.json │ │ ├── ocnli │ │ │ └── ocnli.json │ │ ├── piqa │ │ │ └── piqa.json │ │ ├── rte │ │ │ └── rte.json │ │ ├── tydiqa │ │ │ └── tydiqa.json │ │ └── wic │ │ │ └── wic.json │ ├── Llama-2-70b-hf │ │ ├── afqmc │ │ │ └── afqmc.json │ │ ├── arc-c │ │ │ └── arc-c.json │ │ ├── arc-e │ │ │ └── arc-e.json │ │ ├── ax-b │ │ │ └── ax-b.json │ │ ├── ax-g │ │ │ └── ax-g.json │ │ ├── bbh │ │ │ ├── boolean-expressions.json │ │ │ ├── causal-judgement.json │ │ │ ├── date-understanding.json │ │ │ ├── disambiguation-qa.json │ │ │ ├── dyck-languages.json │ │ │ ├── formal-fallacies.json │ │ │ ├── geometric-shapes.json │ │ │ ├── hyperbaton.json │ │ │ ├── logical-deduction-five-objects.json │ │ │ ├── logical-deduction-seven-objects.json │ │ │ ├── logical-deduction-three-objects.json │ │ │ ├── movie-recommendation.json │ │ │ ├── multistep-arithmetic-two.json │ │ │ ├── navigate.json │ │ │ ├── 
object-counting.json │ │ │ ├── penguins-in-a-table.json │ │ │ ├── reasoning-about-colored-objects.json │ │ │ ├── ruin-names.json │ │ │ ├── salient-translation-error-detection.json │ │ │ ├── snarks.json │ │ │ ├── sports-understanding.json │ │ │ ├── temporal-sequences.json │ │ │ ├── tracking-shuffled-objects-five-objects.json │ │ │ ├── tracking-shuffled-objects-seven-objects.json │ │ │ ├── tracking-shuffled-objects-three-objects.json │ │ │ ├── web-of-lies.json │ │ │ └── word-sorting.json │ │ ├── boolq │ │ │ └── boolq.json │ │ ├── c3 │ │ │ ├── dialog.json │ │ │ └── mixed.json │ │ ├── ceval │ │ │ ├── accountant.json │ │ │ ├── advanced-mathematics.json │ │ │ ├── art-studies.json │ │ │ ├── basic-medicine.json │ │ │ ├── business-administration.json │ │ │ ├── chinese-language-and-literature.json │ │ │ ├── civil-servant.json │ │ │ ├── clinical-medicine.json │ │ │ ├── college-chemistry.json │ │ │ ├── college-economics.json │ │ │ ├── college-physics.json │ │ │ ├── college-programming.json │ │ │ ├── computer-architecture.json │ │ │ ├── computer-network.json │ │ │ ├── discrete-mathematics.json │ │ │ ├── education-science.json │ │ │ ├── electrical-engineer.json │ │ │ ├── environmental-impact-assessment-engineer.json │ │ │ ├── fire-engineer.json │ │ │ ├── high-school-biology.json │ │ │ ├── high-school-chemistry.json │ │ │ ├── high-school-chinese.json │ │ │ ├── high-school-geography.json │ │ │ ├── high-school-history.json │ │ │ ├── high-school-mathematics.json │ │ │ ├── high-school-physics.json │ │ │ ├── high-school-politics.json │ │ │ ├── ideological-and-moral-cultivation.json │ │ │ ├── law.json │ │ │ ├── legal-professional.json │ │ │ ├── logic.json │ │ │ ├── mao-zedong-thought.json │ │ │ ├── marxism.json │ │ │ ├── metrology-engineer.json │ │ │ ├── middle-school-biology.json │ │ │ ├── middle-school-chemistry.json │ │ │ ├── middle-school-geography.json │ │ │ ├── middle-school-history.json │ │ │ ├── middle-school-mathematics.json │ │ │ ├── middle-school-physics.json │ │ │ ├── 
middle-school-politics.json │ │ │ ├── modern-chinese-history.json │ │ │ ├── operating-system.json │ │ │ ├── physician.json │ │ │ ├── plant-protection.json │ │ │ ├── probability-and-statistics.json │ │ │ ├── professional-tour-guide.json │ │ │ ├── sports-science.json │ │ │ ├── tax-accountant.json │ │ │ ├── teacher-qualification.json │ │ │ ├── urban-and-rural-planner.json │ │ │ └── veterinary-medicine.json │ │ ├── chid │ │ │ └── chid.json │ │ ├── cmmlu │ │ │ ├── agronomy.json │ │ │ ├── anatomy.json │ │ │ ├── ancient-chinese.json │ │ │ ├── arts.json │ │ │ ├── astronomy.json │ │ │ ├── business-ethics.json │ │ │ ├── chinese-civil-service-exam.json │ │ │ ├── chinese-driving-rule.json │ │ │ ├── chinese-food-culture.json │ │ │ ├── chinese-foreign-policy.json │ │ │ ├── chinese-history.json │ │ │ ├── chinese-literature.json │ │ │ ├── chinese-teacher-qualification.json │ │ │ ├── clinical-knowledge.json │ │ │ ├── college-actuarial-science.json │ │ │ ├── college-education.json │ │ │ ├── college-engineering-hydrology.json │ │ │ ├── college-law.json │ │ │ ├── college-mathematics.json │ │ │ ├── college-medical-statistics.json │ │ │ ├── college-medicine.json │ │ │ ├── computer-science.json │ │ │ ├── computer-security.json │ │ │ ├── conceptual-physics.json │ │ │ ├── construction-project-management.json │ │ │ ├── economics.json │ │ │ ├── education.json │ │ │ ├── electrical-engineering.json │ │ │ ├── elementary-chinese.json │ │ │ ├── elementary-commonsense.json │ │ │ ├── elementary-information-and-technology.json │ │ │ ├── elementary-mathematics.json │ │ │ ├── ethnology.json │ │ │ ├── food-science.json │ │ │ ├── genetics.json │ │ │ ├── global-facts.json │ │ │ ├── high-school-biology.json │ │ │ ├── high-school-chemistry.json │ │ │ ├── high-school-geography.json │ │ │ ├── high-school-mathematics.json │ │ │ ├── high-school-physics.json │ │ │ ├── high-school-politics.json │ │ │ ├── human-sexuality.json │ │ │ ├── international-law.json │ │ │ ├── journalism.json │ │ │ ├── jurisprudence.json 
│ │ │ ├── legal-and-moral-basis.json │ │ │ ├── logical.json │ │ │ ├── machine-learning.json │ │ │ ├── management.json │ │ │ ├── marketing.json │ │ │ ├── marxist-theory.json │ │ │ ├── modern-chinese.json │ │ │ ├── nutrition.json │ │ │ ├── philosophy.json │ │ │ ├── professional-accounting.json │ │ │ ├── professional-law.json │ │ │ ├── professional-medicine.json │ │ │ ├── professional-psychology.json │ │ │ ├── public-relations.json │ │ │ ├── security-study.json │ │ │ ├── sociology.json │ │ │ ├── sports-science.json │ │ │ ├── traditional-chinese-medicine.json │ │ │ ├── virology.json │ │ │ ├── world-history.json │ │ │ └── world-religions.json │ │ ├── copa │ │ │ └── copa.json │ │ ├── eprstmt │ │ │ └── eprstmt.json │ │ ├── gsm8k │ │ │ └── gsm8k.json │ │ ├── hellaswag │ │ │ └── hellaswag.json │ │ ├── humaneval │ │ │ └── humaneval.json │ │ ├── lambada │ │ │ └── lambada.json │ │ ├── math │ │ │ ├── algebra.json │ │ │ ├── counting-and-probability.json │ │ │ ├── geometry.json │ │ │ ├── intermediate-algebra.json │ │ │ ├── number-theory.json │ │ │ ├── prealgebra.json │ │ │ └── precalculus.json │ │ ├── mbpp │ │ │ └── mbpp.json │ │ ├── mmlu │ │ │ ├── abstract-algebra.json │ │ │ ├── anatomy.json │ │ │ ├── astronomy.json │ │ │ ├── business-ethics.json │ │ │ ├── clinical-knowledge.json │ │ │ ├── college-biology.json │ │ │ ├── college-chemistry.json │ │ │ ├── college-computer-science.json │ │ │ ├── college-mathematics.json │ │ │ ├── college-medicine.json │ │ │ ├── college-physics.json │ │ │ ├── computer-security.json │ │ │ ├── conceptual-physics.json │ │ │ ├── econometrics.json │ │ │ ├── electrical-engineering.json │ │ │ ├── elementary-mathematics.json │ │ │ ├── formal-logic.json │ │ │ ├── global-facts.json │ │ │ ├── high-school-biology.json │ │ │ ├── high-school-chemistry.json │ │ │ ├── high-school-computer-science.json │ │ │ ├── high-school-european-history.json │ │ │ ├── high-school-geography.json │ │ │ ├── high-school-government-and-politics.json │ │ │ ├── 
high-school-macroeconomics.json │ │ │ ├── high-school-mathematics.json │ │ │ ├── high-school-microeconomics.json │ │ │ ├── high-school-physics.json │ │ │ ├── high-school-psychology.json │ │ │ ├── high-school-statistics.json │ │ │ ├── high-school-us-history.json │ │ │ ├── high-school-world-history.json │ │ │ ├── human-aging.json │ │ │ ├── human-sexuality.json │ │ │ ├── international-law.json │ │ │ ├── jurisprudence.json │ │ │ ├── logical-fallacies.json │ │ │ ├── machine-learning.json │ │ │ ├── management.json │ │ │ ├── marketing.json │ │ │ ├── medical-genetics.json │ │ │ ├── miscellaneous.json │ │ │ ├── moral-disputes.json │ │ │ ├── moral-scenarios.json │ │ │ ├── nutrition.json │ │ │ ├── philosophy.json │ │ │ ├── prehistory.json │ │ │ ├── professional-accounting.json │ │ │ ├── professional-law.json │ │ │ ├── professional-medicine.json │ │ │ ├── professional-psychology.json │ │ │ ├── public-relations.json │ │ │ ├── security-studies.json │ │ │ ├── sociology.json │ │ │ ├── us-foreign-policy.json │ │ │ ├── virology.json │ │ │ └── world-religions.json │ │ ├── ocnli │ │ │ └── ocnli.json │ │ ├── piqa │ │ │ └── piqa.json │ │ ├── rte │ │ │ └── rte.json │ │ ├── tydiqa │ │ │ └── tydiqa.json │ │ └── wic │ │ │ └── wic.json │ └── Llama-2-7b-hf │ │ ├── afqmc │ │ └── afqmc.json │ │ ├── arc-c │ │ └── arc-c.json │ │ ├── arc-e │ │ └── arc-e.json │ │ ├── ax-b │ │ └── ax-b.json │ │ ├── ax-g │ │ └── ax-g.json │ │ ├── bbh │ │ ├── boolean-expressions.json │ │ ├── causal-judgement.json │ │ ├── date-understanding.json │ │ ├── disambiguation-qa.json │ │ ├── dyck-languages.json │ │ ├── formal-fallacies.json │ │ ├── geometric-shapes.json │ │ ├── hyperbaton.json │ │ ├── logical-deduction-five-objects.json │ │ ├── logical-deduction-seven-objects.json │ │ ├── logical-deduction-three-objects.json │ │ ├── movie-recommendation.json │ │ ├── multistep-arithmetic-two.json │ │ ├── navigate.json │ │ ├── object-counting.json │ │ ├── penguins-in-a-table.json │ │ ├── reasoning-about-colored-objects.json │ │ 
├── ruin-names.json │ │ ├── salient-translation-error-detection.json │ │ ├── snarks.json │ │ ├── sports-understanding.json │ │ ├── temporal-sequences.json │ │ ├── tracking-shuffled-objects-five-objects.json │ │ ├── tracking-shuffled-objects-seven-objects.json │ │ ├── tracking-shuffled-objects-three-objects.json │ │ ├── web-of-lies.json │ │ └── word-sorting.json │ │ ├── boolq │ │ └── boolq.json │ │ ├── c3 │ │ ├── dialog.json │ │ └── mixed.json │ │ ├── ceval │ │ ├── accountant.json │ │ ├── advanced-mathematics.json │ │ ├── art-studies.json │ │ ├── basic-medicine.json │ │ ├── business-administration.json │ │ ├── chinese-language-and-literature.json │ │ ├── civil-servant.json │ │ ├── clinical-medicine.json │ │ ├── college-chemistry.json │ │ ├── college-economics.json │ │ ├── college-physics.json │ │ ├── college-programming.json │ │ ├── computer-architecture.json │ │ ├── computer-network.json │ │ ├── discrete-mathematics.json │ │ ├── education-science.json │ │ ├── electrical-engineer.json │ │ ├── environmental-impact-assessment-engineer.json │ │ ├── fire-engineer.json │ │ ├── high-school-biology.json │ │ ├── high-school-chemistry.json │ │ ├── high-school-chinese.json │ │ ├── high-school-geography.json │ │ ├── high-school-history.json │ │ ├── high-school-mathematics.json │ │ ├── high-school-physics.json │ │ ├── high-school-politics.json │ │ ├── ideological-and-moral-cultivation.json │ │ ├── law.json │ │ ├── legal-professional.json │ │ ├── logic.json │ │ ├── mao-zedong-thought.json │ │ ├── marxism.json │ │ ├── metrology-engineer.json │ │ ├── middle-school-biology.json │ │ ├── middle-school-chemistry.json │ │ ├── middle-school-geography.json │ │ ├── middle-school-history.json │ │ ├── middle-school-mathematics.json │ │ ├── middle-school-physics.json │ │ ├── middle-school-politics.json │ │ ├── modern-chinese-history.json │ │ ├── operating-system.json │ │ ├── physician.json │ │ ├── plant-protection.json │ │ ├── probability-and-statistics.json │ │ ├── 
professional-tour-guide.json │ │ ├── sports-science.json │ │ ├── tax-accountant.json │ │ ├── teacher-qualification.json │ │ ├── urban-and-rural-planner.json │ │ └── veterinary-medicine.json │ │ ├── chid │ │ └── chid.json │ │ ├── cmmlu │ │ ├── agronomy.json │ │ ├── anatomy.json │ │ ├── ancient-chinese.json │ │ ├── arts.json │ │ ├── astronomy.json │ │ ├── business-ethics.json │ │ ├── chinese-civil-service-exam.json │ │ ├── chinese-driving-rule.json │ │ ├── chinese-food-culture.json │ │ ├── chinese-foreign-policy.json │ │ ├── chinese-history.json │ │ ├── chinese-literature.json │ │ ├── chinese-teacher-qualification.json │ │ ├── clinical-knowledge.json │ │ ├── college-actuarial-science.json │ │ ├── college-education.json │ │ ├── college-engineering-hydrology.json │ │ ├── college-law.json │ │ ├── college-mathematics.json │ │ ├── college-medical-statistics.json │ │ ├── college-medicine.json │ │ ├── computer-science.json │ │ ├── computer-security.json │ │ ├── conceptual-physics.json │ │ ├── construction-project-management.json │ │ ├── economics.json │ │ ├── education.json │ │ ├── electrical-engineering.json │ │ ├── elementary-chinese.json │ │ ├── elementary-commonsense.json │ │ ├── elementary-information-and-technology.json │ │ ├── elementary-mathematics.json │ │ ├── ethnology.json │ │ ├── food-science.json │ │ ├── genetics.json │ │ ├── global-facts.json │ │ ├── high-school-biology.json │ │ ├── high-school-chemistry.json │ │ ├── high-school-geography.json │ │ ├── high-school-mathematics.json │ │ ├── high-school-physics.json │ │ ├── high-school-politics.json │ │ ├── human-sexuality.json │ │ ├── international-law.json │ │ ├── journalism.json │ │ ├── jurisprudence.json │ │ ├── legal-and-moral-basis.json │ │ ├── logical.json │ │ ├── machine-learning.json │ │ ├── management.json │ │ ├── marketing.json │ │ ├── marxist-theory.json │ │ ├── modern-chinese.json │ │ ├── nutrition.json │ │ ├── philosophy.json │ │ ├── professional-accounting.json │ │ ├── professional-law.json │ │ ├── 
professional-medicine.json │ │ ├── professional-psychology.json │ │ ├── public-relations.json │ │ ├── security-study.json │ │ ├── sociology.json │ │ ├── sports-science.json │ │ ├── traditional-chinese-medicine.json │ │ ├── virology.json │ │ ├── world-history.json │ │ └── world-religions.json │ │ ├── copa │ │ └── copa.json │ │ ├── eprstmt │ │ └── eprstmt.json │ │ ├── gsm8k │ │ └── gsm8k.json │ │ ├── hellaswag │ │ └── hellaswag.json │ │ ├── humaneval │ │ └── humaneval.json │ │ ├── lambada │ │ └── lambada.json │ │ ├── math │ │ ├── algebra.json │ │ ├── counting-and-probability.json │ │ ├── geometry.json │ │ ├── intermediate-algebra.json │ │ ├── number-theory.json │ │ ├── prealgebra.json │ │ └── precalculus.json │ │ ├── mbpp │ │ └── mbpp.json │ │ ├── mmlu │ │ ├── abstract-algebra.json │ │ ├── anatomy.json │ │ ├── astronomy.json │ │ ├── business-ethics.json │ │ ├── clinical-knowledge.json │ │ ├── college-biology.json │ │ ├── college-chemistry.json │ │ ├── college-computer-science.json │ │ ├── college-mathematics.json │ │ ├── college-medicine.json │ │ ├── college-physics.json │ │ ├── computer-security.json │ │ ├── conceptual-physics.json │ │ ├── econometrics.json │ │ ├── electrical-engineering.json │ │ ├── elementary-mathematics.json │ │ ├── formal-logic.json │ │ ├── global-facts.json │ │ ├── high-school-biology.json │ │ ├── high-school-chemistry.json │ │ ├── high-school-computer-science.json │ │ ├── high-school-european-history.json │ │ ├── high-school-geography.json │ │ ├── high-school-government-and-politics.json │ │ ├── high-school-macroeconomics.json │ │ ├── high-school-mathematics.json │ │ ├── high-school-microeconomics.json │ │ ├── high-school-physics.json │ │ ├── high-school-psychology.json │ │ ├── high-school-statistics.json │ │ ├── high-school-us-history.json │ │ ├── high-school-world-history.json │ │ ├── human-aging.json │ │ ├── human-sexuality.json │ │ ├── international-law.json │ │ ├── jurisprudence.json │ │ ├── logical-fallacies.json │ │ ├── 
machine-learning.json │ │ ├── management.json │ │ ├── marketing.json │ │ ├── medical-genetics.json │ │ ├── miscellaneous.json │ │ ├── moral-disputes.json │ │ ├── moral-scenarios.json │ │ ├── nutrition.json │ │ ├── philosophy.json │ │ ├── prehistory.json │ │ ├── professional-accounting.json │ │ ├── professional-law.json │ │ ├── professional-medicine.json │ │ ├── professional-psychology.json │ │ ├── public-relations.json │ │ ├── security-studies.json │ │ ├── sociology.json │ │ ├── us-foreign-policy.json │ │ ├── virology.json │ │ └── world-religions.json │ │ ├── ocnli │ │ └── ocnli.json │ │ ├── piqa │ │ └── piqa.json │ │ ├── rte │ │ └── rte.json │ │ ├── tydiqa │ │ └── tydiqa.json │ │ └── wic │ │ └── wic.json ├── mistralai │ └── Mistral-7B-v0.1 │ │ ├── afqmc │ │ └── afqmc.json │ │ ├── arc-c │ │ └── arc-c.json │ │ ├── arc-e │ │ └── arc-e.json │ │ ├── ax-b │ │ └── ax-b.json │ │ ├── ax-g │ │ └── ax-g.json │ │ ├── bbh │ │ ├── boolean-expressions.json │ │ ├── causal-judgement.json │ │ ├── date-understanding.json │ │ ├── disambiguation-qa.json │ │ ├── dyck-languages.json │ │ ├── formal-fallacies.json │ │ ├── geometric-shapes.json │ │ ├── hyperbaton.json │ │ ├── logical-deduction-five-objects.json │ │ ├── logical-deduction-seven-objects.json │ │ ├── logical-deduction-three-objects.json │ │ ├── movie-recommendation.json │ │ ├── multistep-arithmetic-two.json │ │ ├── navigate.json │ │ ├── object-counting.json │ │ ├── penguins-in-a-table.json │ │ ├── reasoning-about-colored-objects.json │ │ ├── ruin-names.json │ │ ├── salient-translation-error-detection.json │ │ ├── snarks.json │ │ ├── sports-understanding.json │ │ ├── temporal-sequences.json │ │ ├── tracking-shuffled-objects-five-objects.json │ │ ├── tracking-shuffled-objects-seven-objects.json │ │ ├── tracking-shuffled-objects-three-objects.json │ │ ├── web-of-lies.json │ │ └── word-sorting.json │ │ ├── boolq │ │ └── boolq.json │ │ ├── c3 │ │ ├── dialog.json │ │ └── mixed.json │ │ ├── ceval │ │ ├── accountant.json │ │ ├── 
advanced-mathematics.json │ │ ├── art-studies.json │ │ ├── basic-medicine.json │ │ ├── business-administration.json │ │ ├── chinese-language-and-literature.json │ │ ├── civil-servant.json │ │ ├── clinical-medicine.json │ │ ├── college-chemistry.json │ │ ├── college-economics.json │ │ ├── college-physics.json │ │ ├── college-programming.json │ │ ├── computer-architecture.json │ │ ├── computer-network.json │ │ ├── discrete-mathematics.json │ │ ├── education-science.json │ │ ├── electrical-engineer.json │ │ ├── environmental-impact-assessment-engineer.json │ │ ├── fire-engineer.json │ │ ├── high-school-biology.json │ │ ├── high-school-chemistry.json │ │ ├── high-school-chinese.json │ │ ├── high-school-geography.json │ │ ├── high-school-history.json │ │ ├── high-school-mathematics.json │ │ ├── high-school-physics.json │ │ ├── high-school-politics.json │ │ ├── ideological-and-moral-cultivation.json │ │ ├── law.json │ │ ├── legal-professional.json │ │ ├── logic.json │ │ ├── mao-zedong-thought.json │ │ ├── marxism.json │ │ ├── metrology-engineer.json │ │ ├── middle-school-biology.json │ │ ├── middle-school-chemistry.json │ │ ├── middle-school-geography.json │ │ ├── middle-school-history.json │ │ ├── middle-school-mathematics.json │ │ ├── middle-school-physics.json │ │ ├── middle-school-politics.json │ │ ├── modern-chinese-history.json │ │ ├── operating-system.json │ │ ├── physician.json │ │ ├── plant-protection.json │ │ ├── probability-and-statistics.json │ │ ├── professional-tour-guide.json │ │ ├── sports-science.json │ │ ├── tax-accountant.json │ │ ├── teacher-qualification.json │ │ ├── urban-and-rural-planner.json │ │ └── veterinary-medicine.json │ │ ├── chid │ │ └── chid.json │ │ ├── cmmlu │ │ ├── agronomy.json │ │ ├── anatomy.json │ │ ├── ancient-chinese.json │ │ ├── arts.json │ │ ├── astronomy.json │ │ ├── business-ethics.json │ │ ├── chinese-civil-service-exam.json │ │ ├── chinese-driving-rule.json │ │ ├── chinese-food-culture.json │ │ ├── 
chinese-foreign-policy.json │ │ ├── chinese-history.json │ │ ├── chinese-literature.json │ │ ├── chinese-teacher-qualification.json │ │ ├── clinical-knowledge.json │ │ ├── college-actuarial-science.json │ │ ├── college-education.json │ │ ├── college-engineering-hydrology.json │ │ ├── college-law.json │ │ ├── college-mathematics.json │ │ ├── college-medical-statistics.json │ │ ├── college-medicine.json │ │ ├── computer-science.json │ │ ├── computer-security.json │ │ ├── conceptual-physics.json │ │ ├── construction-project-management.json │ │ ├── economics.json │ │ ├── education.json │ │ ├── electrical-engineering.json │ │ ├── elementary-chinese.json │ │ ├── elementary-commonsense.json │ │ ├── elementary-information-and-technology.json │ │ ├── elementary-mathematics.json │ │ ├── ethnology.json │ │ ├── food-science.json │ │ ├── genetics.json │ │ ├── global-facts.json │ │ ├── high-school-biology.json │ │ ├── high-school-chemistry.json │ │ ├── high-school-geography.json │ │ ├── high-school-mathematics.json │ │ ├── high-school-physics.json │ │ ├── high-school-politics.json │ │ ├── human-sexuality.json │ │ ├── international-law.json │ │ ├── journalism.json │ │ ├── jurisprudence.json │ │ ├── legal-and-moral-basis.json │ │ ├── logical.json │ │ ├── machine-learning.json │ │ ├── management.json │ │ ├── marketing.json │ │ ├── marxist-theory.json │ │ ├── modern-chinese.json │ │ ├── nutrition.json │ │ ├── philosophy.json │ │ ├── professional-accounting.json │ │ ├── professional-law.json │ │ ├── professional-medicine.json │ │ ├── professional-psychology.json │ │ ├── public-relations.json │ │ ├── security-study.json │ │ ├── sociology.json │ │ ├── sports-science.json │ │ ├── traditional-chinese-medicine.json │ │ ├── virology.json │ │ ├── world-history.json │ │ └── world-religions.json │ │ ├── copa │ │ └── copa.json │ │ ├── eprstmt │ │ └── eprstmt.json │ │ ├── gsm8k │ │ └── gsm8k.json │ │ ├── hellaswag │ │ └── hellaswag.json │ │ ├── humaneval │ │ └── humaneval.json │ │ ├── lambada │ 
│ └── lambada.json │ │ ├── math │ │ ├── algebra.json │ │ ├── counting-and-probability.json │ │ ├── geometry.json │ │ ├── intermediate-algebra.json │ │ ├── number-theory.json │ │ ├── prealgebra.json │ │ └── precalculus.json │ │ ├── mbpp │ │ └── mbpp.json │ │ ├── mmlu │ │ ├── abstract-algebra.json │ │ ├── anatomy.json │ │ ├── astronomy.json │ │ ├── business-ethics.json │ │ ├── clinical-knowledge.json │ │ ├── college-biology.json │ │ ├── college-chemistry.json │ │ ├── college-computer-science.json │ │ ├── college-mathematics.json │ │ ├── college-medicine.json │ │ ├── college-physics.json │ │ ├── computer-security.json │ │ ├── conceptual-physics.json │ │ ├── econometrics.json │ │ ├── electrical-engineering.json │ │ ├── elementary-mathematics.json │ │ ├── formal-logic.json │ │ ├── global-facts.json │ │ ├── high-school-biology.json │ │ ├── high-school-chemistry.json │ │ ├── high-school-computer-science.json │ │ ├── high-school-european-history.json │ │ ├── high-school-geography.json │ │ ├── high-school-government-and-politics.json │ │ ├── high-school-macroeconomics.json │ │ ├── high-school-mathematics.json │ │ ├── high-school-microeconomics.json │ │ ├── high-school-physics.json │ │ ├── high-school-psychology.json │ │ ├── high-school-statistics.json │ │ ├── high-school-us-history.json │ │ ├── high-school-world-history.json │ │ ├── human-aging.json │ │ ├── human-sexuality.json │ │ ├── international-law.json │ │ ├── jurisprudence.json │ │ ├── logical-fallacies.json │ │ ├── machine-learning.json │ │ ├── management.json │ │ ├── marketing.json │ │ ├── medical-genetics.json │ │ ├── miscellaneous.json │ │ ├── moral-disputes.json │ │ ├── moral-scenarios.json │ │ ├── nutrition.json │ │ ├── philosophy.json │ │ ├── prehistory.json │ │ ├── professional-accounting.json │ │ ├── professional-law.json │ │ ├── professional-medicine.json │ │ ├── professional-psychology.json │ │ ├── public-relations.json │ │ ├── security-studies.json │ │ ├── sociology.json │ │ ├── us-foreign-policy.json │ │ 
├── virology.json │ │ └── world-religions.json │ │ ├── ocnli │ │ └── ocnli.json │ │ ├── piqa │ │ └── piqa.json │ │ ├── rte │ │ └── rte.json │ │ ├── tydiqa │ │ └── tydiqa.json │ │ └── wic │ │ └── wic.json ├── stabilityai │ └── StableBeluga2 │ │ ├── afqmc │ │ └── afqmc.json │ │ ├── arc-c │ │ └── arc-c.json │ │ ├── arc-e │ │ └── arc-e.json │ │ ├── ax-b │ │ └── ax-b.json │ │ ├── ax-g │ │ └── ax-g.json │ │ ├── bbh │ │ ├── boolean-expressions.json │ │ ├── causal-judgement.json │ │ ├── date-understanding.json │ │ ├── disambiguation-qa.json │ │ ├── dyck-languages.json │ │ ├── formal-fallacies.json │ │ ├── geometric-shapes.json │ │ ├── hyperbaton.json │ │ ├── logical-deduction-five-objects.json │ │ ├── logical-deduction-seven-objects.json │ │ ├── logical-deduction-three-objects.json │ │ ├── movie-recommendation.json │ │ ├── multistep-arithmetic-two.json │ │ ├── navigate.json │ │ ├── object-counting.json │ │ ├── penguins-in-a-table.json │ │ ├── reasoning-about-colored-objects.json │ │ ├── ruin-names.json │ │ ├── salient-translation-error-detection.json │ │ ├── snarks.json │ │ ├── sports-understanding.json │ │ ├── temporal-sequences.json │ │ ├── tracking-shuffled-objects-five-objects.json │ │ ├── tracking-shuffled-objects-seven-objects.json │ │ ├── tracking-shuffled-objects-three-objects.json │ │ ├── web-of-lies.json │ │ └── word-sorting.json │ │ ├── boolq │ │ └── boolq.json │ │ ├── c3 │ │ ├── dialog.json │ │ └── mixed.json │ │ ├── ceval │ │ ├── accountant.json │ │ ├── advanced-mathematics.json │ │ ├── art-studies.json │ │ ├── basic-medicine.json │ │ ├── business-administration.json │ │ ├── chinese-language-and-literature.json │ │ ├── civil-servant.json │ │ ├── clinical-medicine.json │ │ ├── college-chemistry.json │ │ ├── college-economics.json │ │ ├── college-physics.json │ │ ├── college-programming.json │ │ ├── computer-architecture.json │ │ ├── computer-network.json │ │ ├── discrete-mathematics.json │ │ ├── education-science.json │ │ ├── electrical-engineer.json │ │ ├── 
environmental-impact-assessment-engineer.json │ │ ├── fire-engineer.json │ │ ├── high-school-biology.json │ │ ├── high-school-chemistry.json │ │ ├── high-school-chinese.json │ │ ├── high-school-geography.json │ │ ├── high-school-history.json │ │ ├── high-school-mathematics.json │ │ ├── high-school-physics.json │ │ ├── high-school-politics.json │ │ ├── ideological-and-moral-cultivation.json │ │ ├── law.json │ │ ├── legal-professional.json │ │ ├── logic.json │ │ ├── mao-zedong-thought.json │ │ ├── marxism.json │ │ ├── metrology-engineer.json │ │ ├── middle-school-biology.json │ │ ├── middle-school-chemistry.json │ │ ├── middle-school-geography.json │ │ ├── middle-school-history.json │ │ ├── middle-school-mathematics.json │ │ ├── middle-school-physics.json │ │ ├── middle-school-politics.json │ │ ├── modern-chinese-history.json │ │ ├── operating-system.json │ │ ├── physician.json │ │ ├── plant-protection.json │ │ ├── probability-and-statistics.json │ │ ├── professional-tour-guide.json │ │ ├── sports-science.json │ │ ├── tax-accountant.json │ │ ├── teacher-qualification.json │ │ ├── urban-and-rural-planner.json │ │ └── veterinary-medicine.json │ │ ├── chid │ │ └── chid.json │ │ ├── cmmlu │ │ ├── agronomy.json │ │ ├── anatomy.json │ │ ├── ancient-chinese.json │ │ ├── arts.json │ │ ├── astronomy.json │ │ ├── business-ethics.json │ │ ├── chinese-civil-service-exam.json │ │ ├── chinese-driving-rule.json │ │ ├── chinese-food-culture.json │ │ ├── chinese-foreign-policy.json │ │ ├── chinese-history.json │ │ ├── chinese-literature.json │ │ ├── chinese-teacher-qualification.json │ │ ├── clinical-knowledge.json │ │ ├── college-actuarial-science.json │ │ ├── college-education.json │ │ ├── college-engineering-hydrology.json │ │ ├── college-law.json │ │ ├── college-mathematics.json │ │ ├── college-medical-statistics.json │ │ ├── college-medicine.json │ │ ├── computer-science.json │ │ ├── computer-security.json │ │ ├── conceptual-physics.json │ │ ├── 
construction-project-management.json │ │ ├── economics.json │ │ ├── education.json │ │ ├── electrical-engineering.json │ │ ├── elementary-chinese.json │ │ ├── elementary-commonsense.json │ │ ├── elementary-information-and-technology.json │ │ ├── elementary-mathematics.json │ │ ├── ethnology.json │ │ ├── food-science.json │ │ ├── genetics.json │ │ ├── global-facts.json │ │ ├── high-school-biology.json │ │ ├── high-school-chemistry.json │ │ ├── high-school-geography.json │ │ ├── high-school-mathematics.json │ │ ├── high-school-physics.json │ │ ├── high-school-politics.json │ │ ├── human-sexuality.json │ │ ├── international-law.json │ │ ├── journalism.json │ │ ├── jurisprudence.json │ │ ├── legal-and-moral-basis.json │ │ ├── logical.json │ │ ├── machine-learning.json │ │ ├── management.json │ │ ├── marketing.json │ │ ├── marxist-theory.json │ │ ├── modern-chinese.json │ │ ├── nutrition.json │ │ ├── philosophy.json │ │ ├── professional-accounting.json │ │ ├── professional-law.json │ │ ├── professional-medicine.json │ │ ├── professional-psychology.json │ │ ├── public-relations.json │ │ ├── security-study.json │ │ ├── sociology.json │ │ ├── sports-science.json │ │ ├── traditional-chinese-medicine.json │ │ ├── virology.json │ │ ├── world-history.json │ │ └── world-religions.json │ │ ├── copa │ │ └── copa.json │ │ ├── eprstmt │ │ └── eprstmt.json │ │ ├── gsm8k │ │ └── gsm8k.json │ │ ├── hellaswag │ │ └── hellaswag.json │ │ ├── humaneval │ │ └── humaneval.json │ │ ├── lambada │ │ └── lambada.json │ │ ├── math │ │ ├── algebra.json │ │ ├── counting-and-probability.json │ │ ├── geometry.json │ │ ├── intermediate-algebra.json │ │ ├── number-theory.json │ │ ├── prealgebra.json │ │ └── precalculus.json │ │ ├── mbpp │ │ └── mbpp.json │ │ ├── mmlu │ │ ├── abstract-algebra.json │ │ ├── anatomy.json │ │ ├── astronomy.json │ │ ├── business-ethics.json │ │ ├── clinical-knowledge.json │ │ ├── college-biology.json │ │ ├── college-chemistry.json │ │ ├── college-computer-science.json │ │ 
├── college-mathematics.json │ │ ├── college-medicine.json │ │ ├── college-physics.json │ │ ├── computer-security.json │ │ ├── conceptual-physics.json │ │ ├── econometrics.json │ │ ├── electrical-engineering.json │ │ ├── elementary-mathematics.json │ │ ├── formal-logic.json │ │ ├── global-facts.json │ │ ├── high-school-biology.json │ │ ├── high-school-chemistry.json │ │ ├── high-school-computer-science.json │ │ ├── high-school-european-history.json │ │ ├── high-school-geography.json │ │ ├── high-school-government-and-politics.json │ │ ├── high-school-macroeconomics.json │ │ ├── high-school-mathematics.json │ │ ├── high-school-microeconomics.json │ │ ├── high-school-physics.json │ │ ├── high-school-psychology.json │ │ ├── high-school-statistics.json │ │ ├── high-school-us-history.json │ │ ├── high-school-world-history.json │ │ ├── human-aging.json │ │ ├── human-sexuality.json │ │ ├── international-law.json │ │ ├── jurisprudence.json │ │ ├── logical-fallacies.json │ │ ├── machine-learning.json │ │ ├── management.json │ │ ├── marketing.json │ │ ├── medical-genetics.json │ │ ├── miscellaneous.json │ │ ├── moral-disputes.json │ │ ├── moral-scenarios.json │ │ ├── nutrition.json │ │ ├── philosophy.json │ │ ├── prehistory.json │ │ ├── professional-accounting.json │ │ ├── professional-law.json │ │ ├── professional-medicine.json │ │ ├── professional-psychology.json │ │ ├── public-relations.json │ │ ├── security-studies.json │ │ ├── sociology.json │ │ ├── us-foreign-policy.json │ │ ├── virology.json │ │ └── world-religions.json │ │ ├── ocnli │ │ └── ocnli.json │ │ ├── piqa │ │ └── piqa.json │ │ ├── rte │ │ └── rte.json │ │ ├── tydiqa │ │ └── tydiqa.json │ │ └── wic │ │ └── wic.json └── yulan-team │ └── YuLan-Chat-2-13b │ ├── afqmc │ └── afqmc.json │ ├── arc-c │ └── arc-c.json │ ├── arc-e │ └── arc-e.json │ ├── ax-b │ └── ax-b.json │ ├── ax-g │ └── ax-g.json │ ├── bbh │ ├── boolean-expressions.json │ ├── causal-judgement.json │ ├── date-understanding.json │ ├── 
disambiguation-qa.json │ ├── dyck-languages.json │ ├── formal-fallacies.json │ ├── geometric-shapes.json │ ├── hyperbaton.json │ ├── logical-deduction-five-objects.json │ ├── logical-deduction-seven-objects.json │ ├── logical-deduction-three-objects.json │ ├── movie-recommendation.json │ ├── multistep-arithmetic-two.json │ ├── navigate.json │ ├── object-counting.json │ ├── penguins-in-a-table.json │ ├── reasoning-about-colored-objects.json │ ├── ruin-names.json │ ├── salient-translation-error-detection.json │ ├── snarks.json │ ├── sports-understanding.json │ ├── temporal-sequences.json │ ├── tracking-shuffled-objects-five-objects.json │ ├── tracking-shuffled-objects-seven-objects.json │ ├── tracking-shuffled-objects-three-objects.json │ ├── web-of-lies.json │ └── word-sorting.json │ ├── boolq │ └── boolq.json │ ├── c3 │ ├── dialog.json │ └── mixed.json │ ├── ceval │ ├── accountant.json │ ├── advanced-mathematics.json │ ├── art-studies.json │ ├── basic-medicine.json │ ├── business-administration.json │ ├── chinese-language-and-literature.json │ ├── civil-servant.json │ ├── clinical-medicine.json │ ├── college-chemistry.json │ ├── college-economics.json │ ├── college-physics.json │ ├── college-programming.json │ ├── computer-architecture.json │ ├── computer-network.json │ ├── discrete-mathematics.json │ ├── education-science.json │ ├── electrical-engineer.json │ ├── environmental-impact-assessment-engineer.json │ ├── fire-engineer.json │ ├── high-school-biology.json │ ├── high-school-chemistry.json │ ├── high-school-chinese.json │ ├── high-school-geography.json │ ├── high-school-history.json │ ├── high-school-mathematics.json │ ├── high-school-physics.json │ ├── high-school-politics.json │ ├── ideological-and-moral-cultivation.json │ ├── law.json │ ├── legal-professional.json │ ├── logic.json │ ├── mao-zedong-thought.json │ ├── marxism.json │ ├── metrology-engineer.json │ ├── middle-school-biology.json │ ├── middle-school-chemistry.json │ ├── 
middle-school-geography.json │ ├── middle-school-history.json │ ├── middle-school-mathematics.json │ ├── middle-school-physics.json │ ├── middle-school-politics.json │ ├── modern-chinese-history.json │ ├── operating-system.json │ ├── physician.json │ ├── plant-protection.json │ ├── probability-and-statistics.json │ ├── professional-tour-guide.json │ ├── sports-science.json │ ├── tax-accountant.json │ ├── teacher-qualification.json │ ├── urban-and-rural-planner.json │ └── veterinary-medicine.json │ ├── chid │ └── chid.json │ ├── cmmlu │ ├── agronomy.json │ ├── anatomy.json │ ├── ancient-chinese.json │ ├── arts.json │ ├── astronomy.json │ ├── business-ethics.json │ ├── chinese-civil-service-exam.json │ ├── chinese-driving-rule.json │ ├── chinese-food-culture.json │ ├── chinese-foreign-policy.json │ ├── chinese-history.json │ ├── chinese-literature.json │ ├── chinese-teacher-qualification.json │ ├── clinical-knowledge.json │ ├── college-actuarial-science.json │ ├── college-education.json │ ├── college-engineering-hydrology.json │ ├── college-law.json │ ├── college-mathematics.json │ ├── college-medical-statistics.json │ ├── college-medicine.json │ ├── computer-science.json │ ├── computer-security.json │ ├── conceptual-physics.json │ ├── construction-project-management.json │ ├── economics.json │ ├── education.json │ ├── electrical-engineering.json │ ├── elementary-chinese.json │ ├── elementary-commonsense.json │ ├── elementary-information-and-technology.json │ ├── elementary-mathematics.json │ ├── ethnology.json │ ├── food-science.json │ ├── genetics.json │ ├── global-facts.json │ ├── high-school-biology.json │ ├── high-school-chemistry.json │ ├── high-school-geography.json │ ├── high-school-mathematics.json │ ├── high-school-physics.json │ ├── high-school-politics.json │ ├── human-sexuality.json │ ├── international-law.json │ ├── journalism.json │ ├── jurisprudence.json │ ├── legal-and-moral-basis.json │ ├── logical.json │ ├── machine-learning.json │ ├── 
management.json │ ├── marketing.json │ ├── marxist-theory.json │ ├── modern-chinese.json │ ├── nutrition.json │ ├── philosophy.json │ ├── professional-accounting.json │ ├── professional-law.json │ ├── professional-medicine.json │ ├── professional-psychology.json │ ├── public-relations.json │ ├── security-study.json │ ├── sociology.json │ ├── sports-science.json │ ├── traditional-chinese-medicine.json │ ├── virology.json │ ├── world-history.json │ └── world-religions.json │ ├── copa │ └── copa.json │ ├── eprstmt │ └── eprstmt.json │ ├── gsm8k │ └── gsm8k.json │ ├── hellaswag │ └── hellaswag.json │ ├── humaneval │ └── humaneval.json │ ├── lambada │ └── lambada.json │ ├── math │ ├── algebra.json │ ├── counting-and-probability.json │ ├── geometry.json │ ├── intermediate-algebra.json │ ├── number-theory.json │ ├── prealgebra.json │ └── precalculus.json │ ├── mbpp │ └── mbpp.json │ ├── mmlu │ ├── abstract-algebra.json │ ├── anatomy.json │ ├── astronomy.json │ ├── business-ethics.json │ ├── clinical-knowledge.json │ ├── college-biology.json │ ├── college-chemistry.json │ ├── college-computer-science.json │ ├── college-mathematics.json │ ├── college-medicine.json │ ├── college-physics.json │ ├── computer-security.json │ ├── conceptual-physics.json │ ├── econometrics.json │ ├── electrical-engineering.json │ ├── elementary-mathematics.json │ ├── formal-logic.json │ ├── global-facts.json │ ├── high-school-biology.json │ ├── high-school-chemistry.json │ ├── high-school-computer-science.json │ ├── high-school-european-history.json │ ├── high-school-geography.json │ ├── high-school-government-and-politics.json │ ├── high-school-macroeconomics.json │ ├── high-school-mathematics.json │ ├── high-school-microeconomics.json │ ├── high-school-physics.json │ ├── high-school-psychology.json │ ├── high-school-statistics.json │ ├── high-school-us-history.json │ ├── high-school-world-history.json │ ├── human-aging.json │ ├── human-sexuality.json │ ├── international-law.json │ ├── 
jurisprudence.json │ ├── logical-fallacies.json │ ├── machine-learning.json │ ├── management.json │ ├── marketing.json │ ├── medical-genetics.json │ ├── miscellaneous.json │ ├── moral-disputes.json │ ├── moral-scenarios.json │ ├── nutrition.json │ ├── philosophy.json │ ├── prehistory.json │ ├── professional-accounting.json │ ├── professional-law.json │ ├── professional-medicine.json │ ├── professional-psychology.json │ ├── public-relations.json │ ├── security-studies.json │ ├── sociology.json │ ├── us-foreign-policy.json │ ├── virology.json │ └── world-religions.json │ ├── ocnli │ └── ocnli.json │ ├── piqa │ └── piqa.json │ ├── rte │ └── rte.json │ ├── tydiqa │ └── tydiqa.json │ └── wic │ └── wic.json ├── main.py ├── metrics ├── __init__.py ├── aggregator.py ├── bleu.py ├── chrf.py ├── exact_match.py ├── f1_score.py ├── function_execution.py ├── gaokaobench_match.py ├── gpt4_eval.py ├── in_match.py ├── log_prob.py ├── log_prob_mc2.py ├── prefix_match.py ├── qa_match.py └── rouge.py ├── models ├── __init__.py ├── general_model.py ├── model_params │ ├── gpt-3.5-turbo.json │ ├── gpt-4.json │ ├── vllm_beamsearch.json │ ├── vllm_logprobs.json │ ├── vllm_sample.json │ ├── vllm_sample_bbh.json │ └── vllm_sample_v1.json └── openai_model.py ├── requirements.txt ├── run_eval.sh ├── scripts ├── run_job_base.sh ├── run_vllm.sh └── run_vllm_ppl.sh ├── setup.py ├── tasks ├── __init__.py ├── eval_task.py ├── instance.py ├── postprocess.py └── view_task.py └── utils ├── __init__.py ├── request.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | RawData/ 3 | **/data/ 4 | configs/eval_config.json 5 | UltraEval.egg-info/ 6 | build/ 7 | logs/ 8 | wip-* 9 | .idea/ 10 | RawData.zip 11 | logs*/ 12 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/OpenBMB/UltraEval/5d967b4ea5725ab1252904520bcaa87b40165b4b/__init__.py -------------------------------------------------------------------------------- /datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/UltraEval/5d967b4ea5725ab1252904520bcaa87b40165b4b/datasets/__init__.py -------------------------------------------------------------------------------- /datasets/afqmc/config/afqmc_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "afqmc", 3 | "path": "datasets/afqmc/data/afqmc.jsonl", 4 | "description": "", 5 | "transform": "datasets/afqmc/transform_gen_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/afqmc/config/afqmc_ppl.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "afqmc", 3 | "path": "datasets/afqmc/data/afqmc.jsonl", 4 | "description": "", 5 | "transform": "datasets/afqmc/transform_ppl_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "loglikelihood", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "log_prob" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/afqmc/transform_ppl_v0.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str): 5 | text = ( 6 | "语句一:“" 7 | + data["passage"][0] 8 | + "\n语句二:“" 9 | + data["passage"][1] 10 | + 
"”\n语句一与语句二是关于蚂蚁金融产品的疑问,两者所询问的内容是否完全一致?" 11 | ) 12 | correct_answer = [ 13 | key for key, value in data["target_scores"].items() if value == 1 14 | ][0].strip() 15 | 16 | return {"input": text, "output": correct_answer, "processed_output": correct_answer} 17 | -------------------------------------------------------------------------------- /datasets/agieval/config/aqua-rat_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "aqua-rat", 3 | "path": "datasets/agieval/data/aqua-rat.jsonl", 4 | "description": "", 5 | "transform": "datasets/agieval/transform_gen_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "agieval_single_answer_post", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/agieval/config/aqua-rat_ppl.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "aqua-rat", 3 | "path": "datasets/agieval/data/aqua-rat.jsonl", 4 | "description": "", 5 | "transform": "datasets/agieval/transform_ppl_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "loglikelihood", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "log_prob" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/agieval/config/gaokao-biology_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "gaokao-biology", 3 | "path": "datasets/agieval/data/gaokao-biology.jsonl", 4 | "description": "", 5 | "transform": "datasets/agieval/transform_gen_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": 
"agieval_single_answer_post", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/agieval/config/gaokao-biology_ppl.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "gaokao-biology", 3 | "path": "datasets/agieval/data/gaokao-biology.jsonl", 4 | "description": "", 5 | "transform": "datasets/agieval/transform_ppl_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "loglikelihood", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "log_prob" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/agieval/config/gaokao-chemistry_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "gaokao-chemistry", 3 | "path": "datasets/agieval/data/gaokao-chemistry.jsonl", 4 | "description": "", 5 | "transform": "datasets/agieval/transform_gen_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "agieval_single_answer_post", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/agieval/config/gaokao-chemistry_ppl.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "gaokao-chemistry", 3 | "path": "datasets/agieval/data/gaokao-chemistry.jsonl", 4 | "description": "", 5 | "transform": "datasets/agieval/transform_ppl_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "loglikelihood", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "log_prob" 
16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/agieval/config/gaokao-chinese_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "gaokao-chinese", 3 | "path": "datasets/agieval/data/gaokao-chinese.jsonl", 4 | "description": "", 5 | "transform": "datasets/agieval/transform_gen_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "agieval_single_answer_post", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/agieval/config/gaokao-chinese_ppl.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "gaokao-chinese", 3 | "path": "datasets/agieval/data/gaokao-chinese.jsonl", 4 | "description": "", 5 | "transform": "datasets/agieval/transform_ppl_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "loglikelihood", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "log_prob" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/agieval/config/gaokao-english_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "gaokao-english", 3 | "path": "datasets/agieval/data/gaokao-english.jsonl", 4 | "description": "", 5 | "transform": "datasets/agieval/transform_gen_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "agieval_single_answer_post", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } 
-------------------------------------------------------------------------------- /datasets/agieval/config/gaokao-english_ppl.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "gaokao-english", 3 | "path": "datasets/agieval/data/gaokao-english.jsonl", 4 | "description": "", 5 | "transform": "datasets/agieval/transform_ppl_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "loglikelihood", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "log_prob" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/agieval/config/gaokao-geography_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "gaokao-geography", 3 | "path": "datasets/agieval/data/gaokao-geography.jsonl", 4 | "description": "", 5 | "transform": "datasets/agieval/transform_gen_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "agieval_single_answer_post", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/agieval/config/gaokao-geography_ppl.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "gaokao-geography", 3 | "path": "datasets/agieval/data/gaokao-geography.jsonl", 4 | "description": "", 5 | "transform": "datasets/agieval/transform_ppl_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "loglikelihood", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "log_prob" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- 
/datasets/agieval/config/gaokao-history_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "gaokao-history", 3 | "path": "datasets/agieval/data/gaokao-history.jsonl", 4 | "description": "", 5 | "transform": "datasets/agieval/transform_gen_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "agieval_single_answer_post", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/agieval/config/gaokao-history_ppl.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "gaokao-history", 3 | "path": "datasets/agieval/data/gaokao-history.jsonl", 4 | "description": "", 5 | "transform": "datasets/agieval/transform_ppl_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "loglikelihood", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "log_prob" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/agieval/config/gaokao-mathcloze_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "gaokao-mathcloze", 3 | "path": "datasets/agieval/data/gaokao-mathcloze.jsonl", 4 | "description": "", 5 | "transform": "datasets/agieval/transform_gen_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "agieval_cloze_post", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "exact_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/agieval/config/gaokao-mathqa_gen.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "task_name": "gaokao-mathqa", 3 | "path": "datasets/agieval/data/gaokao-mathqa.jsonl", 4 | "description": "", 5 | "transform": "datasets/agieval/transform_gen_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "agieval_single_answer_post", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/agieval/config/gaokao-mathqa_ppl.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "gaokao-mathqa", 3 | "path": "datasets/agieval/data/gaokao-mathqa.jsonl", 4 | "description": "", 5 | "transform": "datasets/agieval/transform_ppl_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "loglikelihood", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "log_prob" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/agieval/config/gaokao-physics_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "gaokao-physics", 3 | "path": "datasets/agieval/data/gaokao-physics.jsonl", 4 | "description": "", 5 | "transform": "datasets/agieval/transform_gen_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "agieval_multiple_answer_post", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "exact_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/agieval/config/jec-qa-ca_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": 
"jec-qa-ca", 3 | "path": "datasets/agieval/data/jec-qa-ca.jsonl", 4 | "description": "", 5 | "transform": "datasets/agieval/transform_gen_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "agieval_multiple_answer_post", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "exact_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/agieval/config/jec-qa-kd_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "jec-qa-kd", 3 | "path": "datasets/agieval/data/jec-qa-kd.jsonl", 4 | "description": "", 5 | "transform": "datasets/agieval/transform_gen_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "agieval_multiple_answer_post", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "exact_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/agieval/config/logiqa-en_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "logiqa-en", 3 | "path": "datasets/agieval/data/logiqa-en.jsonl", 4 | "description": "", 5 | "transform": "datasets/agieval/transform_gen_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "agieval_single_answer_post", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/agieval/config/logiqa-en_ppl.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "logiqa-en", 3 | "path": "datasets/agieval/data/logiqa-en.jsonl", 4 | "description": "", 5 | "transform": 
"datasets/agieval/transform_ppl_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "loglikelihood", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "log_prob" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/agieval/config/logiqa-zh_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "logiqa-zh", 3 | "path": "datasets/agieval/data/logiqa-zh.jsonl", 4 | "description": "", 5 | "transform": "datasets/agieval/transform_gen_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "agieval_single_answer_post", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/agieval/config/logiqa-zh_ppl.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "logiqa-zh", 3 | "path": "datasets/agieval/data/logiqa-zh.jsonl", 4 | "description": "", 5 | "transform": "datasets/agieval/transform_ppl_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "loglikelihood", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "log_prob" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/agieval/config/lsat-ar_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "lsat-ar", 3 | "path": "datasets/agieval/data/lsat-ar.jsonl", 4 | "description": "", 5 | "transform": "datasets/agieval/transform_gen_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "agieval_single_answer_post", 
12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/agieval/config/lsat-ar_ppl.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "lsat-ar", 3 | "path": "datasets/agieval/data/lsat-ar.jsonl", 4 | "description": "", 5 | "transform": "datasets/agieval/transform_ppl_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "loglikelihood", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "log_prob" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/agieval/config/lsat-lr_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "lsat-lr", 3 | "path": "datasets/agieval/data/lsat-lr.jsonl", 4 | "description": "", 5 | "transform": "datasets/agieval/transform_gen_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "agieval_single_answer_post", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/agieval/config/lsat-lr_ppl.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "lsat-lr", 3 | "path": "datasets/agieval/data/lsat-lr.jsonl", 4 | "description": "", 5 | "transform": "datasets/agieval/transform_ppl_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "loglikelihood", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "log_prob" 16 | } 17 | } 18 | } 19 | } 
-------------------------------------------------------------------------------- /datasets/agieval/config/lsat-rc_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "lsat-rc", 3 | "path": "datasets/agieval/data/lsat-rc.jsonl", 4 | "description": "", 5 | "transform": "datasets/agieval/transform_gen_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "agieval_single_answer_post", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/agieval/config/lsat-rc_ppl.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "lsat-rc", 3 | "path": "datasets/agieval/data/lsat-rc.jsonl", 4 | "description": "", 5 | "transform": "datasets/agieval/transform_ppl_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "loglikelihood", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "log_prob" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/agieval/config/math_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "math", 3 | "path": "datasets/agieval/data/math.jsonl", 4 | "description": "", 5 | "transform": "datasets/agieval/transform_gen_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "agieval_cloze_post", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "exact_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/agieval/config/sat-en-without-passage_ppl.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "task_name": "sat-en-without-passage", 3 | "path": "datasets/agieval/data/sat-en-without-passage.jsonl", 4 | "description": "", 5 | "transform": "datasets/agieval/transform_ppl_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "loglikelihood", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "log_prob" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/agieval/config/sat-en_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "sat-en", 3 | "path": "datasets/agieval/data/sat-en.jsonl", 4 | "description": "", 5 | "transform": "datasets/agieval/transform_gen_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "agieval_single_answer_post", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/agieval/config/sat-en_ppl.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "sat-en", 3 | "path": "datasets/agieval/data/sat-en.jsonl", 4 | "description": "", 5 | "transform": "datasets/agieval/transform_ppl_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "loglikelihood", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "log_prob" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/agieval/config/sat-math_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "sat-math", 3 | "path": 
"datasets/agieval/data/sat-math.jsonl", 4 | "description": "", 5 | "transform": "datasets/agieval/transform_gen_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "agieval_single_answer_post", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/agieval/config/sat-math_ppl.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "sat-math", 3 | "path": "datasets/agieval/data/sat-math.jsonl", 4 | "description": "", 5 | "transform": "datasets/agieval/transform_ppl_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "loglikelihood", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "log_prob" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/arc-c/config/arc-c_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "arc-c", 3 | "path": "datasets/arc-c/data/arc-c.jsonl", 4 | "description": "", 5 | "transform": "datasets/arc-c/transform_gen_v1.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "qa_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/arc-c/config/arc-c_ppl.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "arc-c", 3 | "path": "datasets/arc-c/data/arc-c.jsonl", 4 | "description": "", 5 | "transform": "datasets/arc-c/transform_ppl_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "loglikelihood", 9 | "params": "" 10 | }, 11 
| "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "log_prob" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/arc-c/transform_ppl_v0.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str): 5 | text = "Question: " + data["question"] + "\n" 6 | text += "Answer: " 7 | correct_answer = [ 8 | key for key, value in data["target_scores"].items() if value == 1 9 | ][0].strip() 10 | return {"input": text, "output": correct_answer, "processed_output": correct_answer} 11 | -------------------------------------------------------------------------------- /datasets/arc-c/transform_ppl_v1.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str): 5 | question = f"Question:\n{data['question']}\n" 6 | answer_prompt = f"Answer:\n" 7 | text = question + answer_prompt 8 | processed_correct_answer = correct_answer = [ 9 | key for key, value in data["target_scores"].items() if value == 1 10 | ][0].strip() 11 | return { 12 | "input": text, 13 | "output": correct_answer, 14 | "processed_output": processed_correct_answer, 15 | } 16 | -------------------------------------------------------------------------------- /datasets/arc-e/config/arc-e_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "arc-e", 3 | "path": "datasets/arc-e/data/arc-e.jsonl", 4 | "description": "", 5 | "transform": "datasets/arc-e/transform_gen_v1.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "qa_match" 16 | } 17 | } 18 | } 19 | } 
-------------------------------------------------------------------------------- /datasets/arc-e/config/arc-e_ppl.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "arc-e", 3 | "path": "datasets/arc-e/data/arc-e.jsonl", 4 | "description": "", 5 | "transform": "datasets/arc-e/transform_ppl_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "loglikelihood", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "log_prob" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/arc-e/transform_ppl_v0.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str): 5 | text = "Question: " + data["question"] + "\n" 6 | text += "Answer: " 7 | correct_answer = [ 8 | key for key, value in data["target_scores"].items() if value == 1 9 | ][0].strip() 10 | 11 | return {"input": text, "output": correct_answer, "processed_output": correct_answer} 12 | -------------------------------------------------------------------------------- /datasets/arc-e/transform_ppl_v1.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str): 5 | question = f"Question:\n{data['question']}\n" 6 | answer_prompt = f"Answer:\n" 7 | text = question + answer_prompt 8 | processed_correct_answer = correct_answer = [ 9 | key for key, value in data["target_scores"].items() if value == 1 10 | ][0].strip() 11 | return { 12 | "input": text, 13 | "output": correct_answer, 14 | "processed_output": processed_correct_answer, 15 | } 16 | -------------------------------------------------------------------------------- /datasets/ax-b/config/ax-b_gen.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "task_name": "ax-b", 3 | "path": "datasets/ax-b/data/ax-b.jsonl", 4 | "description": "", 5 | "transform": "datasets/ax-b/transform_gen_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/ax-b/config/ax-b_ppl.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "ax-b", 3 | "path": "datasets/ax-b/data/ax-b.jsonl", 4 | "description": "", 5 | "transform": "datasets/ax-b/transform_ppl_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "loglikelihood", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "log_prob" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/ax-b/transform_gen_v0.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str): 5 | text = f"{data['passage'][0]}\n{data['passage'][1]}\nIs the sentence below entailed by the sentence above?\nA. Yes\nB. 
No\nAnswer: " 6 | index_of_correct_answer = list(data["target_scores"].values()).index(1) 7 | answers = ["A", "B"] 8 | correct_answer = answers[index_of_correct_answer] 9 | 10 | return {"input": text, "output": correct_answer, "processed_output": correct_answer} 11 | -------------------------------------------------------------------------------- /datasets/ax-b/transform_ppl_v0.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str): 5 | text = ( 6 | data["passage"][0] 7 | + "\n" 8 | + data["passage"][1] 9 | + "\nIs the sentence below entailed by the sentence above?\n" 10 | ) 11 | correct_answer = [ 12 | key for key, value in data["target_scores"].items() if value == 1 13 | ][0].strip() 14 | 15 | return {"input": text, "output": correct_answer, "processed_output": correct_answer} 16 | -------------------------------------------------------------------------------- /datasets/ax-g/config/ax-g_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "ax-g", 3 | "path": "datasets/ax-g/data/ax-g.jsonl", 4 | "description": "", 5 | "transform": "datasets/ax-g/transform_gen_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/ax-g/config/ax-g_ppl.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "ax-g", 3 | "path": "datasets/ax-g/data/ax-g.jsonl", 4 | "description": "", 5 | "transform": "datasets/ax-g/transform_ppl_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "loglikelihood", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | 
"accuracy": { 14 | "evaluation": { 15 | "type": "log_prob" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/ax-g/transform_gen_v0.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str): 5 | text = f"{data['passage'][0]}\n{data['passage'][1]}\nIs the sentence below entailed by the sentence above?\nA. Yes\nB. No\nAnswer: " 6 | index_of_correct_answer = list(data["target_scores"].values()).index(1) 7 | answers = ["A", "B"] 8 | correct_answer = answers[index_of_correct_answer] 9 | 10 | return {"input": text, "output": correct_answer, "processed_output": correct_answer} 11 | -------------------------------------------------------------------------------- /datasets/ax-g/transform_ppl_v0.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str): 5 | text = ( 6 | data["passage"][0] 7 | + "\n" 8 | + data["passage"][1] 9 | + "\nIs the sentence below entailed by the sentence above?\n" 10 | ) 11 | correct_answer = [ 12 | key for key, value in data["target_scores"].items() if value == 1 13 | ][0].strip() 14 | 15 | return {"input": text, "output": correct_answer, "processed_output": correct_answer} 16 | -------------------------------------------------------------------------------- /datasets/bbh-cot/config/hyperbaton_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "hyperbaton", 3 | "path": "datasets/bbh-cot/data/hyperbaton.jsonl", 4 | "description": "", 5 | "transform": "datasets/bbh-cot/transform_gen_cot.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "generate", 9 | "params": "models/model_params/vllm_sample_bbh.json" 10 | }, 11 | "postprocess": "bbh_post", 12 | "metric": { 
13 | "accuracy": { 14 | "evaluation": { 15 | "type": "in_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/bbh-cot/config/navigate_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "navigate", 3 | "path": "datasets/bbh-cot/data/navigate.jsonl", 4 | "description": "", 5 | "transform": "datasets/bbh-cot/transform_gen_cot.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "generate", 9 | "params": "models/model_params/vllm_sample_bbh.json" 10 | }, 11 | "postprocess": "bbh_post", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "in_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/bbh-cot/config/ruin-names_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "ruin-names", 3 | "path": "datasets/bbh-cot/data/ruin-names.jsonl", 4 | "description": "", 5 | "transform": "datasets/bbh-cot/transform_gen_cot.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "generate", 9 | "params": "models/model_params/vllm_sample_bbh.json" 10 | }, 11 | "postprocess": "bbh_post", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "in_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/bbh-cot/config/snarks_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "snarks", 3 | "path": "datasets/bbh-cot/data/snarks.jsonl", 4 | "description": "", 5 | "transform": "datasets/bbh-cot/transform_gen_cot.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "generate", 9 | "params": "models/model_params/vllm_sample_bbh.json" 10 | }, 11 | "postprocess": "bbh_post", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "in_match" 16 | } 17 
| } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/bbh-cot/config/web-of-lies_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "web-of-lies", 3 | "path": "datasets/bbh-cot/data/web-of-lies.jsonl", 4 | "description": "", 5 | "transform": "datasets/bbh-cot/transform_gen_cot.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "generate", 9 | "params": "models/model_params/vllm_sample_bbh.json" 10 | }, 11 | "postprocess": "bbh_post", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "in_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/bbh-cot/config/word-sorting_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "word-sorting", 3 | "path": "datasets/bbh-cot/data/word-sorting.jsonl", 4 | "description": "", 5 | "transform": "datasets/bbh-cot/transform_gen_cot.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "generate", 9 | "params": "models/model_params/vllm_sample_bbh.json" 10 | }, 11 | "postprocess": "bbh_post", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "in_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/bbh/config/boolean-expressions_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "boolean-expressions", 3 | "path": "datasets/bbh/data/boolean-expressions.jsonl", 4 | "description": "", 5 | "transform": "datasets/bbh/transform_gen_v0.py", 6 | "fewshot": 3, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } 
-------------------------------------------------------------------------------- /datasets/bbh/config/causal-judgement_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "causal-judgement", 3 | "path": "datasets/bbh/data/causal-judgement.jsonl", 4 | "description": "", 5 | "transform": "datasets/bbh/transform_gen_v0.py", 6 | "fewshot": 3, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/bbh/config/date-understanding_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "date-understanding", 3 | "path": "datasets/bbh/data/date-understanding.jsonl", 4 | "description": "", 5 | "transform": "datasets/bbh/transform_gen_v0.py", 6 | "fewshot": 3, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/bbh/config/disambiguation-qa_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "disambiguation-qa", 3 | "path": "datasets/bbh/data/disambiguation-qa.jsonl", 4 | "description": "", 5 | "transform": "datasets/bbh/transform_gen_v0.py", 6 | "fewshot": 3, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/bbh/config/dyck-languages_gen.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "task_name": "dyck-languages", 3 | "path": "datasets/bbh/data/dyck-languages.jsonl", 4 | "description": "", 5 | "transform": "datasets/bbh/transform_gen_v0.py", 6 | "fewshot": 3, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "bbh_post", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "exact_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/bbh/config/formal-fallacies_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "formal-fallacies", 3 | "path": "datasets/bbh/data/formal-fallacies.jsonl", 4 | "description": "", 5 | "transform": "datasets/bbh/transform_gen_v0.py", 6 | "fewshot": 3, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/bbh/config/geometric-shapes_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "geometric-shapes", 3 | "path": "datasets/bbh/data/geometric-shapes.jsonl", 4 | "description": "", 5 | "transform": "datasets/bbh/transform_gen_v0.py", 6 | "fewshot": 3, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/bbh/config/hyperbaton_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "hyperbaton", 3 | "path": "datasets/bbh/data/hyperbaton.jsonl", 4 | 
"description": "", 5 | "transform": "datasets/bbh/transform_gen_v0.py", 6 | "fewshot": 3, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/bbh/config/logical-deduction-five-objects_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "logical-deduction-five-objects", 3 | "path": "datasets/bbh/data/logical-deduction-five-objects.jsonl", 4 | "description": "", 5 | "transform": "datasets/bbh/transform_gen_v0.py", 6 | "fewshot": 3, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/bbh/config/logical-deduction-seven-objects_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "logical-deduction-seven-objects", 3 | "path": "datasets/bbh/data/logical-deduction-seven-objects.jsonl", 4 | "description": "", 5 | "transform": "datasets/bbh/transform_gen_v0.py", 6 | "fewshot": 3, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/bbh/config/logical-deduction-three-objects_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "logical-deduction-three-objects", 3 | "path": "datasets/bbh/data/logical-deduction-three-objects.jsonl", 4 | "description": "", 5 | "transform": 
"datasets/bbh/transform_gen_v0.py", 6 | "fewshot": 3, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/bbh/config/movie-recommendation_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "movie-recommendation", 3 | "path": "datasets/bbh/data/movie-recommendation.jsonl", 4 | "description": "", 5 | "transform": "datasets/bbh/transform_gen_v0.py", 6 | "fewshot": 3, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/bbh/config/multistep-arithmetic-two_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "multistep-arithmetic-two", 3 | "path": "datasets/bbh/data/multistep-arithmetic-two.jsonl", 4 | "description": "", 5 | "transform": "datasets/bbh/transform_gen_v0.py", 6 | "fewshot": 3, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "bbh_post", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "exact_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/bbh/config/navigate_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "navigate", 3 | "path": "datasets/bbh/data/navigate.jsonl", 4 | "description": "", 5 | "transform": "datasets/bbh/transform_gen_v0.py", 6 | "fewshot": 3, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | 
"metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/bbh/config/object-counting_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "object-counting", 3 | "path": "datasets/bbh/data/object-counting.jsonl", 4 | "description": "", 5 | "transform": "datasets/bbh/transform_gen_v0.py", 6 | "fewshot": 3, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "bbh_post", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "exact_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/bbh/config/penguins-in-a-table_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "penguins-in-a-table", 3 | "path": "datasets/bbh/data/penguins-in-a-table.jsonl", 4 | "description": "", 5 | "transform": "datasets/bbh/transform_gen_v0.py", 6 | "fewshot": 3, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/bbh/config/reasoning-about-colored-objects_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "reasoning-about-colored-objects", 3 | "path": "datasets/bbh/data/reasoning-about-colored-objects.jsonl", 4 | "description": "", 5 | "transform": "datasets/bbh/transform_gen_v0.py", 6 | "fewshot": 3, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } 
-------------------------------------------------------------------------------- /datasets/bbh/config/ruin-names_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "ruin-names", 3 | "path": "datasets/bbh/data/ruin-names.jsonl", 4 | "description": "", 5 | "transform": "datasets/bbh/transform_gen_v0.py", 6 | "fewshot": 3, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/bbh/config/snarks_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "snarks", 3 | "path": "datasets/bbh/data/snarks.jsonl", 4 | "description": "", 5 | "transform": "datasets/bbh/transform_gen_v0.py", 6 | "fewshot": 3, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/bbh/config/sports-understanding_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "sports-understanding", 3 | "path": "datasets/bbh/data/sports-understanding.jsonl", 4 | "description": "", 5 | "transform": "datasets/bbh/transform_gen_v0.py", 6 | "fewshot": 3, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/bbh/config/temporal-sequences_gen.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "task_name": "temporal-sequences", 3 | "path": "datasets/bbh/data/temporal-sequences.jsonl", 4 | "description": "", 5 | "transform": "datasets/bbh/transform_gen_v0.py", 6 | "fewshot": 3, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/bbh/config/web-of-lies_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "web-of-lies", 3 | "path": "datasets/bbh/data/web-of-lies.jsonl", 4 | "description": "", 5 | "transform": "datasets/bbh/transform_gen_v0.py", 6 | "fewshot": 3, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/bbh/config/word-sorting_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "word-sorting", 3 | "path": "datasets/bbh/data/word-sorting.jsonl", 4 | "description": "", 5 | "transform": "datasets/bbh/transform_gen_v0.py", 6 | "fewshot": 3, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "bbh_post", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "exact_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/boolq/config/boolq_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "boolq", 3 | "path": "datasets/boolq/data/boolq.jsonl", 4 | "description": "", 5 | 
"transform": "datasets/boolq/transform_gen_v1.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "qa_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/boolq/config/boolq_ppl.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "boolq", 3 | "path": "datasets/boolq/data/boolq.jsonl", 4 | "description": "", 5 | "transform": "datasets/boolq/transform_ppl_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "loglikelihood", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "log_prob" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/boolq/transform_gen_v0.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str): 5 | text = f"{data['passage']}\nQuestion: {data['question']}\nA. Yes\nB. 
No\nAnswer: " 6 | index_of_correct_answer = list(data["target_scores"].values()).index(1) 7 | answers = ["A", "B"] 8 | correct_answer = answers[index_of_correct_answer] 9 | 10 | return {"input": text, "output": correct_answer, "processed_output": correct_answer} 11 | -------------------------------------------------------------------------------- /datasets/boolq/transform_ppl_v0.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str): 5 | text = f"{data['passage']}\nQuestion: {data['question']}?\nAnswer: " 6 | correct_answer = [ 7 | key for key, value in data["target_scores"].items() if value == 1 8 | ][0].strip() 9 | 10 | return {"input": text, "output": correct_answer, "processed_output": correct_answer} 11 | -------------------------------------------------------------------------------- /datasets/bustm/config/bustm_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "bustm", 3 | "path": "datasets/bustm/data/bustm.jsonl", 4 | "description": "", 5 | "transform": "datasets/bustm/transform_gen_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/bustm/config/bustm_ppl.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "bustm", 3 | "path": "datasets/bustm/data/bustm.jsonl", 4 | "description": "", 5 | "transform": "datasets/bustm/transform_ppl_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "loglikelihood", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "log_prob" 16 | } 
17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/bustm/transform_gen_v0.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str): 5 | text = f"语句一:“{data['passage'][0]}”\n语句二:“{data['passage'][1]}”\n请判断语句一和语句二说的是否是一个意思?\nA. 相关\nB. 无关\n请从“A”,“B”中进行选择。\n答:" 6 | index_of_correct_answer = list(data["target_scores"].values()).index(1) 7 | answers = ["A", "B"] 8 | correct_answer = answers[index_of_correct_answer] 9 | 10 | return {"input": text, "output": correct_answer, "processed_output": correct_answer} 11 | -------------------------------------------------------------------------------- /datasets/bustm/transform_ppl_v0.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str): 5 | text = ( 6 | "语句一:“" 7 | + data["passage"][0] 8 | + "”\n语句二:“" 9 | + data["passage"][1] 10 | + "”\n请判断语句一和语句二说的是否是一个意思?" 
11 | ) 12 | correct_answer = [ 13 | key for key, value in data["target_scores"].items() if value == 1 14 | ][0].strip() 15 | 16 | return {"input": text, "output": correct_answer, "processed_output": correct_answer} 17 | -------------------------------------------------------------------------------- /datasets/c3/config/dialog_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "dialog", 3 | "path": "datasets/c3/data/dialog.jsonl", 4 | "description": "", 5 | "transform": "datasets/c3/transform_gen_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/c3/config/dialog_ppl.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "dialog", 3 | "path": "datasets/c3/data/dialog.jsonl", 4 | "description": "", 5 | "transform": "datasets/c3/transform_ppl_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "loglikelihood", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "log_prob" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/c3/config/mixed_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "mixed", 3 | "path": "datasets/c3/data/mixed.jsonl", 4 | "description": "", 5 | "transform": "datasets/c3/transform_gen_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } 
-------------------------------------------------------------------------------- /datasets/c3/config/mixed_ppl.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "mixed", 3 | "path": "datasets/c3/data/mixed.jsonl", 4 | "description": "", 5 | "transform": "datasets/c3/transform_ppl_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "loglikelihood", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "log_prob" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/c3/transform_ppl_v0.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str): 5 | documents_text = "\n".join(data["passage"]) 6 | text = f"文章:{documents_text}\n问题:{data['question']}\n答案:" 7 | correct_answer = [ 8 | key for key, value in data["target_scores"].items() if value == 1 9 | ][0].strip() 10 | 11 | return {"input": text, "output": correct_answer, "processed_output": correct_answer} 12 | -------------------------------------------------------------------------------- /datasets/cb/config/cb_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "cb", 3 | "path": "datasets/cb/data/cb.jsonl", 4 | "description": "", 5 | "transform": "datasets/cb/transform_gen_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/cb/config/cb_ppl.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "cb", 3 | "path": 
"datasets/cb/data/cb.jsonl", 4 | "description": "", 5 | "transform": "datasets/cb/transform_ppl_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "loglikelihood", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "log_prob" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/cb/transform_gen_v0.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str): 5 | text = f"{data['passage'][0]}\n{data['passage'][1]}\nWhat is the relation between the two sentences?\nA. Contradiction\nB. Entailment\nC. Neutral\nAnswer: " 6 | index_of_correct_answer = list(data["target_scores"].values()).index(1) 7 | answers = ["A", "B", "C"] 8 | correct_answer = answers[index_of_correct_answer] 9 | 10 | return {"input": text, "output": correct_answer, "processed_output": correct_answer} 11 | -------------------------------------------------------------------------------- /datasets/cb/transform_ppl_v0.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str): 5 | text = ( 6 | data["passage"][0] 7 | + "\n" 8 | + data["passage"][1] 9 | + "\nWhat is the relation between the two sentences? 
" 10 | ) 11 | correct_answer = [ 12 | key for key, value in data["target_scores"].items() if value == 1 13 | ][0].strip() 14 | 15 | return {"input": text, "output": correct_answer, "processed_output": correct_answer} 16 | -------------------------------------------------------------------------------- /datasets/ceval/config/accountant_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "accountant", 3 | "path": "datasets/ceval/data/accountant.jsonl", 4 | "description": "以下是中国关于会计考试的单项选择题,请选出其中的正确答案。", 5 | "transform": "datasets/ceval/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/ceval/config/art-studies_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "art-studies", 3 | "path": "datasets/ceval/data/art-studies.jsonl", 4 | "description": "以下是中国关于艺术研究考试的单项选择题,请选出其中的正确答案。", 5 | "transform": "datasets/ceval/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/ceval/config/basic-medicine_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "basic-medicine", 3 | "path": "datasets/ceval/data/basic-medicine.jsonl", 4 | "description": "以下是中国关于基础医学考试的单项选择题,请选出其中的正确答案。", 5 | "transform": "datasets/ceval/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | 
"postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/ceval/config/civil-servant_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "civil-servant", 3 | "path": "datasets/ceval/data/civil-servant.jsonl", 4 | "description": "以下是中国关于公务员考试的单项选择题,请选出其中的正确答案。", 5 | "transform": "datasets/ceval/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/ceval/config/clinical-medicine_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "clinical-medicine", 3 | "path": "datasets/ceval/data/clinical-medicine.jsonl", 4 | "description": "以下是中国关于临床医学考试的单项选择题,请选出其中的正确答案。", 5 | "transform": "datasets/ceval/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/ceval/config/college-chemistry_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "college-chemistry", 3 | "path": "datasets/ceval/data/college-chemistry.jsonl", 4 | "description": "以下是中国关于大学化学考试的单项选择题,请选出其中的正确答案。", 5 | "transform": "datasets/ceval/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | 
"evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/ceval/config/college-economics_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "college-economics", 3 | "path": "datasets/ceval/data/college-economics.jsonl", 4 | "description": "以下是中国关于大学经济学考试的单项选择题,请选出其中的正确答案。", 5 | "transform": "datasets/ceval/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/ceval/config/college-physics_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "college-physics", 3 | "path": "datasets/ceval/data/college-physics.jsonl", 4 | "description": "以下是中国关于大学物理考试的单项选择题,请选出其中的正确答案。", 5 | "transform": "datasets/ceval/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/ceval/config/college-programming_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "college-programming", 3 | "path": "datasets/ceval/data/college-programming.jsonl", 4 | "description": "以下是中国关于大学编程考试的单项选择题,请选出其中的正确答案。", 5 | "transform": "datasets/ceval/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 
17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/ceval/config/computer-network_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "computer-network", 3 | "path": "datasets/ceval/data/computer-network.jsonl", 4 | "description": "以下是中国关于计算机网络考试的单项选择题,请选出其中的正确答案。", 5 | "transform": "datasets/ceval/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/ceval/config/education-science_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "education-science", 3 | "path": "datasets/ceval/data/education-science.jsonl", 4 | "description": "以下是中国关于教育科学考试的单项选择题,请选出其中的正确答案。", 5 | "transform": "datasets/ceval/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/ceval/config/electrical-engineer_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "electrical-engineer", 3 | "path": "datasets/ceval/data/electrical-engineer.jsonl", 4 | "description": "以下是中国关于电气工程师考试的单项选择题,请选出其中的正确答案。", 5 | "transform": "datasets/ceval/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } 
-------------------------------------------------------------------------------- /datasets/ceval/config/fire-engineer_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "fire-engineer", 3 | "path": "datasets/ceval/data/fire-engineer.jsonl", 4 | "description": "以下是中国关于消防工程师考试的单项选择题,请选出其中的正确答案。", 5 | "transform": "datasets/ceval/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/ceval/config/high-school-biology_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "high-school-biology", 3 | "path": "datasets/ceval/data/high-school-biology.jsonl", 4 | "description": "以下是中国关于高中生物考试的单项选择题,请选出其中的正确答案。", 5 | "transform": "datasets/ceval/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/ceval/config/high-school-chinese_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "high-school-chinese", 3 | "path": "datasets/ceval/data/high-school-chinese.jsonl", 4 | "description": "以下是中国关于高中语文考试的单项选择题,请选出其中的正确答案。", 5 | "transform": "datasets/ceval/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } 
-------------------------------------------------------------------------------- /datasets/ceval/config/high-school-history_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "high-school-history", 3 | "path": "datasets/ceval/data/high-school-history.jsonl", 4 | "description": "以下是中国关于高中历史考试的单项选择题,请选出其中的正确答案。", 5 | "transform": "datasets/ceval/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/ceval/config/high-school-physics_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "high-school-physics", 3 | "path": "datasets/ceval/data/high-school-physics.jsonl", 4 | "description": "以下是中国关于高中物理考试的单项选择题,请选出其中的正确答案。", 5 | "transform": "datasets/ceval/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/ceval/config/law_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "law", 3 | "path": "datasets/ceval/data/law.jsonl", 4 | "description": "以下是中国关于法律考试的单项选择题,请选出其中的正确答案。", 5 | "transform": "datasets/ceval/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- 
/datasets/ceval/config/legal-professional_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "legal-professional", 3 | "path": "datasets/ceval/data/legal-professional.jsonl", 4 | "description": "以下是中国关于法律专业人员考试的单项选择题,请选出其中的正确答案。", 5 | "transform": "datasets/ceval/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/ceval/config/logic_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "logic", 3 | "path": "datasets/ceval/data/logic.jsonl", 4 | "description": "以下是中国关于逻辑考试的单项选择题,请选出其中的正确答案。", 5 | "transform": "datasets/ceval/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/ceval/config/mao-zedong-thought_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "mao-zedong-thought", 3 | "path": "datasets/ceval/data/mao-zedong-thought.jsonl", 4 | "description": "以下是中国关于毛泽东思想考试的单项选择题,请选出其中的正确答案。", 5 | "transform": "datasets/ceval/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/ceval/config/marxism_gen.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "task_name": "marxism", 3 | "path": "datasets/ceval/data/marxism.jsonl", 4 | "description": "以下是中国关于马克思主义考试的单项选择题,请选出其中的正确答案。", 5 | "transform": "datasets/ceval/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/ceval/config/metrology-engineer_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "metrology-engineer", 3 | "path": "datasets/ceval/data/metrology-engineer.jsonl", 4 | "description": "以下是中国关于计量工程师考试的单项选择题,请选出其中的正确答案。", 5 | "transform": "datasets/ceval/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/ceval/config/operating-system_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "operating-system", 3 | "path": "datasets/ceval/data/operating-system.jsonl", 4 | "description": "以下是中国关于操作系统考试的单项选择题,请选出其中的正确答案。", 5 | "transform": "datasets/ceval/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/ceval/config/physician_gen.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "task_name": "physician", 3 | "path": "datasets/ceval/data/physician.jsonl", 4 | "description": "以下是中国关于医生考试的单项选择题,请选出其中的正确答案。", 5 | "transform": "datasets/ceval/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/ceval/config/plant-protection_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "plant-protection", 3 | "path": "datasets/ceval/data/plant-protection.jsonl", 4 | "description": "以下是中国关于植物保护考试的单项选择题,请选出其中的正确答案。", 5 | "transform": "datasets/ceval/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/ceval/config/sports-science_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "sports-science", 3 | "path": "datasets/ceval/data/sports-science.jsonl", 4 | "description": "以下是中国关于运动科学考试的单项选择题,请选出其中的正确答案。", 5 | "transform": "datasets/ceval/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/ceval/config/tax-accountant_gen.json: -------------------------------------------------------------------------------- 1 | 
{ 2 | "task_name": "tax-accountant", 3 | "path": "datasets/ceval/data/tax-accountant.jsonl", 4 | "description": "以下是中国关于税务会计考试的单项选择题,请选出其中的正确答案。", 5 | "transform": "datasets/ceval/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/ceval/config/veterinary-medicine_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "veterinary-medicine", 3 | "path": "datasets/ceval/data/veterinary-medicine.jsonl", 4 | "description": "以下是中国关于兽医学考试的单项选择题,请选出其中的正确答案。", 5 | "transform": "datasets/ceval/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/chid/config/chid_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "chid", 3 | "path": "datasets/chid/data/chid.jsonl", 4 | "description": "", 5 | "transform": "datasets/chid/transform_gen_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/chid/config/chid_ppl.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "chid", 3 | "path": "datasets/chid/data/chid.jsonl", 4 | "description": "", 5 | "transform": 
"datasets/chid/transform_ppl_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "loglikelihood", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "log_prob" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/chid/transform_ppl_v0.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str): 5 | text = "以下句子是否通顺?\n" + data["passage"] 6 | correct_answer = [ 7 | key + "这句话是通顺的。" for key, value in data["target_scores"].items() if value == 1 8 | ][0].strip() 9 | 10 | return {"input": text, "output": correct_answer, "processed_output": correct_answer} 11 | -------------------------------------------------------------------------------- /datasets/chid/transform_ppl_v1.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str): 5 | text = data["passage"] 6 | processed_correct_answer = correct_answer = [ 7 | key for key, value in data["target_scores"].items() if value == 1 8 | ][0].strip() 9 | return { 10 | "input": text, 11 | "output": correct_answer, 12 | "processed_output": processed_correct_answer, 13 | } 14 | -------------------------------------------------------------------------------- /datasets/cluewsc/config/cluewsc_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "cluewsc", 3 | "path": "datasets/cluewsc/data/cluewsc.jsonl", 4 | "description": "", 5 | "transform": "datasets/cluewsc/transform_gen_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 
| } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/cluewsc/config/cluewsc_ppl.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "cluewsc", 3 | "path": "datasets/cluewsc/data/cluewsc.jsonl", 4 | "description": "", 5 | "transform": "datasets/cluewsc/transform_ppl_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "loglikelihood", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "log_prob" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/cluewsc/transform_gen_v0.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str): 5 | text = f"{data['passage']}\n此处,“{data['question'][1]}”是否指代“{data['question'][0]}“?\nA. 是\nB. 
否\n请从”A“,”B“中进行选择。\n答:" 6 | index_of_correct_answer = list(data["target_scores"].values()).index(1) 7 | answers = ["A", "B"] 8 | correct_answer = answers[index_of_correct_answer] 9 | 10 | return {"input": text, "output": correct_answer, "processed_output": correct_answer} 11 | -------------------------------------------------------------------------------- /datasets/cluewsc/transform_ppl_v0.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str): 5 | text = ( 6 | data["passage"] 7 | + "\n此处,代词“" 8 | + data["question"][1] 9 | + "”被用于指代“" 10 | + data["question"][0] 11 | + "”吗?请回答是或者否。" 12 | ) 13 | correct_answer = [ 14 | key for key, value in data["target_scores"].items() if value == 1 15 | ][0].strip() 16 | 17 | return {"input": text, "output": correct_answer, "processed_output": correct_answer} 18 | -------------------------------------------------------------------------------- /datasets/cmmlu/config/agronomy_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "agronomy", 3 | "path": "datasets/cmmlu/data/agronomy.jsonl", 4 | "description": "以下是关于农学的单项选择题,请直接给出正确答案的选项。", 5 | "transform": "datasets/cmmlu/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/cmmlu/config/anatomy_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "anatomy", 3 | "path": "datasets/cmmlu/data/anatomy.jsonl", 4 | "description": "以下是关于解剖学的单项选择题,请直接给出正确答案的选项。", 5 | "transform": "datasets/cmmlu/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | 
"method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/cmmlu/config/ancient-chinese_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "ancient-chinese", 3 | "path": "datasets/cmmlu/data/ancient-chinese.jsonl", 4 | "description": "以下是关于古代汉语的单项选择题,请直接给出正确答案的选项。", 5 | "transform": "datasets/cmmlu/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/cmmlu/config/arts_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "arts", 3 | "path": "datasets/cmmlu/data/arts.jsonl", 4 | "description": "以下是关于艺术的单项选择题,请直接给出正确答案的选项。", 5 | "transform": "datasets/cmmlu/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/cmmlu/config/astronomy_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "astronomy", 3 | "path": "datasets/cmmlu/data/astronomy.jsonl", 4 | "description": "以下是关于天文学的单项选择题,请直接给出正确答案的选项。", 5 | "transform": "datasets/cmmlu/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 
| "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/cmmlu/config/business-ethics_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "business-ethics", 3 | "path": "datasets/cmmlu/data/business-ethics.jsonl", 4 | "description": "以下是关于商业伦理的单项选择题,请直接给出正确答案的选项。", 5 | "transform": "datasets/cmmlu/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/cmmlu/config/chinese-history_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "chinese-history", 3 | "path": "datasets/cmmlu/data/chinese-history.jsonl", 4 | "description": "以下是关于中国历史的单项选择题,请直接给出正确答案的选项。", 5 | "transform": "datasets/cmmlu/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/cmmlu/config/chinese-literature_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "chinese-literature", 3 | "path": "datasets/cmmlu/data/chinese-literature.jsonl", 4 | "description": "以下是关于中国文学的单项选择题,请直接给出正确答案的选项。", 5 | "transform": "datasets/cmmlu/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } 
-------------------------------------------------------------------------------- /datasets/cmmlu/config/clinical-knowledge_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "clinical-knowledge", 3 | "path": "datasets/cmmlu/data/clinical-knowledge.jsonl", 4 | "description": "以下是关于临床知识的单项选择题,请直接给出正确答案的选项。", 5 | "transform": "datasets/cmmlu/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/cmmlu/config/college-education_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "college-education", 3 | "path": "datasets/cmmlu/data/college-education.jsonl", 4 | "description": "以下是关于大学教育的单项选择题,请直接给出正确答案的选项。", 5 | "transform": "datasets/cmmlu/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/cmmlu/config/college-law_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "college-law", 3 | "path": "datasets/cmmlu/data/college-law.jsonl", 4 | "description": "以下是关于大学法律的单项选择题,请直接给出正确答案的选项。", 5 | "transform": "datasets/cmmlu/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } 
-------------------------------------------------------------------------------- /datasets/cmmlu/config/college-mathematics_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "college-mathematics", 3 | "path": "datasets/cmmlu/data/college-mathematics.jsonl", 4 | "description": "以下是关于大学数学的单项选择题,请直接给出正确答案的选项。", 5 | "transform": "datasets/cmmlu/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/cmmlu/config/college-medicine_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "college-medicine", 3 | "path": "datasets/cmmlu/data/college-medicine.jsonl", 4 | "description": "以下是关于大学医学的单项选择题,请直接给出正确答案的选项。", 5 | "transform": "datasets/cmmlu/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/cmmlu/config/computer-science_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "computer-science", 3 | "path": "datasets/cmmlu/data/computer-science.jsonl", 4 | "description": "以下是关于计算机科学的单项选择题,请直接给出正确答案的选项。", 5 | "transform": "datasets/cmmlu/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } 
-------------------------------------------------------------------------------- /datasets/cmmlu/config/computer-security_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "computer-security", 3 | "path": "datasets/cmmlu/data/computer-security.jsonl", 4 | "description": "以下是关于计算机安全的单项选择题,请直接给出正确答案的选项。", 5 | "transform": "datasets/cmmlu/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/cmmlu/config/conceptual-physics_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "conceptual-physics", 3 | "path": "datasets/cmmlu/data/conceptual-physics.jsonl", 4 | "description": "以下是关于概念物理学的单项选择题,请直接给出正确答案的选项。", 5 | "transform": "datasets/cmmlu/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/cmmlu/config/economics_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "economics", 3 | "path": "datasets/cmmlu/data/economics.jsonl", 4 | "description": "以下是关于经济学的单项选择题,请直接给出正确答案的选项。", 5 | "transform": "datasets/cmmlu/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } 
-------------------------------------------------------------------------------- /datasets/cmmlu/config/education_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "education", 3 | "path": "datasets/cmmlu/data/education.jsonl", 4 | "description": "以下是关于教育学的单项选择题,请直接给出正确答案的选项。", 5 | "transform": "datasets/cmmlu/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/cmmlu/config/elementary-chinese_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "elementary-chinese", 3 | "path": "datasets/cmmlu/data/elementary-chinese.jsonl", 4 | "description": "以下是关于初级汉语的单项选择题,请直接给出正确答案的选项。", 5 | "transform": "datasets/cmmlu/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/cmmlu/config/ethnology_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "ethnology", 3 | "path": "datasets/cmmlu/data/ethnology.jsonl", 4 | "description": "以下是关于民族学的单项选择题,请直接给出正确答案的选项。", 5 | "transform": "datasets/cmmlu/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- 
/datasets/cmmlu/config/food-science_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "food-science", 3 | "path": "datasets/cmmlu/data/food-science.jsonl", 4 | "description": "以下是关于食品科学的单项选择题,请直接给出正确答案的选项。", 5 | "transform": "datasets/cmmlu/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/cmmlu/config/genetics_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "genetics", 3 | "path": "datasets/cmmlu/data/genetics.jsonl", 4 | "description": "以下是关于遗传学的单项选择题,请直接给出正确答案的选项。", 5 | "transform": "datasets/cmmlu/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/cmmlu/config/global-facts_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "global-facts", 3 | "path": "datasets/cmmlu/data/global-facts.jsonl", 4 | "description": "以下是关于全球事实的单项选择题,请直接给出正确答案的选项。", 5 | "transform": "datasets/cmmlu/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/cmmlu/config/human-sexuality_gen.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "task_name": "human-sexuality", 3 | "path": "datasets/cmmlu/data/human-sexuality.jsonl", 4 | "description": "以下是关于人类性学的单项选择题,请直接给出正确答案的选项。", 5 | "transform": "datasets/cmmlu/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/cmmlu/config/international-law_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "international-law", 3 | "path": "datasets/cmmlu/data/international-law.jsonl", 4 | "description": "以下是关于国际法的单项选择题,请直接给出正确答案的选项。", 5 | "transform": "datasets/cmmlu/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/cmmlu/config/journalism_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "journalism", 3 | "path": "datasets/cmmlu/data/journalism.jsonl", 4 | "description": "以下是关于新闻学的单项选择题,请直接给出正确答案的选项。", 5 | "transform": "datasets/cmmlu/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/cmmlu/config/jurisprudence_gen.json: -------------------------------------------------------------------------------- 1 | { 2 
| "task_name": "jurisprudence", 3 | "path": "datasets/cmmlu/data/jurisprudence.jsonl", 4 | "description": "以下是关于法学的单项选择题,请直接给出正确答案的选项。", 5 | "transform": "datasets/cmmlu/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/cmmlu/config/logical_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "logical", 3 | "path": "datasets/cmmlu/data/logical.jsonl", 4 | "description": "以下是关于逻辑的单项选择题,请直接给出正确答案的选项。", 5 | "transform": "datasets/cmmlu/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/cmmlu/config/machine-learning_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "machine-learning", 3 | "path": "datasets/cmmlu/data/machine-learning.jsonl", 4 | "description": "以下是关于机器学习的单项选择题,请直接给出正确答案的选项。", 5 | "transform": "datasets/cmmlu/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/cmmlu/config/management_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "management", 3 | "path": "datasets/cmmlu/data/management.jsonl", 4 | "description": 
"以下是关于管理学的单项选择题,请直接给出正确答案的选项。", 5 | "transform": "datasets/cmmlu/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/cmmlu/config/marketing_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "marketing", 3 | "path": "datasets/cmmlu/data/marketing.jsonl", 4 | "description": "以下是关于市场营销的单项选择题,请直接给出正确答案的选项。", 5 | "transform": "datasets/cmmlu/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/cmmlu/config/marxist-theory_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "marxist-theory", 3 | "path": "datasets/cmmlu/data/marxist-theory.jsonl", 4 | "description": "以下是关于马克思主义理论的单项选择题,请直接给出正确答案的选项。", 5 | "transform": "datasets/cmmlu/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/cmmlu/config/modern-chinese_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "modern-chinese", 3 | "path": "datasets/cmmlu/data/modern-chinese.jsonl", 4 | "description": "以下是关于现代汉语的单项选择题,请直接给出正确答案的选项。", 5 | "transform": "datasets/cmmlu/transform_gen_v1.py", 6 | 
"fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/cmmlu/config/nutrition_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "nutrition", 3 | "path": "datasets/cmmlu/data/nutrition.jsonl", 4 | "description": "以下是关于营养学的单项选择题,请直接给出正确答案的选项。", 5 | "transform": "datasets/cmmlu/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/cmmlu/config/philosophy_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "philosophy", 3 | "path": "datasets/cmmlu/data/philosophy.jsonl", 4 | "description": "以下是关于哲学的单项选择题,请直接给出正确答案的选项。", 5 | "transform": "datasets/cmmlu/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/cmmlu/config/professional-law_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "professional-law", 3 | "path": "datasets/cmmlu/data/professional-law.jsonl", 4 | "description": "以下是关于专业法律的单项选择题,请直接给出正确答案的选项。", 5 | "transform": "datasets/cmmlu/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 
| "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/cmmlu/config/public-relations_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "public-relations", 3 | "path": "datasets/cmmlu/data/public-relations.jsonl", 4 | "description": "以下是关于公共关系的单项选择题,请直接给出正确答案的选项。", 5 | "transform": "datasets/cmmlu/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/cmmlu/config/security-study_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "security-study", 3 | "path": "datasets/cmmlu/data/security-study.jsonl", 4 | "description": "以下是关于安全研究的单项选择题,请直接给出正确答案的选项。", 5 | "transform": "datasets/cmmlu/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/cmmlu/config/sociology_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "sociology", 3 | "path": "datasets/cmmlu/data/sociology.jsonl", 4 | "description": "以下是关于社会学的单项选择题,请直接给出正确答案的选项。", 5 | "transform": "datasets/cmmlu/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 
18 | } 19 | } -------------------------------------------------------------------------------- /datasets/cmmlu/config/sports-science_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "sports-science", 3 | "path": "datasets/cmmlu/data/sports-science.jsonl", 4 | "description": "以下是关于运动科学的单项选择题,请直接给出正确答案的选项。", 5 | "transform": "datasets/cmmlu/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/cmmlu/config/virology_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "virology", 3 | "path": "datasets/cmmlu/data/virology.jsonl", 4 | "description": "以下是关于病毒学的单项选择题,请直接给出正确答案的选项。", 5 | "transform": "datasets/cmmlu/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/cmmlu/config/world-history_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "world-history", 3 | "path": "datasets/cmmlu/data/world-history.jsonl", 4 | "description": "以下是关于世界历史的单项选择题,请直接给出正确答案的选项。", 5 | "transform": "datasets/cmmlu/transform_gen_v1.py", 6 | "fewshot": 5, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- 
def transform(data, num_sample: int, r: random.Random, dataset_name: str):
    """Build a three-way multiple-choice prompt for a sentence pair.

    Args:
        data: Record providing ``passage`` (the two sentences are read from
            indices 0 and 1) and ``target_scores`` (a mapping in which the
            correct entry has the value ``1``).
        num_sample: Unused in this function.
        r: Unused in this function.
        dataset_name: Unused in this function.

    Returns:
        Dict with ``input`` (the prompt) and ``output`` /
        ``processed_output`` (both the gold letter).

    Raises:
        ValueError: If no value in ``target_scores`` equals 1.
        IndexError: If the first matching score sits beyond the third entry.
    """
    sentence_one = data["passage"][0]
    sentence_two = data["passage"][1]
    prompt = (
        f"语句一:“{sentence_one}”\n语句二:“{sentence_two}”\n"
        "请问这两句话是什么关系?\nA. 矛盾\nB. 无关\nC. 蕴含\n"
        "请从“A”,“B”,“C”中进行选择。\n答:"
    )
    # The letter is picked by the position of the first score equal to 1.
    # NOTE(review): this assumes target_scores is ordered to match the
    # A/B/C labels printed in the prompt -- confirm against the dataset.
    gold_position = [*data["target_scores"].values()].index(1)
    gold_letter = "ABC"[gold_position]
    return {"input": prompt, "output": gold_letter, "processed_output": gold_letter}
def transform(data, num_sample: int, r: random.Random, dataset_name: str):
    """Turn a question record into a ``Question: ... / Answer: `` prompt.

    Args:
        data: Record providing ``question`` (a string) and ``target_scores``
            (a mapping from answer text to score; the gold answer has
            score ``1``).
        num_sample: Unused in this function.
        r: Unused in this function.
        dataset_name: Unused in this function.

    Returns:
        Dict with ``input`` (the prompt) and ``output`` /
        ``processed_output`` (both the stripped gold answer text).

    Raises:
        IndexError: If no entry of ``target_scores`` has score 1.
    """
    # join() rejects non-string pieces, just like string concatenation does.
    prompt = "".join(["Question: ", data["question"], "\n", "Answer: "])

    scores = data["target_scores"]
    gold_candidates = [choice for choice in scores if scores[choice] == 1]
    gold = gold_candidates[0].strip()

    return dict(input=prompt, output=gold, processed_output=gold)
def transform(data, num_sample: int, r: random.Random, dataset_name: str):
    """Build a "What may be the <question>?" prompt from a passage.

    Args:
        data: Record providing ``passage`` and ``question`` (both strings)
            and ``target_scores`` (a mapping from answer text to score; the
            gold answer has score ``1``).
        num_sample: Unused in this function.
        r: Unused in this function.
        dataset_name: Unused in this function.

    Returns:
        Dict with ``input`` (the prompt) and ``output`` /
        ``processed_output`` (both the stripped gold answer text).

    Raises:
        IndexError: If no entry of ``target_scores`` has score 1.
    """
    pieces = (
        data["passage"],
        "\nQuestion: What may be the ",
        data["question"],
        "?\nAnswer: ",
    )
    prompt = "".join(pieces)

    scores = data["target_scores"]
    gold_candidates = [choice for choice in scores if scores[choice] == 1]
    gold = gold_candidates[0].strip()

    return dict(input=prompt, output=gold, processed_output=gold)
"evaluation": { 15 | "type": "exact_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/eprstmt/config/eprstmt_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "eprstmt", 3 | "path": "datasets/eprstmt/data/eprstmt.jsonl", 4 | "description": "", 5 | "transform": "datasets/eprstmt/transform_gen_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/eprstmt/config/eprstmt_ppl.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "eprstmt", 3 | "path": "datasets/eprstmt/data/eprstmt.jsonl", 4 | "description": "", 5 | "transform": "datasets/eprstmt/transform_ppl_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "loglikelihood", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "log_prob" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/eprstmt/transform_gen_v0.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str): 5 | text = f"内容: {data['passage']}。请对上述内容进行情绪分类。\nA. 消极\nB. 
def transform(data, num_sample: int, r: random.Random, dataset_name: str):
    """Wrap a passage in a quoted sentiment-classification prompt.

    Args:
        data: Record providing ``passage`` (a string) and ``target_scores``
            (a mapping from label text to score; the gold label has
            score ``1``).
        num_sample: Unused in this function.
        r: Unused in this function.
        dataset_name: Unused in this function.

    Returns:
        Dict with ``input`` (the prompt) and ``output`` /
        ``processed_output`` (both the stripped gold label text).

    Raises:
        IndexError: If no entry of ``target_scores`` has score 1.
    """
    # join() rejects a non-string passage, just like concatenation does.
    prompt = "".join(['内容: "', data["passage"], '"。情绪分类:'])

    scores = data["target_scores"]
    gold_candidates = [label for label in scores if scores[label] == 1]
    gold = gold_candidates[0].strip()

    return dict(input=prompt, output=gold, processed_output=gold)
def transform(data, num_sample: int, r: random.Random, dataset_name: str):
    """Build a ``Question: ... / Answer: `` prompt with a processed reference.

    Args:
        data: Record providing ``question`` and ``answer``.
        num_sample: Unused in this function.
        r: Unused in this function.
        dataset_name: Unused in this function.

    Returns:
        Dict with ``input`` (the prompt), ``output`` (the raw ``answer``)
        and ``processed_output`` (the second element returned by calling a
        ``GSM8KPost`` instance on the raw answer).
    """
    prompt = f"Question: {data['question']}\nAnswer: "
    reference = data["answer"]

    # The reference goes through the GSM8KPost post-processor, called with
    # an empty list as its first argument.
    # NOTE(review): presumably this is the same post-processing applied to
    # model output, so both sides compare in one form -- confirm in
    # UltraEval.tasks.postprocess.
    postprocessor = GSM8KPost()
    _, processed_reference = postprocessor([], reference)

    return dict(
        input=prompt,
        output=reference,
        processed_output=processed_reference,
    )
def transform(data, num_sample: int, r: random.Random, dataset_name: str):
    """Build an ``<activity_label>: <question> `` continuation prompt.

    Args:
        data: Record providing ``activity_label``, ``question`` and
            ``target_scores`` (a mapping from continuation text to score;
            the gold continuation has score ``1``).
        num_sample: Unused in this function.
        r: Unused in this function.
        dataset_name: Unused in this function.

    Returns:
        Dict with ``input`` (the prompt, ending in one space) and
        ``output`` / ``processed_output`` (both the stripped gold text).

    Raises:
        IndexError: If no entry of ``target_scores`` has score 1.
    """
    label = data["activity_label"]
    context = data["question"]
    prompt = f"{label}: {context} "

    scores = data["target_scores"]
    gold_candidates = [ending for ending in scores if scores[ending] == 1]
    gold = gold_candidates[0].strip()

    return dict(input=prompt, output=gold, processed_output=gold)
def transform(data, num_sample: int, r: random.Random, dataset_name: str):
    """Format a question record as a lettered multiple-choice prompt.

    Args:
        data: Record providing ``question`` and ``target_scores`` (a mapping
            from option text to score; the gold option has score ``1``).
            Options are lettered in the mapping's iteration order.
        num_sample: Unused in this function.
        r: Unused in this function.
        dataset_name: Unused in this function.

    Returns:
        Dict with ``input`` (the prompt) and ``output`` /
        ``processed_output`` (both the stripped gold option text).

    Raises:
        IndexError: If no entry of ``target_scores`` has score 1.
    """
    prefixes = ("A. ", "B. ", "C. ", "D. ")
    scores = data["target_scores"]

    # zip() pairs letters with options in one pass and stops at the shorter
    # side. At most four options are listed, as before, and a record with
    # fewer than four no longer fails with an IndexError.
    options = "\n".join(prefix + option for prefix, option in zip(prefixes, scores))
    text = f"问题:{data['question']}\n选项:{options}\n答案:"

    correct_answer = [
        option for option, score in scores.items() if score == 1
    ][0].strip()

    return {"input": text, "output": correct_answer, "processed_output": correct_answer}
def transform(data, num_sample: int, r: random.Random, dataset_name: str):
    """Build a generation-style LAMBADA sample from one record.

    Args:
        data: Record providing "question" (the context to complete) and
            "answer" (the reference completion).
        num_sample: Not used by this transform.
        r: Not used by this transform.
        dataset_name: Not used by this transform.

    Returns:
        Dict with "input" (a fixed requirement header followed by the
        context and a trailing space) and "output" / "processed_output",
        both set to the unmodified "answer" value.
    """
    header = "Requirement:\nPlease complete the following context with a single word.\n"
    prompt = header + "Context:\n{} ".format(data["question"])
    target = data["answer"]
    return dict(input=prompt, output=target, processed_output=target)
"metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "exact_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/math/config/number-theory_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "number-theory", 3 | "path": "datasets/math/data/number-theory.jsonl", 4 | "description": "", 5 | "transform": "datasets/math/transform_gen_v0.py", 6 | "fewshot": 4, 7 | "generate": { 8 | "method": "generate", 9 | "params": "models/model_params/vllm_sample_v1.json" 10 | }, 11 | "postprocess": "math_post", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "exact_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/math/config/prealgebra_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "prealgebra", 3 | "path": "datasets/math/data/prealgebra.jsonl", 4 | "description": "", 5 | "transform": "datasets/math/transform_gen_v0.py", 6 | "fewshot": 4, 7 | "generate": { 8 | "method": "generate", 9 | "params": "models/model_params/vllm_sample_v1.json" 10 | }, 11 | "postprocess": "math_post", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "exact_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/math/config/precalculus_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "precalculus", 3 | "path": "datasets/math/data/precalculus.jsonl", 4 | "description": "", 5 | "transform": "datasets/math/transform_gen_v0.py", 6 | "fewshot": 4, 7 | "generate": { 8 | "method": "generate", 9 | "params": "models/model_params/vllm_sample_v1.json" 10 | }, 11 | "postprocess": "math_post", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": 
def rand(n: int, r: random.Random):
    """Scale one draw of ``r.random()`` by *n* and truncate it to an int."""
    draw = r.random()
    return int(draw * n)


def transform(data, num_sample: int, r: random.Random, dataset_name: str):
    """Build a generation-style MBPP sample from one record.

    Args:
        data: Record providing "text" (task description), "test_list"
            (iterable of test strings) and "code" (reference solution).
        num_sample: Not used by this transform.
        r: Not used by this transform.
        dataset_name: Not used by this transform.

    Returns:
        Dict with "input" (description and newline-joined tests wrapped in
        triple double quotes) and "output" / "processed_output", both set to
        the reference code.
    """
    task_text = data["text"]
    joined_tests = "\n".join(data["test_list"])
    prompt = '"""{}\n{}"""'.format(task_text, joined_tests)
    solution = data["code"]
    return {"input": prompt, "output": solution, "processed_output": solution}
def transform(data, num_sample: int, r: random.Random, dataset_name: str):
    """Build a perplexity-style MedMCQA sample from one record.

    Args:
        data: Record providing "question" and "target_scores" (mapping of
            option text to score).
        num_sample: Not used by this transform.
        r: Not used by this transform.
        dataset_name: Not used by this transform.

    Returns:
        Dict with "input" (question block followed by an answer cue) and
        "output" / "processed_output", both set to the stripped text of the
        first option whose score equals 1.

    Raises:
        IndexError: if no option has a score equal to 1.
    """
    stem = "Question:\n{}\n".format(data["question"])
    prompt = "".join((stem, "Answer:\n"))
    matching = [
        choice for choice, score in data["target_scores"].items() if score == 1
    ]
    gold = matching[0].strip()
    return {"input": prompt, "output": gold, "processed_output": gold}
def transform(data, num_sample: int, r: random.Random, dataset_name: str):
    """Build a perplexity-style MedQA (MCMLE) sample from one record.

    Args:
        data: Record providing "question" and "target_scores" (mapping of
            option text to score).
        num_sample: Not used by this transform.
        r: Not used by this transform.
        dataset_name: Not used by this transform.

    Returns:
        Dict with "input" (question block followed by an answer cue, both
        in Chinese) and "output" / "processed_output", both set to the
        stripped text of the first option whose score equals 1.

    Raises:
        IndexError: if no option has a score equal to 1.
    """
    prompt = "问题:\n{}\n".format(data["question"]) + "答案:\n"
    scores = data["target_scores"]
    hits = [option for option, score in scores.items() if score == 1]
    reference = hits[0].strip()
    return {
        "input": prompt,
        "output": reference,
        "processed_output": reference,
    }
def transform(data, num_sample: int, r: random.Random, dataset_name: str):
    """Build a perplexity-style MedQA (USMLE) sample from one record.

    Args:
        data: Record providing "question" and "target_scores" (mapping of
            option text to score).
        num_sample: Not used by this transform.
        r: Not used by this transform.
        dataset_name: Not used by this transform.

    Returns:
        Dict with "input" (question block followed by an answer cue) and
        "output" / "processed_output", both set to the stripped text of the
        first option whose score equals 1.

    Raises:
        IndexError: if no option has a score equal to 1.
    """
    prompt = "Question:\n{}\nAnswer:\n".format(data["question"])
    flagged = [
        candidate
        for candidate, score in data["target_scores"].items()
        if score == 1
    ]
    answer = flagged[0].strip()
    return dict(input=prompt, output=answer, processed_output=answer)
def transform(data, num_sample: int, r: random.Random, dataset_name: str):
    """Build a generation-style MultiRC sample with a lettered Yes/No choice.

    Args:
        data: Record providing "passage", "question" (indexable; item 0 is
            rendered as the question and item 1 as the claim) and
            "target_scores" (mapping whose value order decides the letter).
        num_sample: Not used by this transform.
        r: Not used by this transform.
        dataset_name: Not used by this transform.

    Returns:
        Dict with "input" (the prompt) and "output" / "processed_output",
        both set to "A" or "B" according to the position of the first score
        equal to 1.

    Raises:
        ValueError: if no score equals 1.
        IndexError: if the first score equal to 1 is past the second entry.
    """
    template = "{}\nQuestion: {}\nClaim: {}\nIs it true?\nA. Yes\nB. No\nAnswer: "
    prompt = template.format(
        data["passage"], data["question"][0], data["question"][1]
    )
    position = list(data["target_scores"].values()).index(1)
    letter = ("A", "B")[position]
    return {"input": prompt, "output": letter, "processed_output": letter}
def transform(data, num_sample: int, r: random.Random, dataset_name: str):
    """Build a generation-style NQ-Open sample from one record.

    Args:
        data: Record providing "question" and "answer".
        num_sample: Not used by this transform.
        r: Not used by this transform.
        dataset_name: Not used by this transform.

    Returns:
        Dict with "input" (question followed by an answer cue), "output"
        (the raw "answer" value) and "processed_output" (the second element
        returned by calling an ``ExactMatchPost`` instance on the answer).
    """
    prompt = "Question: {}\nAnswer: ".format(data["question"])
    gold = data["answer"]
    # ExactMatchPost comes from the file-level UltraEval import.
    # NOTE(review): presumably it normalises the reference the same way the
    # "exact_match_post" postprocess treats predictions - confirm.
    normalizer = ExactMatchPost()
    _unused, normalized_gold = normalizer([], gold)
    return {
        "input": prompt,
        "output": gold,
        "processed_output": normalized_gold,
    }
| "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/ocnli-fc/config/ocnli-fc_ppl.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "ocnli-fc", 3 | "path": "datasets/ocnli-fc/data/ocnli-fc.jsonl", 4 | "description": "", 5 | "transform": "datasets/ocnli-fc/transform_ppl_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "loglikelihood", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "log_prob" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/ocnli-fc/transform_gen_v0.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str): 5 | text = f"阅读文章:{data['passage'][0]}\n根据上文,回答如下问题:{data['passage'][1]}\nA. 错\nB. 可能\nC. 
def transform(data, num_sample: int, r: random.Random, dataset_name: str):
    """Build a perplexity-style OCNLI-FC sample from one record.

    Args:
        data: Record providing "passage" (indexable; item 0 is rendered as
            the article and item 1 as the question) and "target_scores"
            (mapping of label text to score).
        num_sample: Not used by this transform.
        r: Not used by this transform.
        dataset_name: Not used by this transform.

    Returns:
        Dict with "input" (the Chinese prompt) and "output" /
        "processed_output", both set to the stripped text of the first label
        whose score equals 1.

    Raises:
        IndexError: if no label has a score equal to 1.
    """
    template = "阅读文章:{}\n根据上文,回答如下问题:{}\n请从“无关”,“蕴含”,“矛盾”中进行选择。\n答:"
    prompt = template.format(data["passage"][0], data["passage"][1])
    chosen = [
        label for label, score in data["target_scores"].items() if score == 1
    ]
    gold = chosen[0].strip()
    return dict(input=prompt, output=gold, processed_output=gold)
def transform(data, num_sample: int, r: random.Random, dataset_name: str):
    """Build a generation-style OCNLI sample with lettered choices A-C.

    Args:
        data: Record providing "passage" (indexable; items 0 and 1 are the
            two sentences) and "target_scores" (mapping whose value order
            decides the letter).
        num_sample: Not used by this transform.
        r: Not used by this transform.
        dataset_name: Not used by this transform.

    Returns:
        Dict with "input" (the Chinese prompt) and "output" /
        "processed_output", both set to "A", "B" or "C" according to the
        position of the first score equal to 1.

    Raises:
        ValueError: if no score equals 1.
        IndexError: if the first score equal to 1 is past the third entry.
    """
    template = (
        "语句一:“{}”\n语句二:“{}”\n请问这两句话是什么关系?\n"
        "A. 矛盾\nB. 无关\nC. 蕴含\n请从“A”,“B”,“C”中进行选择。\n答:"
    )
    prompt = template.format(data["passage"][0], data["passage"][1])
    position = tuple(data["target_scores"].values()).index(1)
    letter = ("A", "B", "C")[position]
    return {"input": prompt, "output": letter, "processed_output": letter}
11 | ) 12 | correct_answer = [ 13 | key for key, value in data["target_scores"].items() if value == 1 14 | ][0].strip() 15 | 16 | return {"input": text, "output": correct_answer, "processed_output": correct_answer} 17 | -------------------------------------------------------------------------------- /datasets/openbookqa/config/openbookqa_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "openbookqa", 3 | "path": "datasets/openbookqa/data/openbookqa.jsonl", 4 | "description": "", 5 | "transform": "datasets/openbookqa/transform_gen_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/openbookqa/config/openbookqa_ppl.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "openbookqa", 3 | "path": "datasets/openbookqa/data/openbookqa.jsonl", 4 | "description": "", 5 | "transform": "datasets/openbookqa/transform_ppl_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "loglikelihood", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "log_prob" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/piqa/config/piqa_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "piqa", 3 | "path": "datasets/piqa/data/piqa.jsonl", 4 | "description": "", 5 | "transform": "datasets/piqa/transform_gen_v1.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "qa_match" 16 | } 17 
def transform(data, num_sample: int, r: random.Random, dataset_name: str):
    """Build a generation-style PIQA sample with lettered choices A/B.

    Args:
        data: Record providing "question" and "target_scores", a mapping
            with exactly two solutions as keys.
        num_sample: Not used by this transform.
        r: Not used by this transform.
        dataset_name: Not used by this transform.

    Returns:
        Dict with "input" (question, both solutions, answer cue) and
        "output" / "processed_output", both set to "A" or "B" according to
        the position of the first score equal to 1.

    Raises:
        ValueError: if there are not exactly two solutions, or if no score
            equals 1.
    """
    scores = data["target_scores"]
    # Unpacking enforces that exactly two solutions are present.
    first_solution, second_solution = list(scores.keys())
    prompt = "{}\nA. {}\nB. {}\nAnswer: ".format(
        data["question"], first_solution, second_solution
    )
    position = list(scores.values()).index(1)
    letter = ("A", "B")[position]
    return {"input": prompt, "output": letter, "processed_output": letter}
"postprocess": "exact_match_post", 12 | "metric": { 13 | "f1_score": { 14 | "evaluation": { 15 | "type": "f1_score" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/race/config/high_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "high", 3 | "path": "datasets/race/data/high.jsonl", 4 | "description": "", 5 | "transform": "datasets/race/transform_gen_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/race/config/high_ppl.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "high", 3 | "path": "datasets/race/data/high.jsonl", 4 | "description": "", 5 | "transform": "datasets/race/transform_ppl_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "loglikelihood", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "log_prob" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/race/config/middle_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "middle", 3 | "path": "datasets/race/data/middle.jsonl", 4 | "description": "", 5 | "transform": "datasets/race/transform_gen_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/race/config/middle_ppl.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "task_name": "middle", 3 | "path": "datasets/race/data/middle.jsonl", 4 | "description": "", 5 | "transform": "datasets/race/transform_ppl_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "loglikelihood", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "log_prob" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/record/config/record_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "record", 3 | "path": "datasets/record/data/record.jsonl", 4 | "description": "", 5 | "transform": "datasets/record/transform_gen_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "exact_match_post", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "exact_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/rte/config/rte_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "rte", 3 | "path": "datasets/rte/data/rte.jsonl", 4 | "description": "", 5 | "transform": "datasets/rte/transform_gen_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/rte/config/rte_ppl.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "rte", 3 | "path": "datasets/rte/data/rte.jsonl", 4 | "description": "", 5 | "transform": "datasets/rte/transform_ppl_v0.py", 6 | "fewshot": 0, 
def transform(data, num_sample: int, r: random.Random, dataset_name: str):
    """Build a generation-style RTE sample with a lettered Yes/No choice.

    Args:
        data: Record providing "passage" (indexable; items 0 and 1 are the
            two sentences) and "target_scores" (mapping whose value order
            decides the letter).
        num_sample: Not used by this transform.
        r: Not used by this transform.
        dataset_name: Not used by this transform.

    Returns:
        Dict with "input" (the prompt) and "output" / "processed_output",
        both set to "A" or "B" according to the position of the first score
        equal to 1.

    Raises:
        ValueError: if no score equals 1.
        IndexError: if the first score equal to 1 is past the second entry.
    """
    template = (
        "{}\n{}\nIs the sentence below entailed by the sentence above?\n"
        "A. Yes\nB. No\nAnswer: "
    )
    prompt = template.format(data["passage"][0], data["passage"][1])
    score_values = list(data["target_scores"].values())
    letter = "AB"[score_values.index(1)]
    return dict(input=prompt, output=letter, processed_output=letter)
def transform(data, num_sample: int, r: random.Random, dataset_name: str):
    """Build a perplexity-style SIQA sample from one record.

    Args:
        data: Record providing "passage" and "question" (both strings) and
            "target_scores" (mapping of answer text to score).
        num_sample: Not used by this transform.
        r: Not used by this transform.
        dataset_name: Not used by this transform.

    Returns:
        Dict with "input" (context line, question line, answer cue) and
        "output" / "processed_output", both set to the stripped text of the
        first answer whose score equals 1.

    Raises:
        IndexError: if no answer has a score equal to 1.
    """
    context_line = "Based on the context: " + data["passage"] + "\n"
    question_line = "Question: " + data["question"] + "\n"
    prompt = context_line + question_line + "Answer: "

    winners = [
        answer for answer, score in data["target_scores"].items() if score == 1
    ]
    gold = winners[0].strip()
    return {"input": prompt, "output": gold, "processed_output": gold}
-------------------------------------------------------------------------------- 1 | import random 2 | 3 | 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str): 5 | passage = "\n".join(data["passage"]) + "\n" 6 | correct_answer = [ 7 | key for key, value in data["target_scores"].items() if value == 1 8 | ][0] 9 | return { 10 | "input": passage, 11 | "output": correct_answer, 12 | "processed_output": correct_answer, 13 | } 14 | -------------------------------------------------------------------------------- /datasets/storycloze/transform_ppl_v1.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str): 5 | passage = "\n".join(data["passage"]) 6 | text = f"{passage}\n" 7 | processed_correct_answer = correct_answer = [ 8 | key for key, value in data["target_scores"].items() if value == 1 9 | ][0] 10 | return { 11 | "input": text, 12 | "output": correct_answer, 13 | "processed_output": processed_correct_answer, 14 | } 15 | -------------------------------------------------------------------------------- /datasets/strategyqa/config/strategyqa_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "strategyqa", 3 | "path": "datasets/strategyqa/data/strategyqa.jsonl", 4 | "description": "", 5 | "transform": "datasets/strategyqa/transform_gen_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/strategyqa/config/strategyqa_ppl.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "strategyqa", 3 | "path": 
"datasets/strategyqa/data/strategyqa.jsonl", 4 | "description": "", 5 | "transform": "datasets/strategyqa/transform_ppl_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "loglikelihood", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "log_prob" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/strategyqa/transform_gen_v0.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str): 5 | facts_str = ", ".join(data["passage"][2]) 6 | 7 | text = f"Background: {data['passage'][0]} - {data['passage'][1]}\nFact: {facts_str}\nQuestion: {data['question']}\nA. Yes\nB. No\nAnswer: " 8 | correct_answer = "A" if data["target_scores"].get("Yes") == 1 else "B" 9 | 10 | return {"input": text, "output": correct_answer, "processed_output": correct_answer} 11 | -------------------------------------------------------------------------------- /datasets/strategyqa/transform_ppl_v0.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str): 5 | facts_str = ", ".join(data["passage"][2]) 6 | 7 | text = f"Background: {data['passage'][0]} - {data['passage'][1]}\nFact: {facts_str}\nQuestion: {data['question']}\nAnswer: " 8 | correct_answer = [ 9 | key for key, value in data["target_scores"].items() if value == 1 10 | ][0].strip() 11 | 12 | return {"input": text, "output": correct_answer, "processed_output": correct_answer} 13 | -------------------------------------------------------------------------------- /datasets/summedits/config/billsum_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "billsum", 3 | "path": 
"datasets/summedits/data/billsum.jsonl", 4 | "description": "", 5 | "transform": "datasets/summedits/transform_gen_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/summedits/config/billsum_ppl.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "billsum", 3 | "path": "datasets/summedits/data/billsum.jsonl", 4 | "description": "", 5 | "transform": "datasets/summedits/transform_ppl_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "loglikelihood", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "log_prob" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/summedits/config/ectsum_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "ectsum", 3 | "path": "datasets/summedits/data/ectsum.jsonl", 4 | "description": "", 5 | "transform": "datasets/summedits/transform_gen_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/summedits/config/ectsum_ppl.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "ectsum", 3 | "path": "datasets/summedits/data/ectsum.jsonl", 4 | "description": "", 5 | "transform": "datasets/summedits/transform_ppl_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "loglikelihood", 9 | "params": 
"" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "log_prob" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/summedits/config/news_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "news", 3 | "path": "datasets/summedits/data/news.jsonl", 4 | "description": "", 5 | "transform": "datasets/summedits/transform_gen_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/summedits/config/news_ppl.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "news", 3 | "path": "datasets/summedits/data/news.jsonl", 4 | "description": "", 5 | "transform": "datasets/summedits/transform_ppl_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "loglikelihood", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "log_prob" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/summedits/config/podcast_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "podcast", 3 | "path": "datasets/summedits/data/podcast.jsonl", 4 | "description": "", 5 | "transform": "datasets/summedits/transform_gen_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } 
-------------------------------------------------------------------------------- /datasets/summedits/config/podcast_ppl.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "podcast", 3 | "path": "datasets/summedits/data/podcast.jsonl", 4 | "description": "", 5 | "transform": "datasets/summedits/transform_ppl_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "loglikelihood", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "log_prob" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/summedits/config/qmsumm_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "qmsumm", 3 | "path": "datasets/summedits/data/qmsumm.jsonl", 4 | "description": "", 5 | "transform": "datasets/summedits/transform_gen_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/summedits/config/qmsumm_ppl.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "qmsumm", 3 | "path": "datasets/summedits/data/qmsumm.jsonl", 4 | "description": "", 5 | "transform": "datasets/summedits/transform_ppl_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "loglikelihood", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "log_prob" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/summedits/config/sales-call_gen.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "task_name": "sales-call", 3 | "path": "datasets/summedits/data/sales-call.jsonl", 4 | "description": "", 5 | "transform": "datasets/summedits/transform_gen_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/summedits/config/sales-call_ppl.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "sales-call", 3 | "path": "datasets/summedits/data/sales-call.jsonl", 4 | "description": "", 5 | "transform": "datasets/summedits/transform_ppl_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "loglikelihood", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "log_prob" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/summedits/config/sales-email_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "sales-email", 3 | "path": "datasets/summedits/data/sales-email.jsonl", 4 | "description": "", 5 | "transform": "datasets/summedits/transform_gen_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/summedits/config/sales-email_ppl.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "sales-email", 3 | "path": 
"datasets/summedits/data/sales-email.jsonl", 4 | "description": "", 5 | "transform": "datasets/summedits/transform_ppl_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "loglikelihood", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "log_prob" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/summedits/config/samsum_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "samsum", 3 | "path": "datasets/summedits/data/samsum.jsonl", 4 | "description": "", 5 | "transform": "datasets/summedits/transform_gen_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/summedits/config/samsum_ppl.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "samsum", 3 | "path": "datasets/summedits/data/samsum.jsonl", 4 | "description": "", 5 | "transform": "datasets/summedits/transform_ppl_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "loglikelihood", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "log_prob" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/summedits/config/scitldr_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "scitldr", 3 | "path": "datasets/summedits/data/scitldr.jsonl", 4 | "description": "", 5 | "transform": "datasets/summedits/transform_gen_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "generate", 9 | "params": 
"" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/summedits/config/scitldr_ppl.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "scitldr", 3 | "path": "datasets/summedits/data/scitldr.jsonl", 4 | "description": "", 5 | "transform": "datasets/summedits/transform_ppl_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "loglikelihood", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "log_prob" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/summedits/config/shakespeare_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "shakespeare", 3 | "path": "datasets/summedits/data/shakespeare.jsonl", 4 | "description": "", 5 | "transform": "datasets/summedits/transform_gen_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/summedits/config/shakespeare_ppl.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "shakespeare", 3 | "path": "datasets/summedits/data/shakespeare.jsonl", 4 | "description": "", 5 | "transform": "datasets/summedits/transform_ppl_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "loglikelihood", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "log_prob" 16 | } 17 | } 18 | } 19 | } 
-------------------------------------------------------------------------------- /datasets/summedits/transform_ppl_v0.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str): 5 | text = f"\nDocument:\n{data['passage'][0]}\nSummary:\n{data['passage'][1]}\nIs the summary factually consistent with the document? " 6 | correct_answer = [ 7 | key for key, value in data["target_scores"].items() if value == 1 8 | ][0].strip() 9 | 10 | return {"input": text, "output": correct_answer, "processed_output": correct_answer} 11 | -------------------------------------------------------------------------------- /datasets/theoremqa/transform_gen_v0.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | from UltraEval.tasks.postprocess import TheoremQAPost 4 | 5 | 6 | def transform(data, num_sample: int, r: random.Random, dataset_name: str): 7 | text = f"\nQuestion: {data['question']}\nAnswer: " 8 | correct_answer = str(data["answer"][1]) 9 | tqap = TheoremQAPost() 10 | _, processed_correct_answer = tqap([], correct_answer) 11 | return { 12 | "input": text, 13 | "output": correct_answer, 14 | "processed_output": processed_correct_answer, 15 | } 16 | -------------------------------------------------------------------------------- /datasets/tnews/config/tnews_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "tnews", 3 | "path": "datasets/tnews/data/tnews.jsonl", 4 | "description": "", 5 | "transform": "datasets/tnews/transform_gen_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } 
-------------------------------------------------------------------------------- /datasets/tnews/config/tnews_ppl.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "tnews", 3 | "path": "datasets/tnews/data/tnews.jsonl", 4 | "description": "", 5 | "transform": "datasets/tnews/transform_ppl_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "loglikelihood", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "log_prob" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/tnews/transform_ppl_v0.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str): 5 | text = f"{data['question']}\n上述内容属于什么新闻?" 6 | 7 | correct_answer = [ 8 | key for key, value in data["target_scores"].items() if value == 1 9 | ][0].strip() 10 | 11 | return {"input": text, "output": correct_answer, "processed_output": correct_answer} 12 | -------------------------------------------------------------------------------- /datasets/triviaqa/config/web_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "web", 3 | "path": "datasets/triviaqa/data/web.jsonl", 4 | "description": "", 5 | "transform": "datasets/triviaqa/transform_gen_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "exact_match_post", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "exact_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/triviaqa/config/wikipedia_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "wikipedia", 3 | 
"path": "datasets/triviaqa/data/wikipedia.jsonl", 4 | "description": "", 5 | "transform": "datasets/triviaqa/transform_gen_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "exact_match_post", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "exact_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/truthfulqa/config/mc1_ppl.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "mc1", 3 | "path": "datasets/truthfulqa/data/mc1.jsonl", 4 | "description": "", 5 | "transform": "datasets/truthfulqa/transform_ppl_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "loglikelihood", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "log_prob" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/truthfulqa/config/mc2_ppl.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "mc2", 3 | "path": "datasets/truthfulqa/data/mc2.jsonl", 4 | "description": "", 5 | "transform": "datasets/truthfulqa/transform_ppl_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "loglikelihood", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "log_prob_mc2" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/truthfulqa/transform_ppl_v0.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str): 5 | correct_answer = [key for key, value in data["target_scores"].items() if value == 1] 6 | text = f"Question: 
{data['question']}\nAnswer: " 7 | 8 | return {"input": text, "output": correct_answer, "processed_output": correct_answer} 9 | -------------------------------------------------------------------------------- /datasets/truthfulqa/transform_ppl_v1.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str): 5 | question = f"Question:\n{data['question']}\n" 6 | answer_prompt = f"Answer:\n" 7 | text = question + answer_prompt 8 | processed_correct_answer = correct_answer = [ 9 | key for key, value in data["target_scores"].items() if value == 1 10 | ] 11 | 12 | return { 13 | "input": text, 14 | "output": correct_answer, 15 | "processed_output": processed_correct_answer, 16 | } 17 | -------------------------------------------------------------------------------- /datasets/wic/config/wic_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "wic", 3 | "path": "datasets/wic/data/wic.jsonl", 4 | "description": "", 5 | "transform": "datasets/wic/transform_gen_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/wic/config/wic_ppl.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "wic", 3 | "path": "datasets/wic/data/wic.jsonl", 4 | "description": "", 5 | "transform": "datasets/wic/transform_ppl_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "loglikelihood", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "log_prob" 16 | } 17 | } 18 | } 19 | } 
-------------------------------------------------------------------------------- /datasets/wic/transform_gen_v0.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str): 5 | text = f"Sentence 1: {data['passage'][0]}\nSentence 2: {data['passage'][1]}\nAre '{data['question']}' in the above two sentenses the same?\nA. Yes\nB. No\nAnswer: " 6 | index_of_correct_answer = list(data["target_scores"].values()).index(1) 7 | answers = ["A", "B"] 8 | correct_answer = answers[index_of_correct_answer] 9 | 10 | return {"input": text, "output": correct_answer, "processed_output": correct_answer} 11 | -------------------------------------------------------------------------------- /datasets/winogender/config/female_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "female", 3 | "path": "datasets/winogender/data/female.jsonl", 4 | "description": "", 5 | "transform": "datasets/winogender/transform_gen_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/winogender/config/female_ppl.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "female", 3 | "path": "datasets/winogender/data/female.jsonl", 4 | "description": "", 5 | "transform": "datasets/winogender/transform_ppl_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "loglikelihood", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "log_prob" 16 | } 17 | } 18 | } 19 | } 
-------------------------------------------------------------------------------- /datasets/winogender/config/male_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "male", 3 | "path": "datasets/winogender/data/male.jsonl", 4 | "description": "", 5 | "transform": "datasets/winogender/transform_gen_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/winogender/config/male_ppl.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "male", 3 | "path": "datasets/winogender/data/male.jsonl", 4 | "description": "", 5 | "transform": "datasets/winogender/transform_ppl_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "loglikelihood", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "log_prob" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/winogender/config/neutral_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "neutral", 3 | "path": "datasets/winogender/data/neutral.jsonl", 4 | "description": "", 5 | "transform": "datasets/winogender/transform_gen_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "prefix_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/winogender/config/neutral_ppl.json: -------------------------------------------------------------------------------- 
1 | { 2 | "task_name": "neutral", 3 | "path": "datasets/winogender/data/neutral.jsonl", 4 | "description": "", 5 | "transform": "datasets/winogender/transform_ppl_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "loglikelihood", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "log_prob" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/winogender/transform_ppl_v0.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str): 5 | text = data["passage"] + ' "' + data["question"].capitalize() + '" refers to ' 6 | output_sentence = next( 7 | key for key, value in data["target_scores"].items() if value == 1 8 | ) 9 | return { 10 | "input": text, 11 | "output": output_sentence, 12 | "processed_output": output_sentence, 13 | } 14 | -------------------------------------------------------------------------------- /datasets/winogrande/config/winogrande_gen.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "winogrande", 3 | "path": "datasets/winogrande/data/winogrande.jsonl", 4 | "description": "", 5 | "transform": "datasets/winogrande/transform_gen_v1.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "generate", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "qa_match" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/winogrande/config/winogrande_ppl.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_name": "winogrande", 3 | "path": "datasets/winogrande/data/winogrande.jsonl", 4 | "description": "", 5 | "transform": 
"datasets/winogrande/transform_ppl_v0.py", 6 | "fewshot": 0, 7 | "generate": { 8 | "method": "loglikelihood", 9 | "params": "" 10 | }, 11 | "postprocess": "", 12 | "metric": { 13 | "accuracy": { 14 | "evaluation": { 15 | "type": "log_prob" 16 | } 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /datasets/winogrande/transform_ppl_v0.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str): 5 | output_sentence = next( 6 | key for key, value in data["target_scores"].items() if value == 1 7 | ) 8 | return { 9 | "input": data["question"], 10 | "output": output_sentence, 11 | "processed_output": output_sentence, 12 | } 13 | -------------------------------------------------------------------------------- /datasets/winogrande/transform_ppl_v1.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str): 5 | text = data["question"] 6 | processed_correct_answer = correct_answer = next( 7 | key for key, value in data["target_scores"].items() if value == 1 8 | ) 9 | return { 10 | "input": text, 11 | "output": correct_answer, 12 | "processed_output": processed_correct_answer, 13 | } 14 | -------------------------------------------------------------------------------- /datasets/wmt20-en-zh/transform_gen_v0.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str): 5 | ans = data["answer"] + "\n" 6 | prompt = f"请将下面这段内容从英文翻译为中文:\n{data['question']}\n译文:\n" 7 | return {"input": prompt, "output": ans, "processed_output": ans} 8 | -------------------------------------------------------------------------------- 
def transform(data, num_sample: int, r: random.Random, dataset_name: str):
    """Build one English-to-Chinese translation sample in question/answer form.

    The prompt is a fixed Chinese instruction line, the source sentence from
    ``data["question"]`` on its own line, and a trailing answer marker. The
    reference translation ``data["answer"]`` is returned unmodified as both
    ``output`` and ``processed_output``.

    ``num_sample``, ``r`` and ``dataset_name`` are not used by this transform.
    """
    # Instruction header, source sentence and answer marker in one expression.
    prompt = (
        "问题:\n如何将下面这句话从英文翻译为中文?\n"
        f"{data['question']}\n"
        "答案:\n"
    )
    reference = data["answer"]
    return {"input": prompt, "output": reference, "processed_output": reference}
def transform(data, num_sample: int, r: random.Random, dataset_name: str):
    """Build one generation-style yes/no sample from a WSC record.

    The prompt shows the passage and asks whether the pronoun
    (``data["question"][1]``) refers to the candidate
    (``data["question"][0]``), with the options ``A. Yes`` and ``B. No``.
    The expected answer letter is chosen by the position of the value 1
    inside ``data["target_scores"]``: first position gives "A", second "B".

    ``num_sample``, ``r`` and ``dataset_name`` are not used by this transform.
    """
    passage = data["passage"]
    span = data["question"]
    pronoun = span[1]
    candidate = span[0]
    prompt = (
        f"Passage: {passage}\n"
        f"Does the pronoun # {pronoun} # refer to * {candidate} *?\n"
        "A. Yes\nB. No\nAnswer: "
    )

    # The position of the score 1 among the values selects the letter.
    scores = list(data["target_scores"].values())
    letter = ["A", "B"][scores.index(1)]

    return {"input": prompt, "output": letter, "processed_output": letter}
-------------------------------------------------------------------------------- /docs/tutorials/en/customization/new_postprocess.md: -------------------------------------------------------------------------------- 1 | ## Adding a New Post-Processing Method 2 | 3 | - **Adding a Postprocess Class** 4 | - In `tasks/postprocess.py`, following the [Postprocess Introduction](../configuration_file/postprocess.md) tutorial, add your custom post-processing class. 5 | - Register your custom post-processing class in the `POSTPROCESS_REGISTRY` dictionary located in `tasks/postprocess.py`. -------------------------------------------------------------------------------- /docs/tutorials/zh/customization/new_metric.md: -------------------------------------------------------------------------------- 1 | ## 添加新的评测方法 2 | 3 | 4 | - 编写metric文件 5 | - 在`metrics/`目录下,为你的评测指标创建一个新的Python文件,命名为`metrics/{metric_name}.py`。 6 | - 参照[【metric介绍】](../configuration_file/metric.md)教程,在 `metrics/{metric_name}.py` 中完成相应MetricName类的编写。 7 | - 在`metrics/__init__.py`中进行注册,包括添加`from .metric_name import MetricName`,并在`METRICS_REGISTRY`字典中注册你的自定义评测指标类。 -------------------------------------------------------------------------------- /docs/tutorials/zh/customization/new_postprocess.md: -------------------------------------------------------------------------------- 1 | ## 新的后处理方法 2 | 3 | - 添加postprocess类 4 | - 在`tasks/postprocess.py`中,参照[【 postprocess介绍 】](../configuration_file/postprocess.md)教程,添加自定义后处理类。 5 | - 在`tasks/postprocess.py`中的`POSTPROCESS_REGISTRY`字典中注册你的自定义后处理类。 6 | -------------------------------------------------------------------------------- /docs/tutorials/zh/deployment_model/model_download.md: -------------------------------------------------------------------------------- 1 | ## 下载模型 2 | 3 | 4 | 首次部署某模型时,需要从 HuggingFace 下载模型,根据模型大小,此过程可能需花费 10 分钟至 1 小时。其中下载前需要: 5 | 6 | 1.**登录 Huggingface CLI**: 7 | 8 | 输入以下命令并登录: 9 | 10 | ``` 11 | huggingface-cli login 12 | ``` 13 | 14 | 2.**输入您的 Token**: 
15 | 16 | 登录时,输入 Huggingface 上的 Token。 17 | 18 | 3.**下载模型**: 19 | 20 | 在这里既可以git clone对应的模型,也可以通过之后的[单卡部署](./deployment.md)进行下载。 -------------------------------------------------------------------------------- /docs/tutorials/zh/evaluation/model_instantiation.md: -------------------------------------------------------------------------------- 1 | ## 模型实例化 2 | 3 | 4 | 模型部署成功后,会产生一个URL。首先需要进行实例化。我们在`models`目录下提供了两类脚本,`general_model.py`对应URL实例化, `openai_model.py`对应API服务。其中,定义了`Model`类和请求转接方法`_post_request`。 5 | 6 | - `Model`类包含了初始化、`loglikelihood`和`generate`三个方法。 7 | - 初始化使用main文件中传入的model_args,实例化model。 8 | - `loglikelihood`和`generate`分别和模型推理的方式是对应的。如果用户评测自己训练的模型,则需要在推理代码中,实现这两部分对应的功能。 9 | - 在评测过程中,所有的数据会传送到`Model`类中,根据推理方式不同,交给`loglikelihood`或`generate`函数,然后提取传入的`params`和`instance`,通过`_post_request`方法将包装后的数据给到模型进行推理。最后将模型生成的结果,返回到评测过程中,进行后处理。 10 | 11 | -------------------------------------------------------------------------------- /docs/tutorials/zh/ultraeval.md: -------------------------------------------------------------------------------- 1 | UltraEval是一个开源的基础模型能力评测框架,提供了一套轻量级、易于使用的评测体系。整体框架组织如下图所示: 2 | 3 |
4 |

5 | 6 |

7 |
class InMatch:
    """Membership metric: score 1.0 when the first result is one of the references."""

    def __init__(
        self,
    ):
        pass

    def __call__(self, doc, ground_truth, results) -> Any:
        """Score a single document.

        Args:
            doc: The evaluated document; not used by this metric.
            ground_truth: One reference string or an iterable of references.
            results: Model outputs; only ``results[0]`` is inspected.

        Returns:
            1.0 if the lower-cased, stripped first result equals any
            reference after the same normalization, otherwise 0.0.
        """
        if isinstance(ground_truth, str):
            ground_truth = [ground_truth]

        prediction = results[0].lower().strip()
        # Normalize references the same way as the prediction; otherwise a
        # reference with uppercase letters or padding could never match.
        # Non-string references are compared as they are.
        references = [
            ref.lower().strip() if isinstance(ref, str) else ref
            for ref in ground_truth
        ]
        return 1.0 if prediction in references else 0.0
def get_model(model_name):
    """Return the model class registered under ``model_name``.

    Args:
        model_name: A key of ``MODEL_REGISTRY``.

    Returns:
        The value stored in ``MODEL_REGISTRY`` for that key.

    Raises:
        KeyError: If ``model_name`` is not registered; the message lists the
            registered names so a typo is easy to spot.
    """
    try:
        return MODEL_REGISTRY[model_name]
    except KeyError:
        available = ", ".join(sorted(MODEL_REGISTRY))
        # Same exception type as a plain lookup, but with a usable message.
        raise KeyError(
            f"Unknown model {model_name!r}; registered models: {available}"
        ) from None
"top_p": 0.8, 4 | "max_tokens": 300, 5 | "sampling_num": 1 6 | } -------------------------------------------------------------------------------- /models/model_params/vllm_sample_bbh.json: -------------------------------------------------------------------------------- 1 | { 2 | "temperature": 0.3, 3 | "top_p": 0.8, 4 | "max_tokens": 1024, 5 | "sampling_num": 1, 6 | "presence_penalty": 0.0 7 | } -------------------------------------------------------------------------------- /models/model_params/vllm_sample_v1.json: -------------------------------------------------------------------------------- 1 | { 2 | "temperature": 0.1, 3 | "top_p": 0.95, 4 | "max_tokens": 300, 5 | "sampling_num": 1 6 | } -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | vllm 2 | flask 3 | openai 4 | sacrebleu 5 | rouge_chinese 6 | pytablewriter 7 | gevent 8 | gunicorn 9 | tqdm 10 | pynvml 11 | accelerate>=0.20.3 -------------------------------------------------------------------------------- /run_eval.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | bash scripts/run_job_base.sh mistralai/Mistral-7B-v0.1 1 1 logs/mistralai/Mistral-7B-v0.1 bbh,mmlu,ceval,cmmlu,humaneval,mbpp-427,gsm8k,math,hellaswag,boolq,piqa,winogrande,arc-e,arc-c gen -1 4 | sleep 600 5 | 6 | bash scripts/run_job_base.sh mistralai/Mistral-7B-v0.1 1 1 logs/mistralai/Mistral-7B-v0.1B hellaswag,boolq,piqa,winogrande,arc-e,arc-c ppl -1 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open("requirements.txt", 'r', encoding='utf-8') as f: 4 | requirements = f.read().strip().splitlines() 5 | 6 | setuptools.setup( 7 | name="UltraEval", 8 | version="0.1", 9 | author="UltraEval Team", 10 
# Request types accepted by Request; only the keys are consulted in this module.
# NOTE(review): the values are presumably the number of results returned per
# request - confirm against the code that consumes this mapping.
REQUEST_RETURN_LENGTHS = {
    "loglikelihood": 1,
    "generate": 1,
}


class Request:
    """Plain container describing one inference request."""

    def __init__(self, request_type, instances, params, raw_example):
        """Store the request fields after validating ``request_type``.

        Args:
            request_type: Must be a key of ``REQUEST_RETURN_LENGTHS``.
            instances: Stored unchanged on ``self.instances``.
            params: Stored unchanged on ``self.params``.
            raw_example: Stored unchanged on ``self.raw_example``.

        Raises:
            NotImplementedError: If ``request_type`` is not a supported type.
        """
        supported = request_type in REQUEST_RETURN_LENGTHS
        if not supported:
            raise NotImplementedError(
                f"The request type {request_type} is not implemented!"
            )

        self.request_type, self.instances = request_type, instances
        self.params, self.raw_example = params, raw_example